diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,131573 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 18790, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00026609898882384245, + "grad_norm": 2.4823477268218994, + "learning_rate": 0.0, + "loss": 0.4294, + "step": 1 + }, + { + "epoch": 0.0005321979776476849, + "grad_norm": 2.3642499446868896, + "learning_rate": 1.0638297872340424e-09, + "loss": 0.4138, + "step": 2 + }, + { + "epoch": 0.0007982969664715274, + "grad_norm": 2.5261807441711426, + "learning_rate": 2.1276595744680848e-09, + "loss": 0.4335, + "step": 3 + }, + { + "epoch": 0.0010643959552953698, + "grad_norm": 2.208110809326172, + "learning_rate": 3.1914893617021273e-09, + "loss": 0.4127, + "step": 4 + }, + { + "epoch": 0.0013304949441192123, + "grad_norm": 2.3270974159240723, + "learning_rate": 4.2553191489361695e-09, + "loss": 0.3976, + "step": 5 + }, + { + "epoch": 0.0015965939329430547, + "grad_norm": 2.2224080562591553, + "learning_rate": 5.3191489361702125e-09, + "loss": 0.4095, + "step": 6 + }, + { + "epoch": 0.0018626929217668972, + "grad_norm": 2.2730791568756104, + "learning_rate": 6.382978723404255e-09, + "loss": 0.408, + "step": 7 + }, + { + "epoch": 0.0021287919105907396, + "grad_norm": 2.8397772312164307, + "learning_rate": 7.446808510638297e-09, + "loss": 0.4619, + "step": 8 + }, + { + "epoch": 0.002394890899414582, + "grad_norm": 2.4361422061920166, + "learning_rate": 8.510638297872339e-09, + "loss": 0.4155, + "step": 9 + }, + { + "epoch": 0.0026609898882384245, + "grad_norm": 2.290534257888794, + "learning_rate": 9.574468085106382e-09, + "loss": 0.4103, + "step": 10 + }, + { + "epoch": 0.002927088877062267, + "grad_norm": 2.3419768810272217, + "learning_rate": 1.0638297872340425e-08, + "loss": 0.4377, + "step": 11 + }, + { + "epoch": 0.0031931878658861094, + "grad_norm": 2.2947967052459717, + "learning_rate": 1.1702127659574468e-08, + "loss": 0.4077, + "step": 12 + }, + { + "epoch": 0.003459286854709952, + "grad_norm": 2.4373703002929688, + "learning_rate": 1.276595744680851e-08, + "loss": 0.4448, + "step": 13 + }, + { + "epoch": 0.0037253858435337944, + "grad_norm": 2.3161425590515137, + "learning_rate": 1.3829787234042552e-08, + "loss": 0.4405, + "step": 14 + }, + { + "epoch": 0.003991484832357637, + "grad_norm": 2.2258970737457275, + "learning_rate": 1.4893617021276594e-08, + "loss": 0.4152, + "step": 15 + }, + { + "epoch": 0.004257583821181479, + "grad_norm": 2.5679128170013428, + "learning_rate": 1.595744680851064e-08, + "loss": 0.4333, + "step": 16 + }, + { + "epoch": 0.004523682810005322, + "grad_norm": 2.2652180194854736, + "learning_rate": 1.7021276595744678e-08, + "loss": 0.4207, + "step": 17 + }, + { + "epoch": 0.004789781798829164, + "grad_norm": 2.22778582572937, + "learning_rate": 1.8085106382978724e-08, + "loss": 0.409, + "step": 18 + }, + { + "epoch": 0.005055880787653007, + "grad_norm": 2.1468400955200195, + "learning_rate": 1.9148936170212764e-08, + "loss": 0.3821, + "step": 19 + }, + { + "epoch": 0.005321979776476849, + "grad_norm": 2.4436581134796143, + "learning_rate": 2.0212765957446807e-08, + "loss": 0.4445, + "step": 20 + }, + { + "epoch": 0.0055880787653006915, + "grad_norm": 2.430718183517456, + "learning_rate": 2.127659574468085e-08, + "loss": 0.4186, + "step": 21 + }, + { + "epoch": 0.005854177754124534, + "grad_norm": 2.202714204788208, + "learning_rate": 2.2340425531914893e-08, + "loss": 0.3919, + "step": 22 + }, + { + "epoch": 0.0061202767429483764, + "grad_norm": 2.2579431533813477, + "learning_rate": 2.3404255319148936e-08, + "loss": 0.4053, + "step": 23 + }, + { + "epoch": 0.006386375731772219, + "grad_norm": 2.3549351692199707, + "learning_rate": 2.4468085106382976e-08, + "loss": 0.4328, + "step": 24 + }, + { + "epoch": 0.006652474720596061, + "grad_norm": 2.4747958183288574, + "learning_rate": 2.553191489361702e-08, + "loss": 0.4246, + "step": 25 + }, + { + "epoch": 0.006918573709419904, + "grad_norm": 2.6422274112701416, + "learning_rate": 2.6595744680851062e-08, + "loss": 0.4615, + "step": 26 + }, + { + "epoch": 0.007184672698243746, + "grad_norm": 2.115398406982422, + "learning_rate": 2.7659574468085105e-08, + "loss": 0.3926, + "step": 27 + }, + { + "epoch": 0.007450771687067589, + "grad_norm": 2.104806423187256, + "learning_rate": 2.872340425531915e-08, + "loss": 0.3922, + "step": 28 + }, + { + "epoch": 0.007716870675891432, + "grad_norm": 2.429518699645996, + "learning_rate": 2.9787234042553187e-08, + "loss": 0.426, + "step": 29 + }, + { + "epoch": 0.007982969664715274, + "grad_norm": 2.2860474586486816, + "learning_rate": 3.085106382978723e-08, + "loss": 0.4162, + "step": 30 + }, + { + "epoch": 0.008249068653539117, + "grad_norm": 2.4284541606903076, + "learning_rate": 3.191489361702128e-08, + "loss": 0.4357, + "step": 31 + }, + { + "epoch": 0.008515167642362959, + "grad_norm": 2.2753560543060303, + "learning_rate": 3.2978723404255316e-08, + "loss": 0.4323, + "step": 32 + }, + { + "epoch": 0.008781266631186802, + "grad_norm": 2.2276861667633057, + "learning_rate": 3.4042553191489356e-08, + "loss": 0.4034, + "step": 33 + }, + { + "epoch": 0.009047365620010643, + "grad_norm": 2.4479334354400635, + "learning_rate": 3.51063829787234e-08, + "loss": 0.4399, + "step": 34 + }, + { + "epoch": 0.009313464608834487, + "grad_norm": 2.2857301235198975, + "learning_rate": 3.617021276595745e-08, + "loss": 0.3851, + "step": 35 + }, + { + "epoch": 0.009579563597658328, + "grad_norm": 2.2455639839172363, + "learning_rate": 3.723404255319149e-08, + "loss": 0.4066, + "step": 36 + }, + { + "epoch": 0.009845662586482172, + "grad_norm": 2.733708143234253, + "learning_rate": 3.829787234042553e-08, + "loss": 0.4742, + "step": 37 + }, + { + "epoch": 0.010111761575306013, + "grad_norm": 2.2838940620422363, + "learning_rate": 3.9361702127659574e-08, + "loss": 0.4134, + "step": 38 + }, + { + "epoch": 0.010377860564129857, + "grad_norm": 2.4073679447174072, + "learning_rate": 4.0425531914893614e-08, + "loss": 0.4523, + "step": 39 + }, + { + "epoch": 0.010643959552953698, + "grad_norm": 2.3044168949127197, + "learning_rate": 4.1489361702127654e-08, + "loss": 0.4179, + "step": 40 + }, + { + "epoch": 0.010910058541777541, + "grad_norm": 2.232421875, + "learning_rate": 4.25531914893617e-08, + "loss": 0.4082, + "step": 41 + }, + { + "epoch": 0.011176157530601383, + "grad_norm": 2.500561475753784, + "learning_rate": 4.3617021276595746e-08, + "loss": 0.4503, + "step": 42 + }, + { + "epoch": 0.011442256519425226, + "grad_norm": 2.428791046142578, + "learning_rate": 4.4680851063829786e-08, + "loss": 0.4392, + "step": 43 + }, + { + "epoch": 0.011708355508249068, + "grad_norm": 2.1676881313323975, + "learning_rate": 4.5744680851063826e-08, + "loss": 0.4018, + "step": 44 + }, + { + "epoch": 0.011974454497072911, + "grad_norm": 2.4390740394592285, + "learning_rate": 4.680851063829787e-08, + "loss": 0.433, + "step": 45 + }, + { + "epoch": 0.012240553485896753, + "grad_norm": 2.227890968322754, + "learning_rate": 4.787234042553192e-08, + "loss": 0.4057, + "step": 46 + }, + { + "epoch": 0.012506652474720596, + "grad_norm": 2.5545506477355957, + "learning_rate": 4.893617021276595e-08, + "loss": 0.4372, + "step": 47 + }, + { + "epoch": 0.012772751463544438, + "grad_norm": 2.3603689670562744, + "learning_rate": 5e-08, + "loss": 0.4481, + "step": 48 + }, + { + "epoch": 0.013038850452368281, + "grad_norm": 2.235738515853882, + "learning_rate": 5.106382978723404e-08, + "loss": 0.4098, + "step": 49 + }, + { + "epoch": 0.013304949441192123, + "grad_norm": 2.1841166019439697, + "learning_rate": 5.2127659574468084e-08, + "loss": 0.3974, + "step": 50 + }, + { + "epoch": 0.013571048430015966, + "grad_norm": 2.0423407554626465, + "learning_rate": 5.3191489361702123e-08, + "loss": 0.3943, + "step": 51 + }, + { + "epoch": 0.013837147418839808, + "grad_norm": 2.2399420738220215, + "learning_rate": 5.425531914893616e-08, + "loss": 0.4184, + "step": 52 + }, + { + "epoch": 0.014103246407663651, + "grad_norm": 2.47058367729187, + "learning_rate": 5.531914893617021e-08, + "loss": 0.4448, + "step": 53 + }, + { + "epoch": 0.014369345396487493, + "grad_norm": 2.101189613342285, + "learning_rate": 5.638297872340425e-08, + "loss": 0.3983, + "step": 54 + }, + { + "epoch": 0.014635444385311336, + "grad_norm": 2.836261749267578, + "learning_rate": 5.74468085106383e-08, + "loss": 0.407, + "step": 55 + }, + { + "epoch": 0.014901543374135177, + "grad_norm": 2.207831621170044, + "learning_rate": 5.851063829787234e-08, + "loss": 0.4066, + "step": 56 + }, + { + "epoch": 0.01516764236295902, + "grad_norm": 2.4891929626464844, + "learning_rate": 5.9574468085106375e-08, + "loss": 0.4498, + "step": 57 + }, + { + "epoch": 0.015433741351782864, + "grad_norm": 2.3699867725372314, + "learning_rate": 6.063829787234042e-08, + "loss": 0.422, + "step": 58 + }, + { + "epoch": 0.015699840340606706, + "grad_norm": 2.262488603591919, + "learning_rate": 6.170212765957446e-08, + "loss": 0.425, + "step": 59 + }, + { + "epoch": 0.015965939329430547, + "grad_norm": 2.29249906539917, + "learning_rate": 6.27659574468085e-08, + "loss": 0.408, + "step": 60 + }, + { + "epoch": 0.016232038318254392, + "grad_norm": 2.434504508972168, + "learning_rate": 6.382978723404255e-08, + "loss": 0.4301, + "step": 61 + }, + { + "epoch": 0.016498137307078234, + "grad_norm": 2.5166099071502686, + "learning_rate": 6.489361702127659e-08, + "loss": 0.4407, + "step": 62 + }, + { + "epoch": 0.016764236295902075, + "grad_norm": 2.1207709312438965, + "learning_rate": 6.595744680851063e-08, + "loss": 0.4123, + "step": 63 + }, + { + "epoch": 0.017030335284725917, + "grad_norm": 2.2956714630126953, + "learning_rate": 6.702127659574469e-08, + "loss": 0.4316, + "step": 64 + }, + { + "epoch": 0.017296434273549762, + "grad_norm": 2.143887758255005, + "learning_rate": 6.808510638297871e-08, + "loss": 0.4122, + "step": 65 + }, + { + "epoch": 0.017562533262373604, + "grad_norm": 2.066054582595825, + "learning_rate": 6.914893617021277e-08, + "loss": 0.3992, + "step": 66 + }, + { + "epoch": 0.017828632251197445, + "grad_norm": 2.020817995071411, + "learning_rate": 7.02127659574468e-08, + "loss": 0.391, + "step": 67 + }, + { + "epoch": 0.018094731240021287, + "grad_norm": 2.110450506210327, + "learning_rate": 7.127659574468084e-08, + "loss": 0.3934, + "step": 68 + }, + { + "epoch": 0.018360830228845132, + "grad_norm": 2.1386616230010986, + "learning_rate": 7.23404255319149e-08, + "loss": 0.4049, + "step": 69 + }, + { + "epoch": 0.018626929217668974, + "grad_norm": 2.105788230895996, + "learning_rate": 7.340425531914894e-08, + "loss": 0.4037, + "step": 70 + }, + { + "epoch": 0.018893028206492815, + "grad_norm": 2.1114938259124756, + "learning_rate": 7.446808510638298e-08, + "loss": 0.4173, + "step": 71 + }, + { + "epoch": 0.019159127195316657, + "grad_norm": 2.2972190380096436, + "learning_rate": 7.553191489361702e-08, + "loss": 0.4358, + "step": 72 + }, + { + "epoch": 0.019425226184140502, + "grad_norm": 2.1496665477752686, + "learning_rate": 7.659574468085106e-08, + "loss": 0.4166, + "step": 73 + }, + { + "epoch": 0.019691325172964343, + "grad_norm": 2.4376273155212402, + "learning_rate": 7.76595744680851e-08, + "loss": 0.4532, + "step": 74 + }, + { + "epoch": 0.019957424161788185, + "grad_norm": 2.273815155029297, + "learning_rate": 7.872340425531915e-08, + "loss": 0.4174, + "step": 75 + }, + { + "epoch": 0.020223523150612027, + "grad_norm": 2.0958499908447266, + "learning_rate": 7.978723404255319e-08, + "loss": 0.4118, + "step": 76 + }, + { + "epoch": 0.02048962213943587, + "grad_norm": 1.9903168678283691, + "learning_rate": 8.085106382978723e-08, + "loss": 0.3974, + "step": 77 + }, + { + "epoch": 0.020755721128259713, + "grad_norm": 2.0548646450042725, + "learning_rate": 8.191489361702128e-08, + "loss": 0.4094, + "step": 78 + }, + { + "epoch": 0.021021820117083555, + "grad_norm": 2.027235746383667, + "learning_rate": 8.297872340425531e-08, + "loss": 0.4068, + "step": 79 + }, + { + "epoch": 0.021287919105907396, + "grad_norm": 2.1664140224456787, + "learning_rate": 8.404255319148936e-08, + "loss": 0.422, + "step": 80 + }, + { + "epoch": 0.02155401809473124, + "grad_norm": 1.9610613584518433, + "learning_rate": 8.51063829787234e-08, + "loss": 0.4006, + "step": 81 + }, + { + "epoch": 0.021820117083555083, + "grad_norm": 2.1196208000183105, + "learning_rate": 8.617021276595744e-08, + "loss": 0.404, + "step": 82 + }, + { + "epoch": 0.022086216072378925, + "grad_norm": 2.135197162628174, + "learning_rate": 8.723404255319149e-08, + "loss": 0.4095, + "step": 83 + }, + { + "epoch": 0.022352315061202766, + "grad_norm": 2.106204032897949, + "learning_rate": 8.829787234042553e-08, + "loss": 0.4148, + "step": 84 + }, + { + "epoch": 0.02261841405002661, + "grad_norm": 2.1044492721557617, + "learning_rate": 8.936170212765957e-08, + "loss": 0.4158, + "step": 85 + }, + { + "epoch": 0.022884513038850453, + "grad_norm": 1.886287808418274, + "learning_rate": 9.042553191489361e-08, + "loss": 0.3681, + "step": 86 + }, + { + "epoch": 0.023150612027674294, + "grad_norm": 2.2445383071899414, + "learning_rate": 9.148936170212765e-08, + "loss": 0.4207, + "step": 87 + }, + { + "epoch": 0.023416711016498136, + "grad_norm": 2.0126404762268066, + "learning_rate": 9.255319148936169e-08, + "loss": 0.3977, + "step": 88 + }, + { + "epoch": 0.02368281000532198, + "grad_norm": 2.161963701248169, + "learning_rate": 9.361702127659574e-08, + "loss": 0.4382, + "step": 89 + }, + { + "epoch": 0.023948908994145823, + "grad_norm": 1.756773591041565, + "learning_rate": 9.468085106382978e-08, + "loss": 0.3757, + "step": 90 + }, + { + "epoch": 0.024215007982969664, + "grad_norm": 2.331974744796753, + "learning_rate": 9.574468085106384e-08, + "loss": 0.4443, + "step": 91 + }, + { + "epoch": 0.024481106971793506, + "grad_norm": 1.7981693744659424, + "learning_rate": 9.680851063829786e-08, + "loss": 0.384, + "step": 92 + }, + { + "epoch": 0.02474720596061735, + "grad_norm": 1.6555863618850708, + "learning_rate": 9.78723404255319e-08, + "loss": 0.3767, + "step": 93 + }, + { + "epoch": 0.025013304949441192, + "grad_norm": 1.6948606967926025, + "learning_rate": 9.893617021276596e-08, + "loss": 0.3751, + "step": 94 + }, + { + "epoch": 0.025279403938265034, + "grad_norm": 1.7324317693710327, + "learning_rate": 1e-07, + "loss": 0.3893, + "step": 95 + }, + { + "epoch": 0.025545502927088876, + "grad_norm": 1.7222027778625488, + "learning_rate": 1.0106382978723405e-07, + "loss": 0.3979, + "step": 96 + }, + { + "epoch": 0.02581160191591272, + "grad_norm": 1.6481472253799438, + "learning_rate": 1.0212765957446807e-07, + "loss": 0.3724, + "step": 97 + }, + { + "epoch": 0.026077700904736562, + "grad_norm": 1.6349422931671143, + "learning_rate": 1.0319148936170213e-07, + "loss": 0.373, + "step": 98 + }, + { + "epoch": 0.026343799893560404, + "grad_norm": 1.6146180629730225, + "learning_rate": 1.0425531914893617e-07, + "loss": 0.377, + "step": 99 + }, + { + "epoch": 0.026609898882384245, + "grad_norm": 1.975437879562378, + "learning_rate": 1.053191489361702e-07, + "loss": 0.4297, + "step": 100 + }, + { + "epoch": 0.02687599787120809, + "grad_norm": 1.8450990915298462, + "learning_rate": 1.0638297872340425e-07, + "loss": 0.4143, + "step": 101 + }, + { + "epoch": 0.027142096860031932, + "grad_norm": 1.5597376823425293, + "learning_rate": 1.074468085106383e-07, + "loss": 0.3625, + "step": 102 + }, + { + "epoch": 0.027408195848855774, + "grad_norm": 2.240682363510132, + "learning_rate": 1.0851063829787233e-07, + "loss": 0.4535, + "step": 103 + }, + { + "epoch": 0.027674294837679615, + "grad_norm": 1.705177664756775, + "learning_rate": 1.0957446808510638e-07, + "loss": 0.4021, + "step": 104 + }, + { + "epoch": 0.02794039382650346, + "grad_norm": 1.9165663719177246, + "learning_rate": 1.1063829787234042e-07, + "loss": 0.4133, + "step": 105 + }, + { + "epoch": 0.028206492815327302, + "grad_norm": 1.8641730546951294, + "learning_rate": 1.1170212765957446e-07, + "loss": 0.4156, + "step": 106 + }, + { + "epoch": 0.028472591804151143, + "grad_norm": 2.3648598194122314, + "learning_rate": 1.127659574468085e-07, + "loss": 0.442, + "step": 107 + }, + { + "epoch": 0.028738690792974985, + "grad_norm": 1.6834380626678467, + "learning_rate": 1.1382978723404255e-07, + "loss": 0.3879, + "step": 108 + }, + { + "epoch": 0.02900478978179883, + "grad_norm": 2.008845806121826, + "learning_rate": 1.148936170212766e-07, + "loss": 0.419, + "step": 109 + }, + { + "epoch": 0.02927088877062267, + "grad_norm": 2.0013351440429688, + "learning_rate": 1.1595744680851063e-07, + "loss": 0.403, + "step": 110 + }, + { + "epoch": 0.029536987759446513, + "grad_norm": 1.67275071144104, + "learning_rate": 1.1702127659574468e-07, + "loss": 0.3923, + "step": 111 + }, + { + "epoch": 0.029803086748270355, + "grad_norm": 1.577343463897705, + "learning_rate": 1.1808510638297872e-07, + "loss": 0.3681, + "step": 112 + }, + { + "epoch": 0.0300691857370942, + "grad_norm": 1.8648662567138672, + "learning_rate": 1.1914893617021275e-07, + "loss": 0.4056, + "step": 113 + }, + { + "epoch": 0.03033528472591804, + "grad_norm": 1.747404932975769, + "learning_rate": 1.202127659574468e-07, + "loss": 0.4034, + "step": 114 + }, + { + "epoch": 0.030601383714741883, + "grad_norm": 1.7049554586410522, + "learning_rate": 1.2127659574468084e-07, + "loss": 0.3724, + "step": 115 + }, + { + "epoch": 0.030867482703565728, + "grad_norm": 1.8814678192138672, + "learning_rate": 1.2234042553191488e-07, + "loss": 0.3997, + "step": 116 + }, + { + "epoch": 0.03113358169238957, + "grad_norm": 1.639127254486084, + "learning_rate": 1.2340425531914892e-07, + "loss": 0.386, + "step": 117 + }, + { + "epoch": 0.03139968068121341, + "grad_norm": 1.5526881217956543, + "learning_rate": 1.24468085106383e-07, + "loss": 0.3806, + "step": 118 + }, + { + "epoch": 0.03166577967003725, + "grad_norm": 1.4972989559173584, + "learning_rate": 1.25531914893617e-07, + "loss": 0.3902, + "step": 119 + }, + { + "epoch": 0.031931878658861094, + "grad_norm": 1.3757778406143188, + "learning_rate": 1.2659574468085107e-07, + "loss": 0.3346, + "step": 120 + }, + { + "epoch": 0.032197977647684936, + "grad_norm": 1.3951451778411865, + "learning_rate": 1.276595744680851e-07, + "loss": 0.3705, + "step": 121 + }, + { + "epoch": 0.032464076636508785, + "grad_norm": 1.410186767578125, + "learning_rate": 1.2872340425531915e-07, + "loss": 0.348, + "step": 122 + }, + { + "epoch": 0.032730175625332626, + "grad_norm": 1.369508981704712, + "learning_rate": 1.2978723404255319e-07, + "loss": 0.3698, + "step": 123 + }, + { + "epoch": 0.03299627461415647, + "grad_norm": 1.3033406734466553, + "learning_rate": 1.3085106382978723e-07, + "loss": 0.3744, + "step": 124 + }, + { + "epoch": 0.03326237360298031, + "grad_norm": 1.3611514568328857, + "learning_rate": 1.3191489361702127e-07, + "loss": 0.3543, + "step": 125 + }, + { + "epoch": 0.03352847259180415, + "grad_norm": 1.3535687923431396, + "learning_rate": 1.329787234042553e-07, + "loss": 0.3739, + "step": 126 + }, + { + "epoch": 0.03379457158062799, + "grad_norm": 1.3076380491256714, + "learning_rate": 1.3404255319148937e-07, + "loss": 0.3588, + "step": 127 + }, + { + "epoch": 0.034060670569451834, + "grad_norm": 1.281329870223999, + "learning_rate": 1.3510638297872338e-07, + "loss": 0.3749, + "step": 128 + }, + { + "epoch": 0.034326769558275676, + "grad_norm": 1.3607063293457031, + "learning_rate": 1.3617021276595742e-07, + "loss": 0.3852, + "step": 129 + }, + { + "epoch": 0.034592868547099524, + "grad_norm": 1.1073755025863647, + "learning_rate": 1.372340425531915e-07, + "loss": 0.353, + "step": 130 + }, + { + "epoch": 0.034858967535923366, + "grad_norm": 1.1145133972167969, + "learning_rate": 1.3829787234042553e-07, + "loss": 0.3673, + "step": 131 + }, + { + "epoch": 0.03512506652474721, + "grad_norm": 0.986873984336853, + "learning_rate": 1.3936170212765957e-07, + "loss": 0.3429, + "step": 132 + }, + { + "epoch": 0.03539116551357105, + "grad_norm": 1.0029836893081665, + "learning_rate": 1.404255319148936e-07, + "loss": 0.3601, + "step": 133 + }, + { + "epoch": 0.03565726450239489, + "grad_norm": 1.0504239797592163, + "learning_rate": 1.4148936170212768e-07, + "loss": 0.3638, + "step": 134 + }, + { + "epoch": 0.03592336349121873, + "grad_norm": 1.026857614517212, + "learning_rate": 1.425531914893617e-07, + "loss": 0.3545, + "step": 135 + }, + { + "epoch": 0.036189462480042574, + "grad_norm": 0.9713463187217712, + "learning_rate": 1.4361702127659573e-07, + "loss": 0.3487, + "step": 136 + }, + { + "epoch": 0.036455561468866415, + "grad_norm": 1.0249488353729248, + "learning_rate": 1.446808510638298e-07, + "loss": 0.3614, + "step": 137 + }, + { + "epoch": 0.036721660457690264, + "grad_norm": 1.0480413436889648, + "learning_rate": 1.457446808510638e-07, + "loss": 0.3884, + "step": 138 + }, + { + "epoch": 0.036987759446514105, + "grad_norm": 0.8297316431999207, + "learning_rate": 1.4680851063829787e-07, + "loss": 0.3249, + "step": 139 + }, + { + "epoch": 0.03725385843533795, + "grad_norm": 0.9653038382530212, + "learning_rate": 1.4787234042553191e-07, + "loss": 0.3732, + "step": 140 + }, + { + "epoch": 0.03751995742416179, + "grad_norm": 0.9085098505020142, + "learning_rate": 1.4893617021276595e-07, + "loss": 0.3413, + "step": 141 + }, + { + "epoch": 0.03778605641298563, + "grad_norm": 0.8898190855979919, + "learning_rate": 1.5e-07, + "loss": 0.3563, + "step": 142 + }, + { + "epoch": 0.03805215540180947, + "grad_norm": 1.0132604837417603, + "learning_rate": 1.5106382978723403e-07, + "loss": 0.365, + "step": 143 + }, + { + "epoch": 0.03831825439063331, + "grad_norm": 0.9058570265769958, + "learning_rate": 1.5212765957446807e-07, + "loss": 0.3514, + "step": 144 + }, + { + "epoch": 0.038584353379457155, + "grad_norm": 0.8517774343490601, + "learning_rate": 1.531914893617021e-07, + "loss": 0.3515, + "step": 145 + }, + { + "epoch": 0.038850452368281004, + "grad_norm": 0.8309643864631653, + "learning_rate": 1.5425531914893618e-07, + "loss": 0.3469, + "step": 146 + }, + { + "epoch": 0.039116551357104845, + "grad_norm": 0.7890806794166565, + "learning_rate": 1.553191489361702e-07, + "loss": 0.3185, + "step": 147 + }, + { + "epoch": 0.03938265034592869, + "grad_norm": 0.942486047744751, + "learning_rate": 1.5638297872340426e-07, + "loss": 0.3535, + "step": 148 + }, + { + "epoch": 0.03964874933475253, + "grad_norm": 0.8013913631439209, + "learning_rate": 1.574468085106383e-07, + "loss": 0.3233, + "step": 149 + }, + { + "epoch": 0.03991484832357637, + "grad_norm": 0.8250101208686829, + "learning_rate": 1.585106382978723e-07, + "loss": 0.347, + "step": 150 + }, + { + "epoch": 0.04018094731240021, + "grad_norm": 0.8187092542648315, + "learning_rate": 1.5957446808510638e-07, + "loss": 0.357, + "step": 151 + }, + { + "epoch": 0.04044704630122405, + "grad_norm": 0.7748068571090698, + "learning_rate": 1.6063829787234042e-07, + "loss": 0.327, + "step": 152 + }, + { + "epoch": 0.040713145290047895, + "grad_norm": 0.8472638726234436, + "learning_rate": 1.6170212765957446e-07, + "loss": 0.3381, + "step": 153 + }, + { + "epoch": 0.04097924427887174, + "grad_norm": 0.8115334510803223, + "learning_rate": 1.627659574468085e-07, + "loss": 0.3587, + "step": 154 + }, + { + "epoch": 0.041245343267695585, + "grad_norm": 0.7391449809074402, + "learning_rate": 1.6382978723404256e-07, + "loss": 0.3236, + "step": 155 + }, + { + "epoch": 0.041511442256519426, + "grad_norm": 1.6551411151885986, + "learning_rate": 1.648936170212766e-07, + "loss": 0.3615, + "step": 156 + }, + { + "epoch": 0.04177754124534327, + "grad_norm": 0.7428591251373291, + "learning_rate": 1.6595744680851062e-07, + "loss": 0.3398, + "step": 157 + }, + { + "epoch": 0.04204364023416711, + "grad_norm": 0.7521659135818481, + "learning_rate": 1.6702127659574468e-07, + "loss": 0.3172, + "step": 158 + }, + { + "epoch": 0.04230973922299095, + "grad_norm": 0.7381339073181152, + "learning_rate": 1.6808510638297872e-07, + "loss": 0.3262, + "step": 159 + }, + { + "epoch": 0.04257583821181479, + "grad_norm": 0.7944221496582031, + "learning_rate": 1.6914893617021276e-07, + "loss": 0.3453, + "step": 160 + }, + { + "epoch": 0.042841937200638634, + "grad_norm": 0.8589829206466675, + "learning_rate": 1.702127659574468e-07, + "loss": 0.3792, + "step": 161 + }, + { + "epoch": 0.04310803618946248, + "grad_norm": 0.7041075825691223, + "learning_rate": 1.7127659574468084e-07, + "loss": 0.321, + "step": 162 + }, + { + "epoch": 0.043374135178286324, + "grad_norm": 0.7845090627670288, + "learning_rate": 1.7234042553191488e-07, + "loss": 0.3542, + "step": 163 + }, + { + "epoch": 0.043640234167110166, + "grad_norm": 0.6824455261230469, + "learning_rate": 1.7340425531914892e-07, + "loss": 0.3247, + "step": 164 + }, + { + "epoch": 0.04390633315593401, + "grad_norm": 0.9044448137283325, + "learning_rate": 1.7446808510638299e-07, + "loss": 0.371, + "step": 165 + }, + { + "epoch": 0.04417243214475785, + "grad_norm": 0.7518810033798218, + "learning_rate": 1.75531914893617e-07, + "loss": 0.3575, + "step": 166 + }, + { + "epoch": 0.04443853113358169, + "grad_norm": 0.7053118944168091, + "learning_rate": 1.7659574468085106e-07, + "loss": 0.3342, + "step": 167 + }, + { + "epoch": 0.04470463012240553, + "grad_norm": 0.6585180163383484, + "learning_rate": 1.776595744680851e-07, + "loss": 0.314, + "step": 168 + }, + { + "epoch": 0.044970729111229374, + "grad_norm": 0.7100732326507568, + "learning_rate": 1.7872340425531914e-07, + "loss": 0.3355, + "step": 169 + }, + { + "epoch": 0.04523682810005322, + "grad_norm": 0.7052040696144104, + "learning_rate": 1.7978723404255318e-07, + "loss": 0.3436, + "step": 170 + }, + { + "epoch": 0.045502927088877064, + "grad_norm": 0.6318358778953552, + "learning_rate": 1.8085106382978722e-07, + "loss": 0.3062, + "step": 171 + }, + { + "epoch": 0.045769026077700906, + "grad_norm": 0.7026102542877197, + "learning_rate": 1.8191489361702126e-07, + "loss": 0.3247, + "step": 172 + }, + { + "epoch": 0.04603512506652475, + "grad_norm": 0.8733323812484741, + "learning_rate": 1.829787234042553e-07, + "loss": 0.3792, + "step": 173 + }, + { + "epoch": 0.04630122405534859, + "grad_norm": 0.6728752851486206, + "learning_rate": 1.8404255319148937e-07, + "loss": 0.3395, + "step": 174 + }, + { + "epoch": 0.04656732304417243, + "grad_norm": 0.715578019618988, + "learning_rate": 1.8510638297872338e-07, + "loss": 0.3535, + "step": 175 + }, + { + "epoch": 0.04683342203299627, + "grad_norm": 0.7242829203605652, + "learning_rate": 1.8617021276595742e-07, + "loss": 0.3562, + "step": 176 + }, + { + "epoch": 0.04709952102182012, + "grad_norm": 0.6335774064064026, + "learning_rate": 1.872340425531915e-07, + "loss": 0.3235, + "step": 177 + }, + { + "epoch": 0.04736562001064396, + "grad_norm": 0.6342115998268127, + "learning_rate": 1.8829787234042553e-07, + "loss": 0.3161, + "step": 178 + }, + { + "epoch": 0.047631718999467804, + "grad_norm": 0.7689881920814514, + "learning_rate": 1.8936170212765957e-07, + "loss": 0.3452, + "step": 179 + }, + { + "epoch": 0.047897817988291645, + "grad_norm": 0.604850709438324, + "learning_rate": 1.904255319148936e-07, + "loss": 0.3132, + "step": 180 + }, + { + "epoch": 0.04816391697711549, + "grad_norm": 0.7331966757774353, + "learning_rate": 1.9148936170212767e-07, + "loss": 0.3709, + "step": 181 + }, + { + "epoch": 0.04843001596593933, + "grad_norm": 0.613190770149231, + "learning_rate": 1.9255319148936169e-07, + "loss": 0.3292, + "step": 182 + }, + { + "epoch": 0.04869611495476317, + "grad_norm": 0.6349330544471741, + "learning_rate": 1.9361702127659573e-07, + "loss": 0.3258, + "step": 183 + }, + { + "epoch": 0.04896221394358701, + "grad_norm": 0.5901505947113037, + "learning_rate": 1.946808510638298e-07, + "loss": 0.3317, + "step": 184 + }, + { + "epoch": 0.04922831293241086, + "grad_norm": 0.5172446966171265, + "learning_rate": 1.957446808510638e-07, + "loss": 0.3007, + "step": 185 + }, + { + "epoch": 0.0494944119212347, + "grad_norm": 0.578785240650177, + "learning_rate": 1.9680851063829787e-07, + "loss": 0.3111, + "step": 186 + }, + { + "epoch": 0.04976051091005854, + "grad_norm": 0.5551354885101318, + "learning_rate": 1.978723404255319e-07, + "loss": 0.3252, + "step": 187 + }, + { + "epoch": 0.050026609898882385, + "grad_norm": 0.5722272992134094, + "learning_rate": 1.9893617021276595e-07, + "loss": 0.3239, + "step": 188 + }, + { + "epoch": 0.050292708887706226, + "grad_norm": 0.5582234263420105, + "learning_rate": 2e-07, + "loss": 0.3304, + "step": 189 + }, + { + "epoch": 0.05055880787653007, + "grad_norm": 0.5618994235992432, + "learning_rate": 1.999999985738984e-07, + "loss": 0.3261, + "step": 190 + }, + { + "epoch": 0.05082490686535391, + "grad_norm": 0.6032128930091858, + "learning_rate": 1.999999942955936e-07, + "loss": 0.3308, + "step": 191 + }, + { + "epoch": 0.05109100585417775, + "grad_norm": 0.5607643127441406, + "learning_rate": 1.999999871650858e-07, + "loss": 0.3147, + "step": 192 + }, + { + "epoch": 0.0513571048430016, + "grad_norm": 0.5089272260665894, + "learning_rate": 1.9999997718237513e-07, + "loss": 0.2988, + "step": 193 + }, + { + "epoch": 0.05162320383182544, + "grad_norm": 0.5834162831306458, + "learning_rate": 1.999999643474619e-07, + "loss": 0.3288, + "step": 194 + }, + { + "epoch": 0.05188930282064928, + "grad_norm": 0.5043500661849976, + "learning_rate": 1.9999994866034648e-07, + "loss": 0.311, + "step": 195 + }, + { + "epoch": 0.052155401809473124, + "grad_norm": 0.5727211236953735, + "learning_rate": 1.9999993012102934e-07, + "loss": 0.3345, + "step": 196 + }, + { + "epoch": 0.052421500798296966, + "grad_norm": 0.5006299018859863, + "learning_rate": 1.9999990872951097e-07, + "loss": 0.3217, + "step": 197 + }, + { + "epoch": 0.05268759978712081, + "grad_norm": 0.5729879140853882, + "learning_rate": 1.9999988448579197e-07, + "loss": 0.3348, + "step": 198 + }, + { + "epoch": 0.05295369877594465, + "grad_norm": 0.4827020764350891, + "learning_rate": 1.9999985738987307e-07, + "loss": 0.3251, + "step": 199 + }, + { + "epoch": 0.05321979776476849, + "grad_norm": 0.5758902430534363, + "learning_rate": 1.99999827441755e-07, + "loss": 0.3264, + "step": 200 + }, + { + "epoch": 0.05348589675359234, + "grad_norm": 0.5227569341659546, + "learning_rate": 1.9999979464143869e-07, + "loss": 0.3363, + "step": 201 + }, + { + "epoch": 0.05375199574241618, + "grad_norm": 0.47421061992645264, + "learning_rate": 1.99999758988925e-07, + "loss": 0.2946, + "step": 202 + }, + { + "epoch": 0.05401809473124002, + "grad_norm": 0.47599998116493225, + "learning_rate": 1.99999720484215e-07, + "loss": 0.3376, + "step": 203 + }, + { + "epoch": 0.054284193720063864, + "grad_norm": 0.6409503221511841, + "learning_rate": 1.9999967912730973e-07, + "loss": 0.3244, + "step": 204 + }, + { + "epoch": 0.054550292708887706, + "grad_norm": 0.4434550106525421, + "learning_rate": 1.9999963491821044e-07, + "loss": 0.2985, + "step": 205 + }, + { + "epoch": 0.05481639169771155, + "grad_norm": 0.4539810121059418, + "learning_rate": 1.999995878569183e-07, + "loss": 0.2968, + "step": 206 + }, + { + "epoch": 0.05508249068653539, + "grad_norm": 0.460541695356369, + "learning_rate": 1.9999953794343475e-07, + "loss": 0.3163, + "step": 207 + }, + { + "epoch": 0.05534858967535923, + "grad_norm": 0.4842148721218109, + "learning_rate": 1.9999948517776114e-07, + "loss": 0.3188, + "step": 208 + }, + { + "epoch": 0.05561468866418308, + "grad_norm": 0.6233951449394226, + "learning_rate": 1.99999429559899e-07, + "loss": 0.346, + "step": 209 + }, + { + "epoch": 0.05588078765300692, + "grad_norm": 0.48298314213752747, + "learning_rate": 1.9999937108984996e-07, + "loss": 0.3104, + "step": 210 + }, + { + "epoch": 0.05614688664183076, + "grad_norm": 0.48747509717941284, + "learning_rate": 1.9999930976761564e-07, + "loss": 0.3191, + "step": 211 + }, + { + "epoch": 0.056412985630654604, + "grad_norm": 1.0450503826141357, + "learning_rate": 1.9999924559319777e-07, + "loss": 0.2888, + "step": 212 + }, + { + "epoch": 0.056679084619478445, + "grad_norm": 0.4596988260746002, + "learning_rate": 1.9999917856659823e-07, + "loss": 0.314, + "step": 213 + }, + { + "epoch": 0.05694518360830229, + "grad_norm": 0.422661155462265, + "learning_rate": 1.999991086878189e-07, + "loss": 0.3025, + "step": 214 + }, + { + "epoch": 0.05721128259712613, + "grad_norm": 0.4410701096057892, + "learning_rate": 1.999990359568618e-07, + "loss": 0.312, + "step": 215 + }, + { + "epoch": 0.05747738158594997, + "grad_norm": 0.42835283279418945, + "learning_rate": 1.9999896037372897e-07, + "loss": 0.3082, + "step": 216 + }, + { + "epoch": 0.05774348057477382, + "grad_norm": 0.4045667350292206, + "learning_rate": 1.9999888193842257e-07, + "loss": 0.3115, + "step": 217 + }, + { + "epoch": 0.05800957956359766, + "grad_norm": 0.42176464200019836, + "learning_rate": 1.9999880065094485e-07, + "loss": 0.2971, + "step": 218 + }, + { + "epoch": 0.0582756785524215, + "grad_norm": 0.5353208780288696, + "learning_rate": 1.9999871651129814e-07, + "loss": 0.3411, + "step": 219 + }, + { + "epoch": 0.05854177754124534, + "grad_norm": 0.5372373461723328, + "learning_rate": 1.9999862951948482e-07, + "loss": 0.3348, + "step": 220 + }, + { + "epoch": 0.058807876530069185, + "grad_norm": 0.42193472385406494, + "learning_rate": 1.999985396755074e-07, + "loss": 0.3062, + "step": 221 + }, + { + "epoch": 0.05907397551889303, + "grad_norm": 0.4003113806247711, + "learning_rate": 1.9999844697936842e-07, + "loss": 0.2999, + "step": 222 + }, + { + "epoch": 0.05934007450771687, + "grad_norm": 0.46931323409080505, + "learning_rate": 1.9999835143107052e-07, + "loss": 0.3181, + "step": 223 + }, + { + "epoch": 0.05960617349654071, + "grad_norm": 0.4039660394191742, + "learning_rate": 1.999982530306164e-07, + "loss": 0.3048, + "step": 224 + }, + { + "epoch": 0.05987227248536456, + "grad_norm": 0.48130354285240173, + "learning_rate": 1.9999815177800893e-07, + "loss": 0.3532, + "step": 225 + }, + { + "epoch": 0.0601383714741884, + "grad_norm": 0.390566885471344, + "learning_rate": 1.9999804767325096e-07, + "loss": 0.2947, + "step": 226 + }, + { + "epoch": 0.06040447046301224, + "grad_norm": 0.3958500325679779, + "learning_rate": 1.9999794071634546e-07, + "loss": 0.302, + "step": 227 + }, + { + "epoch": 0.06067056945183608, + "grad_norm": 0.5880823731422424, + "learning_rate": 1.9999783090729548e-07, + "loss": 0.3373, + "step": 228 + }, + { + "epoch": 0.060936668440659925, + "grad_norm": 0.40962082147598267, + "learning_rate": 1.9999771824610415e-07, + "loss": 0.2989, + "step": 229 + }, + { + "epoch": 0.061202767429483766, + "grad_norm": 0.4480498731136322, + "learning_rate": 1.9999760273277474e-07, + "loss": 0.3298, + "step": 230 + }, + { + "epoch": 0.06146886641830761, + "grad_norm": 0.44164198637008667, + "learning_rate": 1.9999748436731046e-07, + "loss": 0.3196, + "step": 231 + }, + { + "epoch": 0.061734965407131456, + "grad_norm": 0.4488755762577057, + "learning_rate": 1.9999736314971472e-07, + "loss": 0.3121, + "step": 232 + }, + { + "epoch": 0.0620010643959553, + "grad_norm": 0.3948761522769928, + "learning_rate": 1.9999723907999097e-07, + "loss": 0.2959, + "step": 233 + }, + { + "epoch": 0.06226716338477914, + "grad_norm": 0.43969088792800903, + "learning_rate": 1.999971121581428e-07, + "loss": 0.3074, + "step": 234 + }, + { + "epoch": 0.06253326237360297, + "grad_norm": 0.5704119205474854, + "learning_rate": 1.9999698238417373e-07, + "loss": 0.3217, + "step": 235 + }, + { + "epoch": 0.06279936136242682, + "grad_norm": 0.43683922290802, + "learning_rate": 1.9999684975808755e-07, + "loss": 0.306, + "step": 236 + }, + { + "epoch": 0.06306546035125067, + "grad_norm": 0.45782336592674255, + "learning_rate": 1.99996714279888e-07, + "loss": 0.32, + "step": 237 + }, + { + "epoch": 0.0633315593400745, + "grad_norm": 0.36692261695861816, + "learning_rate": 1.9999657594957897e-07, + "loss": 0.2804, + "step": 238 + }, + { + "epoch": 0.06359765832889835, + "grad_norm": 0.41345006227493286, + "learning_rate": 1.9999643476716436e-07, + "loss": 0.3277, + "step": 239 + }, + { + "epoch": 0.06386375731772219, + "grad_norm": 0.37832555174827576, + "learning_rate": 1.9999629073264827e-07, + "loss": 0.2982, + "step": 240 + }, + { + "epoch": 0.06412985630654604, + "grad_norm": 0.4364365339279175, + "learning_rate": 1.9999614384603475e-07, + "loss": 0.3054, + "step": 241 + }, + { + "epoch": 0.06439595529536987, + "grad_norm": 0.37108147144317627, + "learning_rate": 1.9999599410732799e-07, + "loss": 0.2742, + "step": 242 + }, + { + "epoch": 0.06466205428419372, + "grad_norm": 0.4012935757637024, + "learning_rate": 1.9999584151653228e-07, + "loss": 0.3078, + "step": 243 + }, + { + "epoch": 0.06492815327301757, + "grad_norm": 0.3635095953941345, + "learning_rate": 1.99995686073652e-07, + "loss": 0.2981, + "step": 244 + }, + { + "epoch": 0.0651942522618414, + "grad_norm": 0.4264245927333832, + "learning_rate": 1.9999552777869151e-07, + "loss": 0.2982, + "step": 245 + }, + { + "epoch": 0.06546035125066525, + "grad_norm": 0.4966065287590027, + "learning_rate": 1.9999536663165538e-07, + "loss": 0.3303, + "step": 246 + }, + { + "epoch": 0.06572645023948909, + "grad_norm": 0.429969847202301, + "learning_rate": 1.9999520263254822e-07, + "loss": 0.2851, + "step": 247 + }, + { + "epoch": 0.06599254922831294, + "grad_norm": 0.5703898668289185, + "learning_rate": 1.9999503578137467e-07, + "loss": 0.3042, + "step": 248 + }, + { + "epoch": 0.06625864821713677, + "grad_norm": 0.41282883286476135, + "learning_rate": 1.9999486607813948e-07, + "loss": 0.3039, + "step": 249 + }, + { + "epoch": 0.06652474720596062, + "grad_norm": 0.41089197993278503, + "learning_rate": 1.9999469352284753e-07, + "loss": 0.3009, + "step": 250 + }, + { + "epoch": 0.06679084619478445, + "grad_norm": 0.3605394661426544, + "learning_rate": 1.999945181155037e-07, + "loss": 0.2833, + "step": 251 + }, + { + "epoch": 0.0670569451836083, + "grad_norm": 0.3836910128593445, + "learning_rate": 1.9999433985611303e-07, + "loss": 0.3009, + "step": 252 + }, + { + "epoch": 0.06732304417243215, + "grad_norm": 0.37604981660842896, + "learning_rate": 1.9999415874468058e-07, + "loss": 0.285, + "step": 253 + }, + { + "epoch": 0.06758914316125599, + "grad_norm": 0.4187633693218231, + "learning_rate": 1.9999397478121155e-07, + "loss": 0.31, + "step": 254 + }, + { + "epoch": 0.06785524215007983, + "grad_norm": 0.3745670020580292, + "learning_rate": 1.9999378796571114e-07, + "loss": 0.2968, + "step": 255 + }, + { + "epoch": 0.06812134113890367, + "grad_norm": 0.36284932494163513, + "learning_rate": 1.999935982981847e-07, + "loss": 0.303, + "step": 256 + }, + { + "epoch": 0.06838744012772752, + "grad_norm": 0.37186917662620544, + "learning_rate": 1.9999340577863763e-07, + "loss": 0.3016, + "step": 257 + }, + { + "epoch": 0.06865353911655135, + "grad_norm": 0.40722620487213135, + "learning_rate": 1.9999321040707544e-07, + "loss": 0.308, + "step": 258 + }, + { + "epoch": 0.0689196381053752, + "grad_norm": 0.34554964303970337, + "learning_rate": 1.999930121835037e-07, + "loss": 0.2881, + "step": 259 + }, + { + "epoch": 0.06918573709419905, + "grad_norm": 0.3990630507469177, + "learning_rate": 1.9999281110792805e-07, + "loss": 0.3125, + "step": 260 + }, + { + "epoch": 0.06945183608302288, + "grad_norm": 0.37163981795310974, + "learning_rate": 1.9999260718035424e-07, + "loss": 0.3063, + "step": 261 + }, + { + "epoch": 0.06971793507184673, + "grad_norm": 0.35464754700660706, + "learning_rate": 1.999924004007881e-07, + "loss": 0.2991, + "step": 262 + }, + { + "epoch": 0.06998403406067057, + "grad_norm": 0.3877115249633789, + "learning_rate": 1.9999219076923547e-07, + "loss": 0.3145, + "step": 263 + }, + { + "epoch": 0.07025013304949441, + "grad_norm": 0.3551427125930786, + "learning_rate": 1.9999197828570237e-07, + "loss": 0.2953, + "step": 264 + }, + { + "epoch": 0.07051623203831825, + "grad_norm": 0.36005666851997375, + "learning_rate": 1.9999176295019486e-07, + "loss": 0.2834, + "step": 265 + }, + { + "epoch": 0.0707823310271421, + "grad_norm": 0.3430812656879425, + "learning_rate": 1.999915447627191e-07, + "loss": 0.2691, + "step": 266 + }, + { + "epoch": 0.07104843001596593, + "grad_norm": 0.35199952125549316, + "learning_rate": 1.9999132372328126e-07, + "loss": 0.2938, + "step": 267 + }, + { + "epoch": 0.07131452900478978, + "grad_norm": 0.3484020233154297, + "learning_rate": 1.9999109983188768e-07, + "loss": 0.2836, + "step": 268 + }, + { + "epoch": 0.07158062799361363, + "grad_norm": 0.5120248794555664, + "learning_rate": 1.9999087308854477e-07, + "loss": 0.3241, + "step": 269 + }, + { + "epoch": 0.07184672698243746, + "grad_norm": 0.42735210061073303, + "learning_rate": 1.9999064349325893e-07, + "loss": 0.3154, + "step": 270 + }, + { + "epoch": 0.07211282597126131, + "grad_norm": 0.3770093321800232, + "learning_rate": 1.9999041104603678e-07, + "loss": 0.3069, + "step": 271 + }, + { + "epoch": 0.07237892496008515, + "grad_norm": 0.4287719130516052, + "learning_rate": 1.9999017574688489e-07, + "loss": 0.2991, + "step": 272 + }, + { + "epoch": 0.072645023948909, + "grad_norm": 0.3381522595882416, + "learning_rate": 1.9998993759581e-07, + "loss": 0.2927, + "step": 273 + }, + { + "epoch": 0.07291112293773283, + "grad_norm": 0.40846511721611023, + "learning_rate": 1.999896965928189e-07, + "loss": 0.3184, + "step": 274 + }, + { + "epoch": 0.07317722192655668, + "grad_norm": 0.3467021882534027, + "learning_rate": 1.9998945273791848e-07, + "loss": 0.282, + "step": 275 + }, + { + "epoch": 0.07344332091538053, + "grad_norm": 0.4188089966773987, + "learning_rate": 1.999892060311157e-07, + "loss": 0.2981, + "step": 276 + }, + { + "epoch": 0.07370941990420436, + "grad_norm": 0.46843117475509644, + "learning_rate": 1.9998895647241754e-07, + "loss": 0.3329, + "step": 277 + }, + { + "epoch": 0.07397551889302821, + "grad_norm": 0.3426223695278168, + "learning_rate": 1.9998870406183115e-07, + "loss": 0.3018, + "step": 278 + }, + { + "epoch": 0.07424161788185205, + "grad_norm": 0.45952826738357544, + "learning_rate": 1.9998844879936375e-07, + "loss": 0.302, + "step": 279 + }, + { + "epoch": 0.0745077168706759, + "grad_norm": 0.3459342420101166, + "learning_rate": 1.9998819068502257e-07, + "loss": 0.3149, + "step": 280 + }, + { + "epoch": 0.07477381585949973, + "grad_norm": 0.4867733120918274, + "learning_rate": 1.9998792971881504e-07, + "loss": 0.3187, + "step": 281 + }, + { + "epoch": 0.07503991484832358, + "grad_norm": 0.31014737486839294, + "learning_rate": 1.9998766590074858e-07, + "loss": 0.259, + "step": 282 + }, + { + "epoch": 0.07530601383714741, + "grad_norm": 0.3680420219898224, + "learning_rate": 1.999873992308307e-07, + "loss": 0.3039, + "step": 283 + }, + { + "epoch": 0.07557211282597126, + "grad_norm": 0.7058132290840149, + "learning_rate": 1.9998712970906898e-07, + "loss": 0.2768, + "step": 284 + }, + { + "epoch": 0.07583821181479511, + "grad_norm": 0.5142005681991577, + "learning_rate": 1.9998685733547117e-07, + "loss": 0.3191, + "step": 285 + }, + { + "epoch": 0.07610431080361894, + "grad_norm": 0.33594396710395813, + "learning_rate": 1.9998658211004498e-07, + "loss": 0.2887, + "step": 286 + }, + { + "epoch": 0.07637040979244279, + "grad_norm": 0.343458354473114, + "learning_rate": 1.999863040327983e-07, + "loss": 0.2866, + "step": 287 + }, + { + "epoch": 0.07663650878126663, + "grad_norm": 0.34397098422050476, + "learning_rate": 1.9998602310373904e-07, + "loss": 0.3002, + "step": 288 + }, + { + "epoch": 0.07690260777009048, + "grad_norm": 0.37085914611816406, + "learning_rate": 1.999857393228752e-07, + "loss": 0.2905, + "step": 289 + }, + { + "epoch": 0.07716870675891431, + "grad_norm": 0.41271668672561646, + "learning_rate": 1.9998545269021493e-07, + "loss": 0.3126, + "step": 290 + }, + { + "epoch": 0.07743480574773816, + "grad_norm": 0.44847846031188965, + "learning_rate": 1.9998516320576636e-07, + "loss": 0.2792, + "step": 291 + }, + { + "epoch": 0.07770090473656201, + "grad_norm": 0.4561721086502075, + "learning_rate": 1.9998487086953777e-07, + "loss": 0.2864, + "step": 292 + }, + { + "epoch": 0.07796700372538584, + "grad_norm": 0.3448788821697235, + "learning_rate": 1.9998457568153745e-07, + "loss": 0.2957, + "step": 293 + }, + { + "epoch": 0.07823310271420969, + "grad_norm": 0.3823447823524475, + "learning_rate": 1.9998427764177389e-07, + "loss": 0.2946, + "step": 294 + }, + { + "epoch": 0.07849920170303352, + "grad_norm": 0.3295064866542816, + "learning_rate": 1.9998397675025553e-07, + "loss": 0.2823, + "step": 295 + }, + { + "epoch": 0.07876530069185737, + "grad_norm": 0.4076249301433563, + "learning_rate": 1.9998367300699096e-07, + "loss": 0.3026, + "step": 296 + }, + { + "epoch": 0.07903139968068121, + "grad_norm": 0.35823214054107666, + "learning_rate": 1.999833664119889e-07, + "loss": 0.3202, + "step": 297 + }, + { + "epoch": 0.07929749866950506, + "grad_norm": 0.3138267397880554, + "learning_rate": 1.9998305696525804e-07, + "loss": 0.2653, + "step": 298 + }, + { + "epoch": 0.0795635976583289, + "grad_norm": 0.34390249848365784, + "learning_rate": 1.999827446668072e-07, + "loss": 0.2758, + "step": 299 + }, + { + "epoch": 0.07982969664715274, + "grad_norm": 0.39423879981040955, + "learning_rate": 1.9998242951664533e-07, + "loss": 0.3051, + "step": 300 + }, + { + "epoch": 0.08009579563597659, + "grad_norm": 0.4985777735710144, + "learning_rate": 1.9998211151478136e-07, + "loss": 0.304, + "step": 301 + }, + { + "epoch": 0.08036189462480042, + "grad_norm": 0.38002440333366394, + "learning_rate": 1.9998179066122444e-07, + "loss": 0.2986, + "step": 302 + }, + { + "epoch": 0.08062799361362427, + "grad_norm": 0.3011569678783417, + "learning_rate": 1.9998146695598366e-07, + "loss": 0.263, + "step": 303 + }, + { + "epoch": 0.0808940926024481, + "grad_norm": 0.34786179661750793, + "learning_rate": 1.9998114039906827e-07, + "loss": 0.2932, + "step": 304 + }, + { + "epoch": 0.08116019159127195, + "grad_norm": 0.3467542231082916, + "learning_rate": 1.9998081099048758e-07, + "loss": 0.3002, + "step": 305 + }, + { + "epoch": 0.08142629058009579, + "grad_norm": 0.319053053855896, + "learning_rate": 1.99980478730251e-07, + "loss": 0.2875, + "step": 306 + }, + { + "epoch": 0.08169238956891964, + "grad_norm": 0.32629337906837463, + "learning_rate": 1.99980143618368e-07, + "loss": 0.2906, + "step": 307 + }, + { + "epoch": 0.08195848855774349, + "grad_norm": 0.39316895604133606, + "learning_rate": 1.999798056548481e-07, + "loss": 0.2826, + "step": 308 + }, + { + "epoch": 0.08222458754656732, + "grad_norm": 0.3926166594028473, + "learning_rate": 1.9997946483970104e-07, + "loss": 0.3126, + "step": 309 + }, + { + "epoch": 0.08249068653539117, + "grad_norm": 0.34699153900146484, + "learning_rate": 1.9997912117293642e-07, + "loss": 0.2927, + "step": 310 + }, + { + "epoch": 0.082756785524215, + "grad_norm": 0.3089158535003662, + "learning_rate": 1.999787746545641e-07, + "loss": 0.2808, + "step": 311 + }, + { + "epoch": 0.08302288451303885, + "grad_norm": 0.32552769780158997, + "learning_rate": 1.9997842528459396e-07, + "loss": 0.3065, + "step": 312 + }, + { + "epoch": 0.08328898350186269, + "grad_norm": 0.43554094433784485, + "learning_rate": 1.99978073063036e-07, + "loss": 0.3005, + "step": 313 + }, + { + "epoch": 0.08355508249068654, + "grad_norm": 0.45646440982818604, + "learning_rate": 1.9997771798990022e-07, + "loss": 0.322, + "step": 314 + }, + { + "epoch": 0.08382118147951038, + "grad_norm": 0.29909947514533997, + "learning_rate": 1.9997736006519673e-07, + "loss": 0.2845, + "step": 315 + }, + { + "epoch": 0.08408728046833422, + "grad_norm": 0.41563403606414795, + "learning_rate": 1.999769992889358e-07, + "loss": 0.2878, + "step": 316 + }, + { + "epoch": 0.08435337945715807, + "grad_norm": 0.32387417554855347, + "learning_rate": 1.9997663566112766e-07, + "loss": 0.2921, + "step": 317 + }, + { + "epoch": 0.0846194784459819, + "grad_norm": 0.30931544303894043, + "learning_rate": 1.999762691817827e-07, + "loss": 0.2644, + "step": 318 + }, + { + "epoch": 0.08488557743480575, + "grad_norm": 0.4611895680427551, + "learning_rate": 1.999758998509114e-07, + "loss": 0.285, + "step": 319 + }, + { + "epoch": 0.08515167642362959, + "grad_norm": 0.36586201190948486, + "learning_rate": 1.999755276685243e-07, + "loss": 0.2822, + "step": 320 + }, + { + "epoch": 0.08541777541245343, + "grad_norm": 0.41241025924682617, + "learning_rate": 1.9997515263463196e-07, + "loss": 0.3089, + "step": 321 + }, + { + "epoch": 0.08568387440127727, + "grad_norm": 0.3163708746433258, + "learning_rate": 1.999747747492451e-07, + "loss": 0.2844, + "step": 322 + }, + { + "epoch": 0.08594997339010112, + "grad_norm": 0.38741981983184814, + "learning_rate": 1.9997439401237456e-07, + "loss": 0.3047, + "step": 323 + }, + { + "epoch": 0.08621607237892497, + "grad_norm": 0.4101684093475342, + "learning_rate": 1.9997401042403107e-07, + "loss": 0.2993, + "step": 324 + }, + { + "epoch": 0.0864821713677488, + "grad_norm": 0.449336975812912, + "learning_rate": 1.999736239842257e-07, + "loss": 0.3046, + "step": 325 + }, + { + "epoch": 0.08674827035657265, + "grad_norm": 0.2982255816459656, + "learning_rate": 1.999732346929694e-07, + "loss": 0.2761, + "step": 326 + }, + { + "epoch": 0.08701436934539648, + "grad_norm": 0.31293806433677673, + "learning_rate": 1.9997284255027326e-07, + "loss": 0.2812, + "step": 327 + }, + { + "epoch": 0.08728046833422033, + "grad_norm": 0.3016981780529022, + "learning_rate": 1.9997244755614854e-07, + "loss": 0.2675, + "step": 328 + }, + { + "epoch": 0.08754656732304417, + "grad_norm": 0.3328494429588318, + "learning_rate": 1.9997204971060646e-07, + "loss": 0.2857, + "step": 329 + }, + { + "epoch": 0.08781266631186802, + "grad_norm": 0.33516526222229004, + "learning_rate": 1.9997164901365836e-07, + "loss": 0.3017, + "step": 330 + }, + { + "epoch": 0.08807876530069186, + "grad_norm": 0.3227655291557312, + "learning_rate": 1.9997124546531567e-07, + "loss": 0.3062, + "step": 331 + }, + { + "epoch": 0.0883448642895157, + "grad_norm": 0.3984590470790863, + "learning_rate": 1.9997083906558993e-07, + "loss": 0.2758, + "step": 332 + }, + { + "epoch": 0.08861096327833955, + "grad_norm": 0.4096013009548187, + "learning_rate": 1.9997042981449267e-07, + "loss": 0.2983, + "step": 333 + }, + { + "epoch": 0.08887706226716338, + "grad_norm": 0.33598387241363525, + "learning_rate": 1.9997001771203563e-07, + "loss": 0.2829, + "step": 334 + }, + { + "epoch": 0.08914316125598723, + "grad_norm": 0.29417914152145386, + "learning_rate": 1.9996960275823054e-07, + "loss": 0.2644, + "step": 335 + }, + { + "epoch": 0.08940926024481106, + "grad_norm": 0.35586193203926086, + "learning_rate": 1.9996918495308922e-07, + "loss": 0.2882, + "step": 336 + }, + { + "epoch": 0.08967535923363491, + "grad_norm": 0.31254318356513977, + "learning_rate": 1.999687642966236e-07, + "loss": 0.2722, + "step": 337 + }, + { + "epoch": 0.08994145822245875, + "grad_norm": 0.29660764336586, + "learning_rate": 1.9996834078884567e-07, + "loss": 0.2796, + "step": 338 + }, + { + "epoch": 0.0902075572112826, + "grad_norm": 0.3193124532699585, + "learning_rate": 1.9996791442976754e-07, + "loss": 0.2753, + "step": 339 + }, + { + "epoch": 0.09047365620010644, + "grad_norm": 0.6787993907928467, + "learning_rate": 1.9996748521940133e-07, + "loss": 0.282, + "step": 340 + }, + { + "epoch": 0.09073975518893028, + "grad_norm": 0.5048204064369202, + "learning_rate": 1.999670531577593e-07, + "loss": 0.3045, + "step": 341 + }, + { + "epoch": 0.09100585417775413, + "grad_norm": 0.3206119239330292, + "learning_rate": 1.9996661824485374e-07, + "loss": 0.2753, + "step": 342 + }, + { + "epoch": 0.09127195316657796, + "grad_norm": 0.3311781883239746, + "learning_rate": 1.9996618048069712e-07, + "loss": 0.2953, + "step": 343 + }, + { + "epoch": 0.09153805215540181, + "grad_norm": 0.32599347829818726, + "learning_rate": 1.999657398653019e-07, + "loss": 0.2744, + "step": 344 + }, + { + "epoch": 0.09180415114422565, + "grad_norm": 0.3190529942512512, + "learning_rate": 1.9996529639868061e-07, + "loss": 0.3063, + "step": 345 + }, + { + "epoch": 0.0920702501330495, + "grad_norm": 0.3398551046848297, + "learning_rate": 1.9996485008084595e-07, + "loss": 0.289, + "step": 346 + }, + { + "epoch": 0.09233634912187334, + "grad_norm": 0.40479332208633423, + "learning_rate": 1.999644009118106e-07, + "loss": 0.2798, + "step": 347 + }, + { + "epoch": 0.09260244811069718, + "grad_norm": 0.3014296889305115, + "learning_rate": 1.9996394889158746e-07, + "loss": 0.2838, + "step": 348 + }, + { + "epoch": 0.09286854709952103, + "grad_norm": 0.31375184655189514, + "learning_rate": 1.9996349402018932e-07, + "loss": 0.2938, + "step": 349 + }, + { + "epoch": 0.09313464608834486, + "grad_norm": 0.2849459648132324, + "learning_rate": 1.999630362976292e-07, + "loss": 0.254, + "step": 350 + }, + { + "epoch": 0.09340074507716871, + "grad_norm": 0.437877357006073, + "learning_rate": 1.999625757239201e-07, + "loss": 0.283, + "step": 351 + }, + { + "epoch": 0.09366684406599254, + "grad_norm": 0.31292033195495605, + "learning_rate": 1.9996211229907525e-07, + "loss": 0.2723, + "step": 352 + }, + { + "epoch": 0.09393294305481639, + "grad_norm": 0.36729303002357483, + "learning_rate": 1.9996164602310784e-07, + "loss": 0.2832, + "step": 353 + }, + { + "epoch": 0.09419904204364024, + "grad_norm": 0.27809563279151917, + "learning_rate": 1.9996117689603113e-07, + "loss": 0.2687, + "step": 354 + }, + { + "epoch": 0.09446514103246408, + "grad_norm": 0.3828507661819458, + "learning_rate": 1.9996070491785852e-07, + "loss": 0.2942, + "step": 355 + }, + { + "epoch": 0.09473124002128792, + "grad_norm": 0.2945742905139923, + "learning_rate": 1.999602300886035e-07, + "loss": 0.2825, + "step": 356 + }, + { + "epoch": 0.09499733901011176, + "grad_norm": 0.3191017806529999, + "learning_rate": 1.9995975240827957e-07, + "loss": 0.2839, + "step": 357 + }, + { + "epoch": 0.09526343799893561, + "grad_norm": 0.2864844501018524, + "learning_rate": 1.9995927187690042e-07, + "loss": 0.2848, + "step": 358 + }, + { + "epoch": 0.09552953698775944, + "grad_norm": 0.4083368480205536, + "learning_rate": 1.9995878849447965e-07, + "loss": 0.3103, + "step": 359 + }, + { + "epoch": 0.09579563597658329, + "grad_norm": 0.3540007472038269, + "learning_rate": 1.9995830226103113e-07, + "loss": 0.2731, + "step": 360 + }, + { + "epoch": 0.09606173496540713, + "grad_norm": 0.47219032049179077, + "learning_rate": 1.9995781317656868e-07, + "loss": 0.3159, + "step": 361 + }, + { + "epoch": 0.09632783395423097, + "grad_norm": 0.2831895649433136, + "learning_rate": 1.999573212411063e-07, + "loss": 0.2707, + "step": 362 + }, + { + "epoch": 0.09659393294305482, + "grad_norm": 0.33914685249328613, + "learning_rate": 1.99956826454658e-07, + "loss": 0.3104, + "step": 363 + }, + { + "epoch": 0.09686003193187866, + "grad_norm": 0.3105792999267578, + "learning_rate": 1.9995632881723787e-07, + "loss": 0.2848, + "step": 364 + }, + { + "epoch": 0.0971261309207025, + "grad_norm": 0.3359394371509552, + "learning_rate": 1.9995582832886012e-07, + "loss": 0.2787, + "step": 365 + }, + { + "epoch": 0.09739222990952634, + "grad_norm": 0.36433136463165283, + "learning_rate": 1.9995532498953907e-07, + "loss": 0.3011, + "step": 366 + }, + { + "epoch": 0.09765832889835019, + "grad_norm": 0.307444304227829, + "learning_rate": 1.99954818799289e-07, + "loss": 0.2804, + "step": 367 + }, + { + "epoch": 0.09792442788717402, + "grad_norm": 0.30974921584129333, + "learning_rate": 1.9995430975812435e-07, + "loss": 0.2817, + "step": 368 + }, + { + "epoch": 0.09819052687599787, + "grad_norm": 0.3535693287849426, + "learning_rate": 1.999537978660597e-07, + "loss": 0.2982, + "step": 369 + }, + { + "epoch": 0.09845662586482172, + "grad_norm": 0.35360223054885864, + "learning_rate": 1.999532831231096e-07, + "loss": 0.2878, + "step": 370 + }, + { + "epoch": 0.09872272485364555, + "grad_norm": 0.4999409019947052, + "learning_rate": 1.999527655292888e-07, + "loss": 0.2715, + "step": 371 + }, + { + "epoch": 0.0989888238424694, + "grad_norm": 0.28085625171661377, + "learning_rate": 1.9995224508461198e-07, + "loss": 0.2642, + "step": 372 + }, + { + "epoch": 0.09925492283129324, + "grad_norm": 0.29860037565231323, + "learning_rate": 1.99951721789094e-07, + "loss": 0.2738, + "step": 373 + }, + { + "epoch": 0.09952102182011709, + "grad_norm": 0.30500414967536926, + "learning_rate": 1.999511956427498e-07, + "loss": 0.2903, + "step": 374 + }, + { + "epoch": 0.09978712080894092, + "grad_norm": 0.31424036622047424, + "learning_rate": 1.9995066664559442e-07, + "loss": 0.2836, + "step": 375 + }, + { + "epoch": 0.10005321979776477, + "grad_norm": 0.36487463116645813, + "learning_rate": 1.999501347976429e-07, + "loss": 0.287, + "step": 376 + }, + { + "epoch": 0.1003193187865886, + "grad_norm": 0.29266953468322754, + "learning_rate": 1.9994960009891044e-07, + "loss": 0.2651, + "step": 377 + }, + { + "epoch": 0.10058541777541245, + "grad_norm": 0.2890806198120117, + "learning_rate": 1.9994906254941228e-07, + "loss": 0.2704, + "step": 378 + }, + { + "epoch": 0.1008515167642363, + "grad_norm": 0.4202888309955597, + "learning_rate": 1.9994852214916375e-07, + "loss": 0.2904, + "step": 379 + }, + { + "epoch": 0.10111761575306014, + "grad_norm": 0.3482842743396759, + "learning_rate": 1.9994797889818025e-07, + "loss": 0.2828, + "step": 380 + }, + { + "epoch": 0.10138371474188398, + "grad_norm": 0.27966347336769104, + "learning_rate": 1.9994743279647727e-07, + "loss": 0.27, + "step": 381 + }, + { + "epoch": 0.10164981373070782, + "grad_norm": 0.3000625967979431, + "learning_rate": 1.9994688384407046e-07, + "loss": 0.2863, + "step": 382 + }, + { + "epoch": 0.10191591271953167, + "grad_norm": 0.3122108280658722, + "learning_rate": 1.9994633204097537e-07, + "loss": 0.2647, + "step": 383 + }, + { + "epoch": 0.1021820117083555, + "grad_norm": 0.30902203917503357, + "learning_rate": 1.9994577738720782e-07, + "loss": 0.2815, + "step": 384 + }, + { + "epoch": 0.10244811069717935, + "grad_norm": 0.2954978346824646, + "learning_rate": 1.9994521988278357e-07, + "loss": 0.2879, + "step": 385 + }, + { + "epoch": 0.1027142096860032, + "grad_norm": 0.27365076541900635, + "learning_rate": 1.9994465952771856e-07, + "loss": 0.2579, + "step": 386 + }, + { + "epoch": 0.10298030867482703, + "grad_norm": 0.34371069073677063, + "learning_rate": 1.9994409632202878e-07, + "loss": 0.2781, + "step": 387 + }, + { + "epoch": 0.10324640766365088, + "grad_norm": 0.30911436676979065, + "learning_rate": 1.9994353026573027e-07, + "loss": 0.2638, + "step": 388 + }, + { + "epoch": 0.10351250665247472, + "grad_norm": 0.5227352380752563, + "learning_rate": 1.9994296135883918e-07, + "loss": 0.2919, + "step": 389 + }, + { + "epoch": 0.10377860564129857, + "grad_norm": 0.3007265627384186, + "learning_rate": 1.9994238960137173e-07, + "loss": 0.2903, + "step": 390 + }, + { + "epoch": 0.1040447046301224, + "grad_norm": 0.332045316696167, + "learning_rate": 1.9994181499334425e-07, + "loss": 0.2708, + "step": 391 + }, + { + "epoch": 0.10431080361894625, + "grad_norm": 0.29829713702201843, + "learning_rate": 1.999412375347731e-07, + "loss": 0.2827, + "step": 392 + }, + { + "epoch": 0.10457690260777008, + "grad_norm": 0.293899267911911, + "learning_rate": 1.9994065722567478e-07, + "loss": 0.2715, + "step": 393 + }, + { + "epoch": 0.10484300159659393, + "grad_norm": 0.3169609010219574, + "learning_rate": 1.9994007406606582e-07, + "loss": 0.2722, + "step": 394 + }, + { + "epoch": 0.10510910058541778, + "grad_norm": 0.29608577489852905, + "learning_rate": 1.9993948805596286e-07, + "loss": 0.2737, + "step": 395 + }, + { + "epoch": 0.10537519957424162, + "grad_norm": 0.28176188468933105, + "learning_rate": 1.9993889919538263e-07, + "loss": 0.2627, + "step": 396 + }, + { + "epoch": 0.10564129856306546, + "grad_norm": 0.33564287424087524, + "learning_rate": 1.9993830748434189e-07, + "loss": 0.2773, + "step": 397 + }, + { + "epoch": 0.1059073975518893, + "grad_norm": 0.2943306565284729, + "learning_rate": 1.9993771292285752e-07, + "loss": 0.2751, + "step": 398 + }, + { + "epoch": 0.10617349654071315, + "grad_norm": 0.31558758020401, + "learning_rate": 1.9993711551094652e-07, + "loss": 0.268, + "step": 399 + }, + { + "epoch": 0.10643959552953698, + "grad_norm": 0.29902830719947815, + "learning_rate": 1.999365152486259e-07, + "loss": 0.2896, + "step": 400 + }, + { + "epoch": 0.10670569451836083, + "grad_norm": 0.2782115340232849, + "learning_rate": 1.9993591213591276e-07, + "loss": 0.2649, + "step": 401 + }, + { + "epoch": 0.10697179350718468, + "grad_norm": 0.4040801525115967, + "learning_rate": 1.9993530617282434e-07, + "loss": 0.3255, + "step": 402 + }, + { + "epoch": 0.10723789249600851, + "grad_norm": 0.301345556974411, + "learning_rate": 1.9993469735937793e-07, + "loss": 0.2691, + "step": 403 + }, + { + "epoch": 0.10750399148483236, + "grad_norm": 0.3060532510280609, + "learning_rate": 1.9993408569559088e-07, + "loss": 0.2758, + "step": 404 + }, + { + "epoch": 0.1077700904736562, + "grad_norm": 0.3033914864063263, + "learning_rate": 1.999334711814806e-07, + "loss": 0.2773, + "step": 405 + }, + { + "epoch": 0.10803618946248005, + "grad_norm": 0.4201382100582123, + "learning_rate": 1.9993285381706467e-07, + "loss": 0.2891, + "step": 406 + }, + { + "epoch": 0.10830228845130388, + "grad_norm": 0.33336341381073, + "learning_rate": 1.9993223360236064e-07, + "loss": 0.2729, + "step": 407 + }, + { + "epoch": 0.10856838744012773, + "grad_norm": 0.27830955386161804, + "learning_rate": 1.9993161053738627e-07, + "loss": 0.2511, + "step": 408 + }, + { + "epoch": 0.10883448642895158, + "grad_norm": 0.31467404961586, + "learning_rate": 1.999309846221593e-07, + "loss": 0.2998, + "step": 409 + }, + { + "epoch": 0.10910058541777541, + "grad_norm": 0.29825207591056824, + "learning_rate": 1.9993035585669755e-07, + "loss": 0.2967, + "step": 410 + }, + { + "epoch": 0.10936668440659926, + "grad_norm": 0.3623383045196533, + "learning_rate": 1.99929724241019e-07, + "loss": 0.262, + "step": 411 + }, + { + "epoch": 0.1096327833954231, + "grad_norm": 0.37793198227882385, + "learning_rate": 1.9992908977514166e-07, + "loss": 0.2838, + "step": 412 + }, + { + "epoch": 0.10989888238424694, + "grad_norm": 0.2902279198169708, + "learning_rate": 1.9992845245908359e-07, + "loss": 0.245, + "step": 413 + }, + { + "epoch": 0.11016498137307078, + "grad_norm": 0.282109797000885, + "learning_rate": 1.99927812292863e-07, + "loss": 0.2937, + "step": 414 + }, + { + "epoch": 0.11043108036189463, + "grad_norm": 0.47328996658325195, + "learning_rate": 1.9992716927649815e-07, + "loss": 0.2803, + "step": 415 + }, + { + "epoch": 0.11069717935071846, + "grad_norm": 0.2967725694179535, + "learning_rate": 1.999265234100074e-07, + "loss": 0.2654, + "step": 416 + }, + { + "epoch": 0.11096327833954231, + "grad_norm": 0.4182688593864441, + "learning_rate": 1.9992587469340908e-07, + "loss": 0.2758, + "step": 417 + }, + { + "epoch": 0.11122937732836616, + "grad_norm": 0.2938145399093628, + "learning_rate": 1.9992522312672177e-07, + "loss": 0.2654, + "step": 418 + }, + { + "epoch": 0.11149547631718999, + "grad_norm": 0.5359379649162292, + "learning_rate": 1.9992456870996404e-07, + "loss": 0.3051, + "step": 419 + }, + { + "epoch": 0.11176157530601384, + "grad_norm": 0.30653539299964905, + "learning_rate": 1.9992391144315457e-07, + "loss": 0.2588, + "step": 420 + }, + { + "epoch": 0.11202767429483768, + "grad_norm": 0.3093917667865753, + "learning_rate": 1.999232513263121e-07, + "loss": 0.268, + "step": 421 + }, + { + "epoch": 0.11229377328366152, + "grad_norm": 0.295555055141449, + "learning_rate": 1.9992258835945542e-07, + "loss": 0.2785, + "step": 422 + }, + { + "epoch": 0.11255987227248536, + "grad_norm": 0.31814059615135193, + "learning_rate": 1.9992192254260347e-07, + "loss": 0.2821, + "step": 423 + }, + { + "epoch": 0.11282597126130921, + "grad_norm": 0.38462159037590027, + "learning_rate": 1.9992125387577526e-07, + "loss": 0.2721, + "step": 424 + }, + { + "epoch": 0.11309207025013306, + "grad_norm": 0.2940784692764282, + "learning_rate": 1.9992058235898982e-07, + "loss": 0.2715, + "step": 425 + }, + { + "epoch": 0.11335816923895689, + "grad_norm": 0.3299471139907837, + "learning_rate": 1.9991990799226634e-07, + "loss": 0.2572, + "step": 426 + }, + { + "epoch": 0.11362426822778074, + "grad_norm": 0.3037220537662506, + "learning_rate": 1.9991923077562404e-07, + "loss": 0.2786, + "step": 427 + }, + { + "epoch": 0.11389036721660457, + "grad_norm": 0.29868632555007935, + "learning_rate": 1.9991855070908218e-07, + "loss": 0.2698, + "step": 428 + }, + { + "epoch": 0.11415646620542842, + "grad_norm": 0.3113382160663605, + "learning_rate": 1.9991786779266026e-07, + "loss": 0.2855, + "step": 429 + }, + { + "epoch": 0.11442256519425226, + "grad_norm": 0.2877894341945648, + "learning_rate": 1.9991718202637772e-07, + "loss": 0.2622, + "step": 430 + }, + { + "epoch": 0.1146886641830761, + "grad_norm": 0.3610289394855499, + "learning_rate": 1.9991649341025407e-07, + "loss": 0.294, + "step": 431 + }, + { + "epoch": 0.11495476317189994, + "grad_norm": 0.2824068069458008, + "learning_rate": 1.99915801944309e-07, + "loss": 0.2567, + "step": 432 + }, + { + "epoch": 0.11522086216072379, + "grad_norm": 0.2857569456100464, + "learning_rate": 1.999151076285622e-07, + "loss": 0.2774, + "step": 433 + }, + { + "epoch": 0.11548696114954764, + "grad_norm": 0.29554203152656555, + "learning_rate": 1.9991441046303353e-07, + "loss": 0.2788, + "step": 434 + }, + { + "epoch": 0.11575306013837147, + "grad_norm": 0.4392849802970886, + "learning_rate": 1.9991371044774284e-07, + "loss": 0.2722, + "step": 435 + }, + { + "epoch": 0.11601915912719532, + "grad_norm": 0.3891715705394745, + "learning_rate": 1.9991300758271007e-07, + "loss": 0.2839, + "step": 436 + }, + { + "epoch": 0.11628525811601916, + "grad_norm": 0.4232847988605499, + "learning_rate": 1.999123018679553e-07, + "loss": 0.3027, + "step": 437 + }, + { + "epoch": 0.116551357104843, + "grad_norm": 0.2973833382129669, + "learning_rate": 1.9991159330349863e-07, + "loss": 0.2676, + "step": 438 + }, + { + "epoch": 0.11681745609366684, + "grad_norm": 0.298066645860672, + "learning_rate": 1.999108818893603e-07, + "loss": 0.2817, + "step": 439 + }, + { + "epoch": 0.11708355508249069, + "grad_norm": 0.305722713470459, + "learning_rate": 1.999101676255606e-07, + "loss": 0.2757, + "step": 440 + }, + { + "epoch": 0.11734965407131454, + "grad_norm": 0.2769024074077606, + "learning_rate": 1.999094505121199e-07, + "loss": 0.2722, + "step": 441 + }, + { + "epoch": 0.11761575306013837, + "grad_norm": 0.4154115319252014, + "learning_rate": 1.9990873054905863e-07, + "loss": 0.273, + "step": 442 + }, + { + "epoch": 0.11788185204896222, + "grad_norm": 0.28346484899520874, + "learning_rate": 1.9990800773639736e-07, + "loss": 0.2569, + "step": 443 + }, + { + "epoch": 0.11814795103778605, + "grad_norm": 0.39686375856399536, + "learning_rate": 1.9990728207415666e-07, + "loss": 0.2813, + "step": 444 + }, + { + "epoch": 0.1184140500266099, + "grad_norm": 0.29169395565986633, + "learning_rate": 1.9990655356235725e-07, + "loss": 0.2776, + "step": 445 + }, + { + "epoch": 0.11868014901543374, + "grad_norm": 0.29662951827049255, + "learning_rate": 1.9990582220101993e-07, + "loss": 0.2845, + "step": 446 + }, + { + "epoch": 0.11894624800425758, + "grad_norm": 0.3459962010383606, + "learning_rate": 1.9990508799016554e-07, + "loss": 0.2719, + "step": 447 + }, + { + "epoch": 0.11921234699308142, + "grad_norm": 0.3072816729545593, + "learning_rate": 1.9990435092981503e-07, + "loss": 0.2615, + "step": 448 + }, + { + "epoch": 0.11947844598190527, + "grad_norm": 0.3086475729942322, + "learning_rate": 1.999036110199894e-07, + "loss": 0.2903, + "step": 449 + }, + { + "epoch": 0.11974454497072912, + "grad_norm": 0.4996224641799927, + "learning_rate": 1.999028682607098e-07, + "loss": 0.282, + "step": 450 + }, + { + "epoch": 0.12001064395955295, + "grad_norm": 0.27780818939208984, + "learning_rate": 1.9990212265199734e-07, + "loss": 0.2651, + "step": 451 + }, + { + "epoch": 0.1202767429483768, + "grad_norm": 0.2778901159763336, + "learning_rate": 1.9990137419387337e-07, + "loss": 0.2503, + "step": 452 + }, + { + "epoch": 0.12054284193720063, + "grad_norm": 0.27587762475013733, + "learning_rate": 1.999006228863592e-07, + "loss": 0.2638, + "step": 453 + }, + { + "epoch": 0.12080894092602448, + "grad_norm": 0.2893401086330414, + "learning_rate": 1.9989986872947622e-07, + "loss": 0.2639, + "step": 454 + }, + { + "epoch": 0.12107503991484832, + "grad_norm": 0.26800087094306946, + "learning_rate": 1.9989911172324598e-07, + "loss": 0.255, + "step": 455 + }, + { + "epoch": 0.12134113890367217, + "grad_norm": 0.3203968107700348, + "learning_rate": 1.998983518676901e-07, + "loss": 0.2558, + "step": 456 + }, + { + "epoch": 0.12160723789249601, + "grad_norm": 0.3036022484302521, + "learning_rate": 1.9989758916283022e-07, + "loss": 0.278, + "step": 457 + }, + { + "epoch": 0.12187333688131985, + "grad_norm": 0.388261616230011, + "learning_rate": 1.9989682360868807e-07, + "loss": 0.2758, + "step": 458 + }, + { + "epoch": 0.1221394358701437, + "grad_norm": 0.4354883134365082, + "learning_rate": 1.9989605520528555e-07, + "loss": 0.2661, + "step": 459 + }, + { + "epoch": 0.12240553485896753, + "grad_norm": 0.32492223381996155, + "learning_rate": 1.998952839526445e-07, + "loss": 0.2674, + "step": 460 + }, + { + "epoch": 0.12267163384779138, + "grad_norm": 0.28603094816207886, + "learning_rate": 1.9989450985078697e-07, + "loss": 0.2784, + "step": 461 + }, + { + "epoch": 0.12293773283661522, + "grad_norm": 0.3660326302051544, + "learning_rate": 1.99893732899735e-07, + "loss": 0.3091, + "step": 462 + }, + { + "epoch": 0.12320383182543906, + "grad_norm": 0.2604646384716034, + "learning_rate": 1.9989295309951082e-07, + "loss": 0.2445, + "step": 463 + }, + { + "epoch": 0.12346993081426291, + "grad_norm": 0.3149201273918152, + "learning_rate": 1.9989217045013658e-07, + "loss": 0.2632, + "step": 464 + }, + { + "epoch": 0.12373602980308675, + "grad_norm": 0.28996583819389343, + "learning_rate": 1.9989138495163468e-07, + "loss": 0.2705, + "step": 465 + }, + { + "epoch": 0.1240021287919106, + "grad_norm": 0.44175297021865845, + "learning_rate": 1.998905966040275e-07, + "loss": 0.2814, + "step": 466 + }, + { + "epoch": 0.12426822778073443, + "grad_norm": 0.3418348431587219, + "learning_rate": 1.9988980540733746e-07, + "loss": 0.2823, + "step": 467 + }, + { + "epoch": 0.12453432676955828, + "grad_norm": 0.29337337613105774, + "learning_rate": 1.9988901136158723e-07, + "loss": 0.2663, + "step": 468 + }, + { + "epoch": 0.12480042575838211, + "grad_norm": 0.3927862346172333, + "learning_rate": 1.9988821446679939e-07, + "loss": 0.2687, + "step": 469 + }, + { + "epoch": 0.12506652474720595, + "grad_norm": 0.27148157358169556, + "learning_rate": 1.9988741472299674e-07, + "loss": 0.2497, + "step": 470 + }, + { + "epoch": 0.1253326237360298, + "grad_norm": 0.40071171522140503, + "learning_rate": 1.99886612130202e-07, + "loss": 0.294, + "step": 471 + }, + { + "epoch": 0.12559872272485365, + "grad_norm": 0.29318246245384216, + "learning_rate": 1.9988580668843808e-07, + "loss": 0.2531, + "step": 472 + }, + { + "epoch": 0.1258648217136775, + "grad_norm": 0.2981473505496979, + "learning_rate": 1.9988499839772804e-07, + "loss": 0.2767, + "step": 473 + }, + { + "epoch": 0.12613092070250134, + "grad_norm": 0.36356744170188904, + "learning_rate": 1.998841872580948e-07, + "loss": 0.2775, + "step": 474 + }, + { + "epoch": 0.12639701969132516, + "grad_norm": 0.2820228040218353, + "learning_rate": 1.998833732695616e-07, + "loss": 0.2641, + "step": 475 + }, + { + "epoch": 0.126663118680149, + "grad_norm": 0.3615041971206665, + "learning_rate": 1.9988255643215165e-07, + "loss": 0.2748, + "step": 476 + }, + { + "epoch": 0.12692921766897286, + "grad_norm": 0.30123406648635864, + "learning_rate": 1.9988173674588817e-07, + "loss": 0.2824, + "step": 477 + }, + { + "epoch": 0.1271953166577967, + "grad_norm": 0.3720749318599701, + "learning_rate": 1.9988091421079462e-07, + "loss": 0.2813, + "step": 478 + }, + { + "epoch": 0.12746141564662053, + "grad_norm": 0.34858235716819763, + "learning_rate": 1.998800888268944e-07, + "loss": 0.2789, + "step": 479 + }, + { + "epoch": 0.12772751463544438, + "grad_norm": 0.2779724895954132, + "learning_rate": 1.9987926059421109e-07, + "loss": 0.2453, + "step": 480 + }, + { + "epoch": 0.12799361362426823, + "grad_norm": 0.45524415373802185, + "learning_rate": 1.9987842951276831e-07, + "loss": 0.2672, + "step": 481 + }, + { + "epoch": 0.12825971261309208, + "grad_norm": 0.3548915386199951, + "learning_rate": 1.998775955825898e-07, + "loss": 0.2842, + "step": 482 + }, + { + "epoch": 0.12852581160191592, + "grad_norm": 0.299777090549469, + "learning_rate": 1.9987675880369922e-07, + "loss": 0.2879, + "step": 483 + }, + { + "epoch": 0.12879191059073974, + "grad_norm": 0.2981191873550415, + "learning_rate": 1.9987591917612058e-07, + "loss": 0.2663, + "step": 484 + }, + { + "epoch": 0.1290580095795636, + "grad_norm": 0.41283026337623596, + "learning_rate": 1.9987507669987775e-07, + "loss": 0.2908, + "step": 485 + }, + { + "epoch": 0.12932410856838744, + "grad_norm": 0.3170233368873596, + "learning_rate": 1.9987423137499477e-07, + "loss": 0.291, + "step": 486 + }, + { + "epoch": 0.1295902075572113, + "grad_norm": 0.41537386178970337, + "learning_rate": 1.998733832014958e-07, + "loss": 0.2819, + "step": 487 + }, + { + "epoch": 0.12985630654603514, + "grad_norm": 0.29995018243789673, + "learning_rate": 1.9987253217940494e-07, + "loss": 0.2635, + "step": 488 + }, + { + "epoch": 0.13012240553485896, + "grad_norm": 0.29771021008491516, + "learning_rate": 1.998716783087465e-07, + "loss": 0.2702, + "step": 489 + }, + { + "epoch": 0.1303885045236828, + "grad_norm": 0.3260813355445862, + "learning_rate": 1.9987082158954487e-07, + "loss": 0.2638, + "step": 490 + }, + { + "epoch": 0.13065460351250666, + "grad_norm": 0.29532191157341003, + "learning_rate": 1.9986996202182446e-07, + "loss": 0.2701, + "step": 491 + }, + { + "epoch": 0.1309207025013305, + "grad_norm": 0.4091576039791107, + "learning_rate": 1.9986909960560978e-07, + "loss": 0.2735, + "step": 492 + }, + { + "epoch": 0.13118680149015433, + "grad_norm": 0.33810868859291077, + "learning_rate": 1.9986823434092547e-07, + "loss": 0.2627, + "step": 493 + }, + { + "epoch": 0.13145290047897817, + "grad_norm": 0.26704245805740356, + "learning_rate": 1.9986736622779615e-07, + "loss": 0.2583, + "step": 494 + }, + { + "epoch": 0.13171899946780202, + "grad_norm": 0.2911052107810974, + "learning_rate": 1.998664952662466e-07, + "loss": 0.2559, + "step": 495 + }, + { + "epoch": 0.13198509845662587, + "grad_norm": 0.39633479714393616, + "learning_rate": 1.9986562145630165e-07, + "loss": 0.2592, + "step": 496 + }, + { + "epoch": 0.13225119744544972, + "grad_norm": 0.29811492562294006, + "learning_rate": 1.9986474479798624e-07, + "loss": 0.2655, + "step": 497 + }, + { + "epoch": 0.13251729643427354, + "grad_norm": 0.2911699116230011, + "learning_rate": 1.9986386529132542e-07, + "loss": 0.2848, + "step": 498 + }, + { + "epoch": 0.1327833954230974, + "grad_norm": 0.3210042417049408, + "learning_rate": 1.9986298293634422e-07, + "loss": 0.2736, + "step": 499 + }, + { + "epoch": 0.13304949441192124, + "grad_norm": 0.29652145504951477, + "learning_rate": 1.9986209773306776e-07, + "loss": 0.2716, + "step": 500 + }, + { + "epoch": 0.13331559340074509, + "grad_norm": 0.26504048705101013, + "learning_rate": 1.998612096815214e-07, + "loss": 0.2623, + "step": 501 + }, + { + "epoch": 0.1335816923895689, + "grad_norm": 0.2894943654537201, + "learning_rate": 1.998603187817304e-07, + "loss": 0.2664, + "step": 502 + }, + { + "epoch": 0.13384779137839276, + "grad_norm": 0.3921050727367401, + "learning_rate": 1.9985942503372017e-07, + "loss": 0.2929, + "step": 503 + }, + { + "epoch": 0.1341138903672166, + "grad_norm": 0.2646369934082031, + "learning_rate": 1.998585284375162e-07, + "loss": 0.2542, + "step": 504 + }, + { + "epoch": 0.13437998935604045, + "grad_norm": 0.36545535922050476, + "learning_rate": 1.9985762899314408e-07, + "loss": 0.2922, + "step": 505 + }, + { + "epoch": 0.1346460883448643, + "grad_norm": 0.27820590138435364, + "learning_rate": 1.998567267006295e-07, + "loss": 0.276, + "step": 506 + }, + { + "epoch": 0.13491218733368812, + "grad_norm": 0.33176538348197937, + "learning_rate": 1.9985582155999815e-07, + "loss": 0.283, + "step": 507 + }, + { + "epoch": 0.13517828632251197, + "grad_norm": 0.35292676091194153, + "learning_rate": 1.9985491357127583e-07, + "loss": 0.2815, + "step": 508 + }, + { + "epoch": 0.13544438531133582, + "grad_norm": 0.2919992208480835, + "learning_rate": 1.9985400273448844e-07, + "loss": 0.2671, + "step": 509 + }, + { + "epoch": 0.13571048430015967, + "grad_norm": 0.27904605865478516, + "learning_rate": 1.99853089049662e-07, + "loss": 0.2564, + "step": 510 + }, + { + "epoch": 0.1359765832889835, + "grad_norm": 0.29494863748550415, + "learning_rate": 1.9985217251682255e-07, + "loss": 0.2629, + "step": 511 + }, + { + "epoch": 0.13624268227780734, + "grad_norm": 0.4515877962112427, + "learning_rate": 1.9985125313599625e-07, + "loss": 0.297, + "step": 512 + }, + { + "epoch": 0.13650878126663119, + "grad_norm": 0.29135727882385254, + "learning_rate": 1.9985033090720928e-07, + "loss": 0.2648, + "step": 513 + }, + { + "epoch": 0.13677488025545503, + "grad_norm": 0.2975388169288635, + "learning_rate": 1.9984940583048798e-07, + "loss": 0.2683, + "step": 514 + }, + { + "epoch": 0.13704097924427888, + "grad_norm": 0.36475053429603577, + "learning_rate": 1.998484779058587e-07, + "loss": 0.2712, + "step": 515 + }, + { + "epoch": 0.1373070782331027, + "grad_norm": 0.2763751745223999, + "learning_rate": 1.9984754713334795e-07, + "loss": 0.2573, + "step": 516 + }, + { + "epoch": 0.13757317722192655, + "grad_norm": 0.27574220299720764, + "learning_rate": 1.9984661351298224e-07, + "loss": 0.258, + "step": 517 + }, + { + "epoch": 0.1378392762107504, + "grad_norm": 0.2974015474319458, + "learning_rate": 1.9984567704478824e-07, + "loss": 0.2675, + "step": 518 + }, + { + "epoch": 0.13810537519957425, + "grad_norm": 0.28329700231552124, + "learning_rate": 1.9984473772879263e-07, + "loss": 0.2417, + "step": 519 + }, + { + "epoch": 0.1383714741883981, + "grad_norm": 0.44777947664260864, + "learning_rate": 1.9984379556502217e-07, + "loss": 0.2808, + "step": 520 + }, + { + "epoch": 0.13863757317722192, + "grad_norm": 0.306633859872818, + "learning_rate": 1.9984285055350377e-07, + "loss": 0.2652, + "step": 521 + }, + { + "epoch": 0.13890367216604577, + "grad_norm": 0.2823948860168457, + "learning_rate": 1.9984190269426443e-07, + "loss": 0.2582, + "step": 522 + }, + { + "epoch": 0.13916977115486961, + "grad_norm": 0.3731010854244232, + "learning_rate": 1.9984095198733112e-07, + "loss": 0.2818, + "step": 523 + }, + { + "epoch": 0.13943587014369346, + "grad_norm": 0.29947665333747864, + "learning_rate": 1.9983999843273094e-07, + "loss": 0.2683, + "step": 524 + }, + { + "epoch": 0.13970196913251728, + "grad_norm": 0.3804115653038025, + "learning_rate": 1.9983904203049116e-07, + "loss": 0.2602, + "step": 525 + }, + { + "epoch": 0.13996806812134113, + "grad_norm": 0.36829912662506104, + "learning_rate": 1.99838082780639e-07, + "loss": 0.2871, + "step": 526 + }, + { + "epoch": 0.14023416711016498, + "grad_norm": 0.3121577203273773, + "learning_rate": 1.9983712068320185e-07, + "loss": 0.2782, + "step": 527 + }, + { + "epoch": 0.14050026609898883, + "grad_norm": 0.38161027431488037, + "learning_rate": 1.998361557382071e-07, + "loss": 0.2804, + "step": 528 + }, + { + "epoch": 0.14076636508781268, + "grad_norm": 0.2896275818347931, + "learning_rate": 1.9983518794568238e-07, + "loss": 0.2646, + "step": 529 + }, + { + "epoch": 0.1410324640766365, + "grad_norm": 0.3063912093639374, + "learning_rate": 1.9983421730565518e-07, + "loss": 0.2636, + "step": 530 + }, + { + "epoch": 0.14129856306546035, + "grad_norm": 0.294838547706604, + "learning_rate": 1.9983324381815322e-07, + "loss": 0.2711, + "step": 531 + }, + { + "epoch": 0.1415646620542842, + "grad_norm": 0.29148274660110474, + "learning_rate": 1.998322674832043e-07, + "loss": 0.2504, + "step": 532 + }, + { + "epoch": 0.14183076104310804, + "grad_norm": 0.279859334230423, + "learning_rate": 1.9983128830083622e-07, + "loss": 0.255, + "step": 533 + }, + { + "epoch": 0.14209686003193187, + "grad_norm": 0.45013055205345154, + "learning_rate": 1.9983030627107694e-07, + "loss": 0.2714, + "step": 534 + }, + { + "epoch": 0.1423629590207557, + "grad_norm": 0.39854344725608826, + "learning_rate": 1.9982932139395448e-07, + "loss": 0.2725, + "step": 535 + }, + { + "epoch": 0.14262905800957956, + "grad_norm": 0.36836305260658264, + "learning_rate": 1.998283336694969e-07, + "loss": 0.2807, + "step": 536 + }, + { + "epoch": 0.1428951569984034, + "grad_norm": 0.2999846637248993, + "learning_rate": 1.9982734309773236e-07, + "loss": 0.2644, + "step": 537 + }, + { + "epoch": 0.14316125598722726, + "grad_norm": 0.281830370426178, + "learning_rate": 1.9982634967868917e-07, + "loss": 0.2639, + "step": 538 + }, + { + "epoch": 0.14342735497605108, + "grad_norm": 0.3935796618461609, + "learning_rate": 1.998253534123956e-07, + "loss": 0.2772, + "step": 539 + }, + { + "epoch": 0.14369345396487493, + "grad_norm": 0.35637855529785156, + "learning_rate": 1.9982435429888012e-07, + "loss": 0.2894, + "step": 540 + }, + { + "epoch": 0.14395955295369878, + "grad_norm": 0.26878616213798523, + "learning_rate": 1.9982335233817122e-07, + "loss": 0.2624, + "step": 541 + }, + { + "epoch": 0.14422565194252263, + "grad_norm": 0.29366010427474976, + "learning_rate": 1.998223475302974e-07, + "loss": 0.2636, + "step": 542 + }, + { + "epoch": 0.14449175093134647, + "grad_norm": 0.26881447434425354, + "learning_rate": 1.998213398752874e-07, + "loss": 0.2652, + "step": 543 + }, + { + "epoch": 0.1447578499201703, + "grad_norm": 0.26117151975631714, + "learning_rate": 1.9982032937316997e-07, + "loss": 0.2454, + "step": 544 + }, + { + "epoch": 0.14502394890899414, + "grad_norm": 0.2754858136177063, + "learning_rate": 1.9981931602397387e-07, + "loss": 0.2549, + "step": 545 + }, + { + "epoch": 0.145290047897818, + "grad_norm": 0.30071139335632324, + "learning_rate": 1.9981829982772806e-07, + "loss": 0.2736, + "step": 546 + }, + { + "epoch": 0.14555614688664184, + "grad_norm": 0.3303591012954712, + "learning_rate": 1.998172807844615e-07, + "loss": 0.27, + "step": 547 + }, + { + "epoch": 0.14582224587546566, + "grad_norm": 0.29069647192955017, + "learning_rate": 1.9981625889420324e-07, + "loss": 0.2589, + "step": 548 + }, + { + "epoch": 0.1460883448642895, + "grad_norm": 0.3659944534301758, + "learning_rate": 1.9981523415698244e-07, + "loss": 0.261, + "step": 549 + }, + { + "epoch": 0.14635444385311336, + "grad_norm": 0.4061686098575592, + "learning_rate": 1.9981420657282832e-07, + "loss": 0.2833, + "step": 550 + }, + { + "epoch": 0.1466205428419372, + "grad_norm": 0.2714155912399292, + "learning_rate": 1.998131761417702e-07, + "loss": 0.2595, + "step": 551 + }, + { + "epoch": 0.14688664183076106, + "grad_norm": 0.28949451446533203, + "learning_rate": 1.9981214286383747e-07, + "loss": 0.2564, + "step": 552 + }, + { + "epoch": 0.14715274081958488, + "grad_norm": 0.3015984892845154, + "learning_rate": 1.9981110673905962e-07, + "loss": 0.2645, + "step": 553 + }, + { + "epoch": 0.14741883980840872, + "grad_norm": 0.2966475188732147, + "learning_rate": 1.9981006776746616e-07, + "loss": 0.2624, + "step": 554 + }, + { + "epoch": 0.14768493879723257, + "grad_norm": 0.2740791440010071, + "learning_rate": 1.9980902594908673e-07, + "loss": 0.2515, + "step": 555 + }, + { + "epoch": 0.14795103778605642, + "grad_norm": 0.6117942333221436, + "learning_rate": 1.998079812839511e-07, + "loss": 0.2743, + "step": 556 + }, + { + "epoch": 0.14821713677488024, + "grad_norm": 0.2709779143333435, + "learning_rate": 1.9980693377208899e-07, + "loss": 0.2531, + "step": 557 + }, + { + "epoch": 0.1484832357637041, + "grad_norm": 0.3370727002620697, + "learning_rate": 1.998058834135303e-07, + "loss": 0.2791, + "step": 558 + }, + { + "epoch": 0.14874933475252794, + "grad_norm": 0.30392852425575256, + "learning_rate": 1.9980483020830504e-07, + "loss": 0.2704, + "step": 559 + }, + { + "epoch": 0.1490154337413518, + "grad_norm": 0.3682747483253479, + "learning_rate": 1.9980377415644315e-07, + "loss": 0.2646, + "step": 560 + }, + { + "epoch": 0.14928153273017564, + "grad_norm": 0.2679588794708252, + "learning_rate": 1.9980271525797487e-07, + "loss": 0.2742, + "step": 561 + }, + { + "epoch": 0.14954763171899946, + "grad_norm": 0.3824721872806549, + "learning_rate": 1.9980165351293032e-07, + "loss": 0.2618, + "step": 562 + }, + { + "epoch": 0.1498137307078233, + "grad_norm": 0.2706497609615326, + "learning_rate": 1.998005889213398e-07, + "loss": 0.2515, + "step": 563 + }, + { + "epoch": 0.15007982969664715, + "grad_norm": 0.2904748320579529, + "learning_rate": 1.9979952148323366e-07, + "loss": 0.2523, + "step": 564 + }, + { + "epoch": 0.150345928685471, + "grad_norm": 0.38405290246009827, + "learning_rate": 1.997984511986424e-07, + "loss": 0.2639, + "step": 565 + }, + { + "epoch": 0.15061202767429482, + "grad_norm": 0.2828895151615143, + "learning_rate": 1.997973780675965e-07, + "loss": 0.2531, + "step": 566 + }, + { + "epoch": 0.15087812666311867, + "grad_norm": 0.36157354712486267, + "learning_rate": 1.997963020901266e-07, + "loss": 0.2691, + "step": 567 + }, + { + "epoch": 0.15114422565194252, + "grad_norm": 0.28727492690086365, + "learning_rate": 1.9979522326626334e-07, + "loss": 0.2735, + "step": 568 + }, + { + "epoch": 0.15141032464076637, + "grad_norm": 0.3641880452632904, + "learning_rate": 1.9979414159603753e-07, + "loss": 0.2648, + "step": 569 + }, + { + "epoch": 0.15167642362959022, + "grad_norm": 0.39625653624534607, + "learning_rate": 1.9979305707948e-07, + "loss": 0.2714, + "step": 570 + }, + { + "epoch": 0.15194252261841404, + "grad_norm": 0.2727203965187073, + "learning_rate": 1.997919697166217e-07, + "loss": 0.2571, + "step": 571 + }, + { + "epoch": 0.1522086216072379, + "grad_norm": 0.2892410159111023, + "learning_rate": 1.9979087950749363e-07, + "loss": 0.2629, + "step": 572 + }, + { + "epoch": 0.15247472059606174, + "grad_norm": 0.3681047260761261, + "learning_rate": 1.9978978645212693e-07, + "loss": 0.2637, + "step": 573 + }, + { + "epoch": 0.15274081958488558, + "grad_norm": 0.2716493010520935, + "learning_rate": 1.9978869055055268e-07, + "loss": 0.2539, + "step": 574 + }, + { + "epoch": 0.15300691857370943, + "grad_norm": 0.29301655292510986, + "learning_rate": 1.9978759180280222e-07, + "loss": 0.2463, + "step": 575 + }, + { + "epoch": 0.15327301756253325, + "grad_norm": 0.33298689126968384, + "learning_rate": 1.9978649020890685e-07, + "loss": 0.2739, + "step": 576 + }, + { + "epoch": 0.1535391165513571, + "grad_norm": 0.3069711923599243, + "learning_rate": 1.9978538576889803e-07, + "loss": 0.2593, + "step": 577 + }, + { + "epoch": 0.15380521554018095, + "grad_norm": 0.4051957130432129, + "learning_rate": 1.997842784828072e-07, + "loss": 0.2875, + "step": 578 + }, + { + "epoch": 0.1540713145290048, + "grad_norm": 0.2764022648334503, + "learning_rate": 1.99783168350666e-07, + "loss": 0.2541, + "step": 579 + }, + { + "epoch": 0.15433741351782862, + "grad_norm": 0.38543543219566345, + "learning_rate": 1.997820553725061e-07, + "loss": 0.2763, + "step": 580 + }, + { + "epoch": 0.15460351250665247, + "grad_norm": 0.2967700660228729, + "learning_rate": 1.9978093954835918e-07, + "loss": 0.2707, + "step": 581 + }, + { + "epoch": 0.15486961149547632, + "grad_norm": 0.28892672061920166, + "learning_rate": 1.997798208782571e-07, + "loss": 0.277, + "step": 582 + }, + { + "epoch": 0.15513571048430017, + "grad_norm": 0.2820899784564972, + "learning_rate": 1.9977869936223179e-07, + "loss": 0.2629, + "step": 583 + }, + { + "epoch": 0.15540180947312401, + "grad_norm": 0.3493090271949768, + "learning_rate": 1.9977757500031517e-07, + "loss": 0.2746, + "step": 584 + }, + { + "epoch": 0.15566790846194783, + "grad_norm": 0.2913517653942108, + "learning_rate": 1.9977644779253938e-07, + "loss": 0.2364, + "step": 585 + }, + { + "epoch": 0.15593400745077168, + "grad_norm": 0.29200026392936707, + "learning_rate": 1.9977531773893652e-07, + "loss": 0.2777, + "step": 586 + }, + { + "epoch": 0.15620010643959553, + "grad_norm": 0.3092879056930542, + "learning_rate": 1.9977418483953888e-07, + "loss": 0.2706, + "step": 587 + }, + { + "epoch": 0.15646620542841938, + "grad_norm": 0.2764892578125, + "learning_rate": 1.997730490943787e-07, + "loss": 0.2581, + "step": 588 + }, + { + "epoch": 0.1567323044172432, + "grad_norm": 0.2877793610095978, + "learning_rate": 1.9977191050348844e-07, + "loss": 0.2599, + "step": 589 + }, + { + "epoch": 0.15699840340606705, + "grad_norm": 0.3175089359283447, + "learning_rate": 1.9977076906690055e-07, + "loss": 0.2661, + "step": 590 + }, + { + "epoch": 0.1572645023948909, + "grad_norm": 0.2733308970928192, + "learning_rate": 1.9976962478464755e-07, + "loss": 0.2574, + "step": 591 + }, + { + "epoch": 0.15753060138371475, + "grad_norm": 0.4810810089111328, + "learning_rate": 1.9976847765676217e-07, + "loss": 0.264, + "step": 592 + }, + { + "epoch": 0.1577967003725386, + "grad_norm": 0.39847350120544434, + "learning_rate": 1.9976732768327703e-07, + "loss": 0.2639, + "step": 593 + }, + { + "epoch": 0.15806279936136242, + "grad_norm": 0.3023441731929779, + "learning_rate": 1.9976617486422492e-07, + "loss": 0.2719, + "step": 594 + }, + { + "epoch": 0.15832889835018626, + "grad_norm": 0.3295069634914398, + "learning_rate": 1.9976501919963884e-07, + "loss": 0.2507, + "step": 595 + }, + { + "epoch": 0.1585949973390101, + "grad_norm": 0.5789831876754761, + "learning_rate": 1.9976386068955166e-07, + "loss": 0.2798, + "step": 596 + }, + { + "epoch": 0.15886109632783396, + "grad_norm": 0.29260045289993286, + "learning_rate": 1.9976269933399643e-07, + "loss": 0.2947, + "step": 597 + }, + { + "epoch": 0.1591271953166578, + "grad_norm": 0.3529303967952728, + "learning_rate": 1.997615351330063e-07, + "loss": 0.2739, + "step": 598 + }, + { + "epoch": 0.15939329430548163, + "grad_norm": 0.3714059889316559, + "learning_rate": 1.9976036808661446e-07, + "loss": 0.2929, + "step": 599 + }, + { + "epoch": 0.15965939329430548, + "grad_norm": 0.3023780584335327, + "learning_rate": 1.9975919819485417e-07, + "loss": 0.2451, + "step": 600 + }, + { + "epoch": 0.15992549228312933, + "grad_norm": 0.30290576815605164, + "learning_rate": 1.9975802545775888e-07, + "loss": 0.2508, + "step": 601 + }, + { + "epoch": 0.16019159127195318, + "grad_norm": 0.3650478422641754, + "learning_rate": 1.9975684987536198e-07, + "loss": 0.2649, + "step": 602 + }, + { + "epoch": 0.160457690260777, + "grad_norm": 0.3479679822921753, + "learning_rate": 1.99755671447697e-07, + "loss": 0.277, + "step": 603 + }, + { + "epoch": 0.16072378924960085, + "grad_norm": 0.31108012795448303, + "learning_rate": 1.9975449017479754e-07, + "loss": 0.2615, + "step": 604 + }, + { + "epoch": 0.1609898882384247, + "grad_norm": 0.3882838189601898, + "learning_rate": 1.997533060566973e-07, + "loss": 0.2788, + "step": 605 + }, + { + "epoch": 0.16125598722724854, + "grad_norm": 0.2641850709915161, + "learning_rate": 1.9975211909343007e-07, + "loss": 0.2469, + "step": 606 + }, + { + "epoch": 0.1615220862160724, + "grad_norm": 0.27303367853164673, + "learning_rate": 1.997509292850297e-07, + "loss": 0.2531, + "step": 607 + }, + { + "epoch": 0.1617881852048962, + "grad_norm": 0.2904418706893921, + "learning_rate": 1.9974973663153013e-07, + "loss": 0.2608, + "step": 608 + }, + { + "epoch": 0.16205428419372006, + "grad_norm": 0.27185794711112976, + "learning_rate": 1.9974854113296535e-07, + "loss": 0.2446, + "step": 609 + }, + { + "epoch": 0.1623203831825439, + "grad_norm": 0.3863961100578308, + "learning_rate": 1.997473427893695e-07, + "loss": 0.2703, + "step": 610 + }, + { + "epoch": 0.16258648217136776, + "grad_norm": 0.2650785446166992, + "learning_rate": 1.9974614160077673e-07, + "loss": 0.2531, + "step": 611 + }, + { + "epoch": 0.16285258116019158, + "grad_norm": 0.4255432188510895, + "learning_rate": 1.9974493756722133e-07, + "loss": 0.299, + "step": 612 + }, + { + "epoch": 0.16311868014901543, + "grad_norm": 0.2784261405467987, + "learning_rate": 1.997437306887376e-07, + "loss": 0.2395, + "step": 613 + }, + { + "epoch": 0.16338477913783928, + "grad_norm": 0.31971561908721924, + "learning_rate": 1.9974252096535998e-07, + "loss": 0.2764, + "step": 614 + }, + { + "epoch": 0.16365087812666312, + "grad_norm": 0.36960962414741516, + "learning_rate": 1.9974130839712297e-07, + "loss": 0.2869, + "step": 615 + }, + { + "epoch": 0.16391697711548697, + "grad_norm": 0.27656710147857666, + "learning_rate": 1.9974009298406118e-07, + "loss": 0.2444, + "step": 616 + }, + { + "epoch": 0.1641830761043108, + "grad_norm": 0.28074246644973755, + "learning_rate": 1.9973887472620924e-07, + "loss": 0.239, + "step": 617 + }, + { + "epoch": 0.16444917509313464, + "grad_norm": 0.26590511202812195, + "learning_rate": 1.997376536236019e-07, + "loss": 0.2537, + "step": 618 + }, + { + "epoch": 0.1647152740819585, + "grad_norm": 0.3256346881389618, + "learning_rate": 1.9973642967627402e-07, + "loss": 0.2714, + "step": 619 + }, + { + "epoch": 0.16498137307078234, + "grad_norm": 0.277334064245224, + "learning_rate": 1.997352028842605e-07, + "loss": 0.2468, + "step": 620 + }, + { + "epoch": 0.16524747205960616, + "grad_norm": 0.2788625955581665, + "learning_rate": 1.9973397324759632e-07, + "loss": 0.2458, + "step": 621 + }, + { + "epoch": 0.16551357104843, + "grad_norm": 0.35713738203048706, + "learning_rate": 1.9973274076631655e-07, + "loss": 0.2625, + "step": 622 + }, + { + "epoch": 0.16577967003725386, + "grad_norm": 0.4811781942844391, + "learning_rate": 1.9973150544045635e-07, + "loss": 0.2663, + "step": 623 + }, + { + "epoch": 0.1660457690260777, + "grad_norm": 0.5305161476135254, + "learning_rate": 1.9973026727005093e-07, + "loss": 0.2769, + "step": 624 + }, + { + "epoch": 0.16631186801490155, + "grad_norm": 0.28298717737197876, + "learning_rate": 1.9972902625513563e-07, + "loss": 0.2426, + "step": 625 + }, + { + "epoch": 0.16657796700372537, + "grad_norm": 0.2827981114387512, + "learning_rate": 1.9972778239574587e-07, + "loss": 0.2581, + "step": 626 + }, + { + "epoch": 0.16684406599254922, + "grad_norm": 0.2779403030872345, + "learning_rate": 1.9972653569191705e-07, + "loss": 0.2538, + "step": 627 + }, + { + "epoch": 0.16711016498137307, + "grad_norm": 0.26734021306037903, + "learning_rate": 1.9972528614368485e-07, + "loss": 0.2331, + "step": 628 + }, + { + "epoch": 0.16737626397019692, + "grad_norm": 0.38003459572792053, + "learning_rate": 1.9972403375108477e-07, + "loss": 0.2571, + "step": 629 + }, + { + "epoch": 0.16764236295902077, + "grad_norm": 0.29563385248184204, + "learning_rate": 1.9972277851415264e-07, + "loss": 0.255, + "step": 630 + }, + { + "epoch": 0.1679084619478446, + "grad_norm": 0.27760669589042664, + "learning_rate": 1.997215204329242e-07, + "loss": 0.2446, + "step": 631 + }, + { + "epoch": 0.16817456093666844, + "grad_norm": 0.26777517795562744, + "learning_rate": 1.9972025950743537e-07, + "loss": 0.2463, + "step": 632 + }, + { + "epoch": 0.1684406599254923, + "grad_norm": 0.3182704448699951, + "learning_rate": 1.9971899573772207e-07, + "loss": 0.2516, + "step": 633 + }, + { + "epoch": 0.16870675891431614, + "grad_norm": 0.40940356254577637, + "learning_rate": 1.997177291238204e-07, + "loss": 0.3009, + "step": 634 + }, + { + "epoch": 0.16897285790313996, + "grad_norm": 0.3079686760902405, + "learning_rate": 1.9971645966576645e-07, + "loss": 0.274, + "step": 635 + }, + { + "epoch": 0.1692389568919638, + "grad_norm": 0.27568238973617554, + "learning_rate": 1.9971518736359644e-07, + "loss": 0.2658, + "step": 636 + }, + { + "epoch": 0.16950505588078765, + "grad_norm": 0.27102720737457275, + "learning_rate": 1.9971391221734665e-07, + "loss": 0.2379, + "step": 637 + }, + { + "epoch": 0.1697711548696115, + "grad_norm": 0.3103911280632019, + "learning_rate": 1.9971263422705346e-07, + "loss": 0.2436, + "step": 638 + }, + { + "epoch": 0.17003725385843535, + "grad_norm": 0.2507857382297516, + "learning_rate": 1.997113533927533e-07, + "loss": 0.2264, + "step": 639 + }, + { + "epoch": 0.17030335284725917, + "grad_norm": 0.3579860329627991, + "learning_rate": 1.9971006971448274e-07, + "loss": 0.2591, + "step": 640 + }, + { + "epoch": 0.17056945183608302, + "grad_norm": 0.26285746693611145, + "learning_rate": 1.9970878319227834e-07, + "loss": 0.247, + "step": 641 + }, + { + "epoch": 0.17083555082490687, + "grad_norm": 0.29574379324913025, + "learning_rate": 1.9970749382617687e-07, + "loss": 0.2563, + "step": 642 + }, + { + "epoch": 0.17110164981373072, + "grad_norm": 0.3502154052257538, + "learning_rate": 1.9970620161621503e-07, + "loss": 0.2539, + "step": 643 + }, + { + "epoch": 0.17136774880255454, + "grad_norm": 0.43258076906204224, + "learning_rate": 1.9970490656242972e-07, + "loss": 0.2661, + "step": 644 + }, + { + "epoch": 0.17163384779137839, + "grad_norm": 0.3362884521484375, + "learning_rate": 1.9970360866485787e-07, + "loss": 0.2724, + "step": 645 + }, + { + "epoch": 0.17189994678020223, + "grad_norm": 0.3555415868759155, + "learning_rate": 1.9970230792353649e-07, + "loss": 0.2566, + "step": 646 + }, + { + "epoch": 0.17216604576902608, + "grad_norm": 0.44382038712501526, + "learning_rate": 1.9970100433850265e-07, + "loss": 0.2737, + "step": 647 + }, + { + "epoch": 0.17243214475784993, + "grad_norm": 0.4515036344528198, + "learning_rate": 1.9969969790979362e-07, + "loss": 0.285, + "step": 648 + }, + { + "epoch": 0.17269824374667375, + "grad_norm": 0.294086217880249, + "learning_rate": 1.9969838863744658e-07, + "loss": 0.2626, + "step": 649 + }, + { + "epoch": 0.1729643427354976, + "grad_norm": 0.3243994414806366, + "learning_rate": 1.9969707652149887e-07, + "loss": 0.289, + "step": 650 + }, + { + "epoch": 0.17323044172432145, + "grad_norm": 0.28936415910720825, + "learning_rate": 1.99695761561988e-07, + "loss": 0.2596, + "step": 651 + }, + { + "epoch": 0.1734965407131453, + "grad_norm": 0.29560866951942444, + "learning_rate": 1.9969444375895137e-07, + "loss": 0.2628, + "step": 652 + }, + { + "epoch": 0.17376263970196915, + "grad_norm": 0.3728846311569214, + "learning_rate": 1.9969312311242664e-07, + "loss": 0.2664, + "step": 653 + }, + { + "epoch": 0.17402873869079297, + "grad_norm": 0.29041561484336853, + "learning_rate": 1.9969179962245145e-07, + "loss": 0.2553, + "step": 654 + }, + { + "epoch": 0.17429483767961682, + "grad_norm": 0.28993552923202515, + "learning_rate": 1.9969047328906355e-07, + "loss": 0.2573, + "step": 655 + }, + { + "epoch": 0.17456093666844066, + "grad_norm": 0.3117077648639679, + "learning_rate": 1.9968914411230077e-07, + "loss": 0.2661, + "step": 656 + }, + { + "epoch": 0.1748270356572645, + "grad_norm": 0.2776130735874176, + "learning_rate": 1.9968781209220103e-07, + "loss": 0.2642, + "step": 657 + }, + { + "epoch": 0.17509313464608833, + "grad_norm": 0.30899578332901, + "learning_rate": 1.9968647722880228e-07, + "loss": 0.2661, + "step": 658 + }, + { + "epoch": 0.17535923363491218, + "grad_norm": 0.27804604172706604, + "learning_rate": 1.9968513952214267e-07, + "loss": 0.2498, + "step": 659 + }, + { + "epoch": 0.17562533262373603, + "grad_norm": 0.28030118346214294, + "learning_rate": 1.996837989722603e-07, + "loss": 0.261, + "step": 660 + }, + { + "epoch": 0.17589143161255988, + "grad_norm": 0.284861296415329, + "learning_rate": 1.9968245557919338e-07, + "loss": 0.2552, + "step": 661 + }, + { + "epoch": 0.17615753060138373, + "grad_norm": 0.32939329743385315, + "learning_rate": 1.9968110934298031e-07, + "loss": 0.2615, + "step": 662 + }, + { + "epoch": 0.17642362959020755, + "grad_norm": 0.2841421365737915, + "learning_rate": 1.9967976026365941e-07, + "loss": 0.2388, + "step": 663 + }, + { + "epoch": 0.1766897285790314, + "grad_norm": 0.3230195641517639, + "learning_rate": 1.996784083412692e-07, + "loss": 0.2544, + "step": 664 + }, + { + "epoch": 0.17695582756785525, + "grad_norm": 0.2910357415676117, + "learning_rate": 1.9967705357584823e-07, + "loss": 0.2729, + "step": 665 + }, + { + "epoch": 0.1772219265566791, + "grad_norm": 0.27483272552490234, + "learning_rate": 1.9967569596743513e-07, + "loss": 0.2507, + "step": 666 + }, + { + "epoch": 0.17748802554550291, + "grad_norm": 0.42251864075660706, + "learning_rate": 1.9967433551606866e-07, + "loss": 0.2594, + "step": 667 + }, + { + "epoch": 0.17775412453432676, + "grad_norm": 0.2845243215560913, + "learning_rate": 1.9967297222178755e-07, + "loss": 0.2501, + "step": 668 + }, + { + "epoch": 0.1780202235231506, + "grad_norm": 0.39159730076789856, + "learning_rate": 1.9967160608463076e-07, + "loss": 0.282, + "step": 669 + }, + { + "epoch": 0.17828632251197446, + "grad_norm": 0.2958918809890747, + "learning_rate": 1.9967023710463718e-07, + "loss": 0.2547, + "step": 670 + }, + { + "epoch": 0.1785524215007983, + "grad_norm": 0.26856449246406555, + "learning_rate": 1.9966886528184595e-07, + "loss": 0.2576, + "step": 671 + }, + { + "epoch": 0.17881852048962213, + "grad_norm": 0.3487318456172943, + "learning_rate": 1.996674906162961e-07, + "loss": 0.2461, + "step": 672 + }, + { + "epoch": 0.17908461947844598, + "grad_norm": 0.315598726272583, + "learning_rate": 1.9966611310802692e-07, + "loss": 0.2813, + "step": 673 + }, + { + "epoch": 0.17935071846726983, + "grad_norm": 0.25112563371658325, + "learning_rate": 1.9966473275707764e-07, + "loss": 0.2476, + "step": 674 + }, + { + "epoch": 0.17961681745609367, + "grad_norm": 0.3997279703617096, + "learning_rate": 1.9966334956348766e-07, + "loss": 0.2967, + "step": 675 + }, + { + "epoch": 0.1798829164449175, + "grad_norm": 0.2824614346027374, + "learning_rate": 1.9966196352729644e-07, + "loss": 0.2576, + "step": 676 + }, + { + "epoch": 0.18014901543374134, + "grad_norm": 0.25081679224967957, + "learning_rate": 1.996605746485435e-07, + "loss": 0.2371, + "step": 677 + }, + { + "epoch": 0.1804151144225652, + "grad_norm": 0.2795896530151367, + "learning_rate": 1.9965918292726843e-07, + "loss": 0.2495, + "step": 678 + }, + { + "epoch": 0.18068121341138904, + "grad_norm": 0.5056897401809692, + "learning_rate": 1.9965778836351094e-07, + "loss": 0.2769, + "step": 679 + }, + { + "epoch": 0.1809473124002129, + "grad_norm": 0.37872955203056335, + "learning_rate": 1.9965639095731085e-07, + "loss": 0.2399, + "step": 680 + }, + { + "epoch": 0.1812134113890367, + "grad_norm": 0.2784816026687622, + "learning_rate": 1.9965499070870793e-07, + "loss": 0.2532, + "step": 681 + }, + { + "epoch": 0.18147951037786056, + "grad_norm": 0.38402166962623596, + "learning_rate": 1.996535876177422e-07, + "loss": 0.277, + "step": 682 + }, + { + "epoch": 0.1817456093666844, + "grad_norm": 0.25583958625793457, + "learning_rate": 1.9965218168445362e-07, + "loss": 0.2476, + "step": 683 + }, + { + "epoch": 0.18201170835550826, + "grad_norm": 0.3045964539051056, + "learning_rate": 1.9965077290888233e-07, + "loss": 0.2574, + "step": 684 + }, + { + "epoch": 0.1822778073443321, + "grad_norm": 0.2939528822898865, + "learning_rate": 1.9964936129106848e-07, + "loss": 0.2506, + "step": 685 + }, + { + "epoch": 0.18254390633315593, + "grad_norm": 0.28474655747413635, + "learning_rate": 1.9964794683105238e-07, + "loss": 0.2536, + "step": 686 + }, + { + "epoch": 0.18281000532197977, + "grad_norm": 0.3030225932598114, + "learning_rate": 1.9964652952887432e-07, + "loss": 0.2703, + "step": 687 + }, + { + "epoch": 0.18307610431080362, + "grad_norm": 0.37315258383750916, + "learning_rate": 1.9964510938457475e-07, + "loss": 0.2595, + "step": 688 + }, + { + "epoch": 0.18334220329962747, + "grad_norm": 0.3889746367931366, + "learning_rate": 1.9964368639819417e-07, + "loss": 0.2625, + "step": 689 + }, + { + "epoch": 0.1836083022884513, + "grad_norm": 0.2832851707935333, + "learning_rate": 1.9964226056977317e-07, + "loss": 0.2552, + "step": 690 + }, + { + "epoch": 0.18387440127727514, + "grad_norm": 0.30636727809906006, + "learning_rate": 1.996408318993524e-07, + "loss": 0.2643, + "step": 691 + }, + { + "epoch": 0.184140500266099, + "grad_norm": 0.3622879385948181, + "learning_rate": 1.9963940038697267e-07, + "loss": 0.2648, + "step": 692 + }, + { + "epoch": 0.18440659925492284, + "grad_norm": 0.5870465040206909, + "learning_rate": 1.9963796603267473e-07, + "loss": 0.29, + "step": 693 + }, + { + "epoch": 0.18467269824374669, + "grad_norm": 0.35307684540748596, + "learning_rate": 1.9963652883649952e-07, + "loss": 0.2618, + "step": 694 + }, + { + "epoch": 0.1849387972325705, + "grad_norm": 0.2739260792732239, + "learning_rate": 1.9963508879848806e-07, + "loss": 0.2447, + "step": 695 + }, + { + "epoch": 0.18520489622139436, + "grad_norm": 0.2875898480415344, + "learning_rate": 1.9963364591868138e-07, + "loss": 0.2565, + "step": 696 + }, + { + "epoch": 0.1854709952102182, + "grad_norm": 0.3422403335571289, + "learning_rate": 1.9963220019712067e-07, + "loss": 0.2604, + "step": 697 + }, + { + "epoch": 0.18573709419904205, + "grad_norm": 0.37042558193206787, + "learning_rate": 1.9963075163384714e-07, + "loss": 0.2486, + "step": 698 + }, + { + "epoch": 0.18600319318786587, + "grad_norm": 0.2808763384819031, + "learning_rate": 1.996293002289021e-07, + "loss": 0.2505, + "step": 699 + }, + { + "epoch": 0.18626929217668972, + "grad_norm": 0.3143659830093384, + "learning_rate": 1.99627845982327e-07, + "loss": 0.2782, + "step": 700 + }, + { + "epoch": 0.18653539116551357, + "grad_norm": 0.32909247279167175, + "learning_rate": 1.9962638889416323e-07, + "loss": 0.2614, + "step": 701 + }, + { + "epoch": 0.18680149015433742, + "grad_norm": 0.3620654344558716, + "learning_rate": 1.9962492896445243e-07, + "loss": 0.2572, + "step": 702 + }, + { + "epoch": 0.18706758914316127, + "grad_norm": 0.30487945675849915, + "learning_rate": 1.996234661932362e-07, + "loss": 0.2669, + "step": 703 + }, + { + "epoch": 0.1873336881319851, + "grad_norm": 0.35614174604415894, + "learning_rate": 1.9962200058055626e-07, + "loss": 0.2614, + "step": 704 + }, + { + "epoch": 0.18759978712080894, + "grad_norm": 0.26566216349601746, + "learning_rate": 1.9962053212645446e-07, + "loss": 0.2493, + "step": 705 + }, + { + "epoch": 0.18786588610963278, + "grad_norm": 0.272996723651886, + "learning_rate": 1.996190608309726e-07, + "loss": 0.2652, + "step": 706 + }, + { + "epoch": 0.18813198509845663, + "grad_norm": 0.25883203744888306, + "learning_rate": 1.996175866941527e-07, + "loss": 0.2416, + "step": 707 + }, + { + "epoch": 0.18839808408728048, + "grad_norm": 0.27542248368263245, + "learning_rate": 1.996161097160368e-07, + "loss": 0.256, + "step": 708 + }, + { + "epoch": 0.1886641830761043, + "grad_norm": 0.35482895374298096, + "learning_rate": 1.99614629896667e-07, + "loss": 0.2731, + "step": 709 + }, + { + "epoch": 0.18893028206492815, + "grad_norm": 0.3465169072151184, + "learning_rate": 1.9961314723608558e-07, + "loss": 0.2534, + "step": 710 + }, + { + "epoch": 0.189196381053752, + "grad_norm": 0.3601391017436981, + "learning_rate": 1.9961166173433473e-07, + "loss": 0.2556, + "step": 711 + }, + { + "epoch": 0.18946248004257585, + "grad_norm": 0.293433278799057, + "learning_rate": 1.996101733914569e-07, + "loss": 0.255, + "step": 712 + }, + { + "epoch": 0.18972857903139967, + "grad_norm": 0.3862440586090088, + "learning_rate": 1.9960868220749447e-07, + "loss": 0.2745, + "step": 713 + }, + { + "epoch": 0.18999467802022352, + "grad_norm": 0.2631433308124542, + "learning_rate": 1.9960718818249003e-07, + "loss": 0.2351, + "step": 714 + }, + { + "epoch": 0.19026077700904737, + "grad_norm": 0.3622463047504425, + "learning_rate": 1.9960569131648617e-07, + "loss": 0.2711, + "step": 715 + }, + { + "epoch": 0.19052687599787121, + "grad_norm": 0.2635989189147949, + "learning_rate": 1.996041916095256e-07, + "loss": 0.2367, + "step": 716 + }, + { + "epoch": 0.19079297498669506, + "grad_norm": 0.31338438391685486, + "learning_rate": 1.9960268906165108e-07, + "loss": 0.2535, + "step": 717 + }, + { + "epoch": 0.19105907397551888, + "grad_norm": 0.26627394556999207, + "learning_rate": 1.996011836729054e-07, + "loss": 0.2375, + "step": 718 + }, + { + "epoch": 0.19132517296434273, + "grad_norm": 0.26833274960517883, + "learning_rate": 1.9959967544333164e-07, + "loss": 0.2592, + "step": 719 + }, + { + "epoch": 0.19159127195316658, + "grad_norm": 0.36708346009254456, + "learning_rate": 1.9959816437297272e-07, + "loss": 0.2532, + "step": 720 + }, + { + "epoch": 0.19185737094199043, + "grad_norm": 0.31823667883872986, + "learning_rate": 1.9959665046187175e-07, + "loss": 0.2399, + "step": 721 + }, + { + "epoch": 0.19212346993081425, + "grad_norm": 0.2951697111129761, + "learning_rate": 1.9959513371007192e-07, + "loss": 0.2614, + "step": 722 + }, + { + "epoch": 0.1923895689196381, + "grad_norm": 0.2822766900062561, + "learning_rate": 1.995936141176165e-07, + "loss": 0.2488, + "step": 723 + }, + { + "epoch": 0.19265566790846195, + "grad_norm": 0.34977930784225464, + "learning_rate": 1.9959209168454883e-07, + "loss": 0.2764, + "step": 724 + }, + { + "epoch": 0.1929217668972858, + "grad_norm": 0.2734612226486206, + "learning_rate": 1.9959056641091227e-07, + "loss": 0.2441, + "step": 725 + }, + { + "epoch": 0.19318786588610964, + "grad_norm": 0.39709362387657166, + "learning_rate": 1.9958903829675046e-07, + "loss": 0.2562, + "step": 726 + }, + { + "epoch": 0.19345396487493347, + "grad_norm": 0.29345476627349854, + "learning_rate": 1.9958750734210683e-07, + "loss": 0.2695, + "step": 727 + }, + { + "epoch": 0.1937200638637573, + "grad_norm": 0.27561959624290466, + "learning_rate": 1.9958597354702516e-07, + "loss": 0.2531, + "step": 728 + }, + { + "epoch": 0.19398616285258116, + "grad_norm": 0.26222681999206543, + "learning_rate": 1.9958443691154913e-07, + "loss": 0.2508, + "step": 729 + }, + { + "epoch": 0.194252261841405, + "grad_norm": 0.3398798704147339, + "learning_rate": 1.9958289743572263e-07, + "loss": 0.2689, + "step": 730 + }, + { + "epoch": 0.19451836083022883, + "grad_norm": 0.31878355145454407, + "learning_rate": 1.9958135511958952e-07, + "loss": 0.2382, + "step": 731 + }, + { + "epoch": 0.19478445981905268, + "grad_norm": 0.252086877822876, + "learning_rate": 1.9957980996319376e-07, + "loss": 0.2424, + "step": 732 + }, + { + "epoch": 0.19505055880787653, + "grad_norm": 0.3768672049045563, + "learning_rate": 1.995782619665795e-07, + "loss": 0.2789, + "step": 733 + }, + { + "epoch": 0.19531665779670038, + "grad_norm": 0.324638307094574, + "learning_rate": 1.9957671112979086e-07, + "loss": 0.2456, + "step": 734 + }, + { + "epoch": 0.19558275678552423, + "grad_norm": 0.26149553060531616, + "learning_rate": 1.9957515745287207e-07, + "loss": 0.2519, + "step": 735 + }, + { + "epoch": 0.19584885577434805, + "grad_norm": 0.33309048414230347, + "learning_rate": 1.9957360093586745e-07, + "loss": 0.2627, + "step": 736 + }, + { + "epoch": 0.1961149547631719, + "grad_norm": 0.3820885419845581, + "learning_rate": 1.9957204157882134e-07, + "loss": 0.242, + "step": 737 + }, + { + "epoch": 0.19638105375199574, + "grad_norm": 0.26837751269340515, + "learning_rate": 1.995704793817783e-07, + "loss": 0.2474, + "step": 738 + }, + { + "epoch": 0.1966471527408196, + "grad_norm": 0.50078946352005, + "learning_rate": 1.9956891434478287e-07, + "loss": 0.2636, + "step": 739 + }, + { + "epoch": 0.19691325172964344, + "grad_norm": 0.38451623916625977, + "learning_rate": 1.9956734646787965e-07, + "loss": 0.255, + "step": 740 + }, + { + "epoch": 0.19717935071846726, + "grad_norm": 0.29793238639831543, + "learning_rate": 1.9956577575111338e-07, + "loss": 0.2718, + "step": 741 + }, + { + "epoch": 0.1974454497072911, + "grad_norm": 0.2693246006965637, + "learning_rate": 1.9956420219452886e-07, + "loss": 0.2527, + "step": 742 + }, + { + "epoch": 0.19771154869611496, + "grad_norm": 0.279423326253891, + "learning_rate": 1.9956262579817094e-07, + "loss": 0.231, + "step": 743 + }, + { + "epoch": 0.1979776476849388, + "grad_norm": 0.30782437324523926, + "learning_rate": 1.9956104656208464e-07, + "loss": 0.2659, + "step": 744 + }, + { + "epoch": 0.19824374667376263, + "grad_norm": 0.27930036187171936, + "learning_rate": 1.9955946448631498e-07, + "loss": 0.2402, + "step": 745 + }, + { + "epoch": 0.19850984566258648, + "grad_norm": 0.3018147051334381, + "learning_rate": 1.9955787957090707e-07, + "loss": 0.2825, + "step": 746 + }, + { + "epoch": 0.19877594465141032, + "grad_norm": 0.2659485340118408, + "learning_rate": 1.9955629181590611e-07, + "loss": 0.2608, + "step": 747 + }, + { + "epoch": 0.19904204364023417, + "grad_norm": 0.24569910764694214, + "learning_rate": 1.995547012213574e-07, + "loss": 0.2322, + "step": 748 + }, + { + "epoch": 0.19930814262905802, + "grad_norm": 0.3759925663471222, + "learning_rate": 1.9955310778730633e-07, + "loss": 0.2722, + "step": 749 + }, + { + "epoch": 0.19957424161788184, + "grad_norm": 0.280881404876709, + "learning_rate": 1.995515115137983e-07, + "loss": 0.2656, + "step": 750 + }, + { + "epoch": 0.1998403406067057, + "grad_norm": 0.25260522961616516, + "learning_rate": 1.9954991240087892e-07, + "loss": 0.2343, + "step": 751 + }, + { + "epoch": 0.20010643959552954, + "grad_norm": 0.33499664068222046, + "learning_rate": 1.9954831044859367e-07, + "loss": 0.2622, + "step": 752 + }, + { + "epoch": 0.2003725385843534, + "grad_norm": 0.2768414318561554, + "learning_rate": 1.9954670565698834e-07, + "loss": 0.2509, + "step": 753 + }, + { + "epoch": 0.2006386375731772, + "grad_norm": 0.29787296056747437, + "learning_rate": 1.995450980261087e-07, + "loss": 0.2498, + "step": 754 + }, + { + "epoch": 0.20090473656200106, + "grad_norm": 0.28048834204673767, + "learning_rate": 1.9954348755600055e-07, + "loss": 0.2494, + "step": 755 + }, + { + "epoch": 0.2011708355508249, + "grad_norm": 0.27570095658302307, + "learning_rate": 1.9954187424670985e-07, + "loss": 0.2605, + "step": 756 + }, + { + "epoch": 0.20143693453964875, + "grad_norm": 0.31790149211883545, + "learning_rate": 1.9954025809828264e-07, + "loss": 0.2781, + "step": 757 + }, + { + "epoch": 0.2017030335284726, + "grad_norm": 0.2716260850429535, + "learning_rate": 1.9953863911076495e-07, + "loss": 0.2478, + "step": 758 + }, + { + "epoch": 0.20196913251729642, + "grad_norm": 0.34793516993522644, + "learning_rate": 1.9953701728420306e-07, + "loss": 0.2341, + "step": 759 + }, + { + "epoch": 0.20223523150612027, + "grad_norm": 0.25665393471717834, + "learning_rate": 1.9953539261864315e-07, + "loss": 0.2488, + "step": 760 + }, + { + "epoch": 0.20250133049494412, + "grad_norm": 0.3799664378166199, + "learning_rate": 1.9953376511413156e-07, + "loss": 0.2711, + "step": 761 + }, + { + "epoch": 0.20276742948376797, + "grad_norm": 0.33906224370002747, + "learning_rate": 1.9953213477071472e-07, + "loss": 0.2474, + "step": 762 + }, + { + "epoch": 0.20303352847259182, + "grad_norm": 0.2634209394454956, + "learning_rate": 1.9953050158843917e-07, + "loss": 0.2522, + "step": 763 + }, + { + "epoch": 0.20329962746141564, + "grad_norm": 0.3686966896057129, + "learning_rate": 1.9952886556735143e-07, + "loss": 0.2431, + "step": 764 + }, + { + "epoch": 0.2035657264502395, + "grad_norm": 0.28738948702812195, + "learning_rate": 1.995272267074982e-07, + "loss": 0.2654, + "step": 765 + }, + { + "epoch": 0.20383182543906334, + "grad_norm": 0.36608338356018066, + "learning_rate": 1.9952558500892623e-07, + "loss": 0.2678, + "step": 766 + }, + { + "epoch": 0.20409792442788718, + "grad_norm": 0.4136664569377899, + "learning_rate": 1.9952394047168232e-07, + "loss": 0.2734, + "step": 767 + }, + { + "epoch": 0.204364023416711, + "grad_norm": 0.36350730061531067, + "learning_rate": 1.9952229309581336e-07, + "loss": 0.2718, + "step": 768 + }, + { + "epoch": 0.20463012240553485, + "grad_norm": 0.3663521409034729, + "learning_rate": 1.995206428813664e-07, + "loss": 0.243, + "step": 769 + }, + { + "epoch": 0.2048962213943587, + "grad_norm": 0.41200578212738037, + "learning_rate": 1.9951898982838844e-07, + "loss": 0.2761, + "step": 770 + }, + { + "epoch": 0.20516232038318255, + "grad_norm": 0.40094706416130066, + "learning_rate": 1.9951733393692666e-07, + "loss": 0.2796, + "step": 771 + }, + { + "epoch": 0.2054284193720064, + "grad_norm": 0.4159850776195526, + "learning_rate": 1.995156752070283e-07, + "loss": 0.2741, + "step": 772 + }, + { + "epoch": 0.20569451836083022, + "grad_norm": 0.310433954000473, + "learning_rate": 1.9951401363874068e-07, + "loss": 0.2573, + "step": 773 + }, + { + "epoch": 0.20596061734965407, + "grad_norm": 0.33902862668037415, + "learning_rate": 1.9951234923211113e-07, + "loss": 0.2487, + "step": 774 + }, + { + "epoch": 0.20622671633847792, + "grad_norm": 0.40634745359420776, + "learning_rate": 1.995106819871872e-07, + "loss": 0.2638, + "step": 775 + }, + { + "epoch": 0.20649281532730177, + "grad_norm": 0.37630584836006165, + "learning_rate": 1.9950901190401638e-07, + "loss": 0.2658, + "step": 776 + }, + { + "epoch": 0.20675891431612559, + "grad_norm": 0.4503554701805115, + "learning_rate": 1.995073389826463e-07, + "loss": 0.2555, + "step": 777 + }, + { + "epoch": 0.20702501330494943, + "grad_norm": 0.3218191862106323, + "learning_rate": 1.9950566322312477e-07, + "loss": 0.2747, + "step": 778 + }, + { + "epoch": 0.20729111229377328, + "grad_norm": 0.4482448697090149, + "learning_rate": 1.9950398462549945e-07, + "loss": 0.2823, + "step": 779 + }, + { + "epoch": 0.20755721128259713, + "grad_norm": 0.29434505105018616, + "learning_rate": 1.9950230318981833e-07, + "loss": 0.2468, + "step": 780 + }, + { + "epoch": 0.20782331027142098, + "grad_norm": 0.2769262194633484, + "learning_rate": 1.9950061891612931e-07, + "loss": 0.2477, + "step": 781 + }, + { + "epoch": 0.2080894092602448, + "grad_norm": 0.35178476572036743, + "learning_rate": 1.9949893180448044e-07, + "loss": 0.2553, + "step": 782 + }, + { + "epoch": 0.20835550824906865, + "grad_norm": 0.26828181743621826, + "learning_rate": 1.9949724185491986e-07, + "loss": 0.2414, + "step": 783 + }, + { + "epoch": 0.2086216072378925, + "grad_norm": 0.2761688232421875, + "learning_rate": 1.9949554906749575e-07, + "loss": 0.232, + "step": 784 + }, + { + "epoch": 0.20888770622671635, + "grad_norm": 0.27420803904533386, + "learning_rate": 1.9949385344225639e-07, + "loss": 0.2433, + "step": 785 + }, + { + "epoch": 0.20915380521554017, + "grad_norm": 0.27002257108688354, + "learning_rate": 1.9949215497925013e-07, + "loss": 0.2327, + "step": 786 + }, + { + "epoch": 0.20941990420436402, + "grad_norm": 0.31017088890075684, + "learning_rate": 1.9949045367852548e-07, + "loss": 0.2391, + "step": 787 + }, + { + "epoch": 0.20968600319318786, + "grad_norm": 0.42146462202072144, + "learning_rate": 1.994887495401309e-07, + "loss": 0.2689, + "step": 788 + }, + { + "epoch": 0.2099521021820117, + "grad_norm": 0.27292829751968384, + "learning_rate": 1.9948704256411498e-07, + "loss": 0.2601, + "step": 789 + }, + { + "epoch": 0.21021820117083556, + "grad_norm": 0.2711561322212219, + "learning_rate": 1.9948533275052646e-07, + "loss": 0.2451, + "step": 790 + }, + { + "epoch": 0.21048430015965938, + "grad_norm": 0.3529212772846222, + "learning_rate": 1.9948362009941407e-07, + "loss": 0.2459, + "step": 791 + }, + { + "epoch": 0.21075039914848323, + "grad_norm": 0.31752026081085205, + "learning_rate": 1.994819046108267e-07, + "loss": 0.2374, + "step": 792 + }, + { + "epoch": 0.21101649813730708, + "grad_norm": 0.36974087357521057, + "learning_rate": 1.9948018628481326e-07, + "loss": 0.2496, + "step": 793 + }, + { + "epoch": 0.21128259712613093, + "grad_norm": 0.41379228234291077, + "learning_rate": 1.9947846512142276e-07, + "loss": 0.2678, + "step": 794 + }, + { + "epoch": 0.21154869611495478, + "grad_norm": 0.4125928282737732, + "learning_rate": 1.9947674112070425e-07, + "loss": 0.2669, + "step": 795 + }, + { + "epoch": 0.2118147951037786, + "grad_norm": 0.27173087000846863, + "learning_rate": 1.9947501428270694e-07, + "loss": 0.2385, + "step": 796 + }, + { + "epoch": 0.21208089409260245, + "grad_norm": 0.2725726068019867, + "learning_rate": 1.9947328460748007e-07, + "loss": 0.2529, + "step": 797 + }, + { + "epoch": 0.2123469930814263, + "grad_norm": 0.3275291621685028, + "learning_rate": 1.9947155209507303e-07, + "loss": 0.2583, + "step": 798 + }, + { + "epoch": 0.21261309207025014, + "grad_norm": 0.3242279887199402, + "learning_rate": 1.9946981674553513e-07, + "loss": 0.2568, + "step": 799 + }, + { + "epoch": 0.21287919105907396, + "grad_norm": 0.2922271490097046, + "learning_rate": 1.9946807855891598e-07, + "loss": 0.2463, + "step": 800 + }, + { + "epoch": 0.2131452900478978, + "grad_norm": 0.3727880120277405, + "learning_rate": 1.9946633753526508e-07, + "loss": 0.242, + "step": 801 + }, + { + "epoch": 0.21341138903672166, + "grad_norm": 0.28755050897598267, + "learning_rate": 1.9946459367463212e-07, + "loss": 0.2363, + "step": 802 + }, + { + "epoch": 0.2136774880255455, + "grad_norm": 0.3311071991920471, + "learning_rate": 1.9946284697706676e-07, + "loss": 0.252, + "step": 803 + }, + { + "epoch": 0.21394358701436936, + "grad_norm": 0.28883907198905945, + "learning_rate": 1.9946109744261897e-07, + "loss": 0.2634, + "step": 804 + }, + { + "epoch": 0.21420968600319318, + "grad_norm": 0.2915334701538086, + "learning_rate": 1.9945934507133854e-07, + "loss": 0.2443, + "step": 805 + }, + { + "epoch": 0.21447578499201703, + "grad_norm": 0.2611197829246521, + "learning_rate": 1.9945758986327544e-07, + "loss": 0.2343, + "step": 806 + }, + { + "epoch": 0.21474188398084088, + "grad_norm": 0.29701587557792664, + "learning_rate": 1.9945583181847982e-07, + "loss": 0.2574, + "step": 807 + }, + { + "epoch": 0.21500798296966472, + "grad_norm": 0.29277706146240234, + "learning_rate": 1.9945407093700174e-07, + "loss": 0.2571, + "step": 808 + }, + { + "epoch": 0.21527408195848854, + "grad_norm": 0.30038145184516907, + "learning_rate": 1.9945230721889148e-07, + "loss": 0.2573, + "step": 809 + }, + { + "epoch": 0.2155401809473124, + "grad_norm": 0.27999162673950195, + "learning_rate": 1.994505406641993e-07, + "loss": 0.2545, + "step": 810 + }, + { + "epoch": 0.21580627993613624, + "grad_norm": 0.26733407378196716, + "learning_rate": 1.9944877127297563e-07, + "loss": 0.2453, + "step": 811 + }, + { + "epoch": 0.2160723789249601, + "grad_norm": 0.2810485363006592, + "learning_rate": 1.9944699904527092e-07, + "loss": 0.2514, + "step": 812 + }, + { + "epoch": 0.21633847791378394, + "grad_norm": 0.3030773401260376, + "learning_rate": 1.994452239811357e-07, + "loss": 0.2787, + "step": 813 + }, + { + "epoch": 0.21660457690260776, + "grad_norm": 0.2717123031616211, + "learning_rate": 1.9944344608062057e-07, + "loss": 0.259, + "step": 814 + }, + { + "epoch": 0.2168706758914316, + "grad_norm": 0.38534823060035706, + "learning_rate": 1.994416653437763e-07, + "loss": 0.2491, + "step": 815 + }, + { + "epoch": 0.21713677488025546, + "grad_norm": 0.4555160403251648, + "learning_rate": 1.994398817706537e-07, + "loss": 0.2474, + "step": 816 + }, + { + "epoch": 0.2174028738690793, + "grad_norm": 0.2692924439907074, + "learning_rate": 1.9943809536130356e-07, + "loss": 0.2361, + "step": 817 + }, + { + "epoch": 0.21766897285790315, + "grad_norm": 0.35904866456985474, + "learning_rate": 1.994363061157769e-07, + "loss": 0.2811, + "step": 818 + }, + { + "epoch": 0.21793507184672697, + "grad_norm": 0.34600400924682617, + "learning_rate": 1.994345140341247e-07, + "loss": 0.2359, + "step": 819 + }, + { + "epoch": 0.21820117083555082, + "grad_norm": 0.3305373191833496, + "learning_rate": 1.994327191163981e-07, + "loss": 0.2485, + "step": 820 + }, + { + "epoch": 0.21846726982437467, + "grad_norm": 0.47995635867118835, + "learning_rate": 1.994309213626483e-07, + "loss": 0.2938, + "step": 821 + }, + { + "epoch": 0.21873336881319852, + "grad_norm": 0.25949010252952576, + "learning_rate": 1.9942912077292658e-07, + "loss": 0.2385, + "step": 822 + }, + { + "epoch": 0.21899946780202234, + "grad_norm": 0.38834619522094727, + "learning_rate": 1.994273173472843e-07, + "loss": 0.25, + "step": 823 + }, + { + "epoch": 0.2192655667908462, + "grad_norm": 0.2804381549358368, + "learning_rate": 1.9942551108577283e-07, + "loss": 0.2571, + "step": 824 + }, + { + "epoch": 0.21953166577967004, + "grad_norm": 0.3059987425804138, + "learning_rate": 1.9942370198844375e-07, + "loss": 0.23, + "step": 825 + }, + { + "epoch": 0.2197977647684939, + "grad_norm": 0.28835874795913696, + "learning_rate": 1.9942189005534868e-07, + "loss": 0.2648, + "step": 826 + }, + { + "epoch": 0.22006386375731773, + "grad_norm": 0.2945844531059265, + "learning_rate": 1.9942007528653928e-07, + "loss": 0.2474, + "step": 827 + }, + { + "epoch": 0.22032996274614156, + "grad_norm": 0.3534849286079407, + "learning_rate": 1.9941825768206727e-07, + "loss": 0.2431, + "step": 828 + }, + { + "epoch": 0.2205960617349654, + "grad_norm": 0.4117734134197235, + "learning_rate": 1.9941643724198457e-07, + "loss": 0.268, + "step": 829 + }, + { + "epoch": 0.22086216072378925, + "grad_norm": 0.28515157103538513, + "learning_rate": 1.99414613966343e-07, + "loss": 0.2551, + "step": 830 + }, + { + "epoch": 0.2211282597126131, + "grad_norm": 0.5174008011817932, + "learning_rate": 1.9941278785519465e-07, + "loss": 0.2397, + "step": 831 + }, + { + "epoch": 0.22139435870143692, + "grad_norm": 0.38163965940475464, + "learning_rate": 1.994109589085916e-07, + "loss": 0.2638, + "step": 832 + }, + { + "epoch": 0.22166045769026077, + "grad_norm": 0.4592016637325287, + "learning_rate": 1.9940912712658592e-07, + "loss": 0.2395, + "step": 833 + }, + { + "epoch": 0.22192655667908462, + "grad_norm": 0.3037923276424408, + "learning_rate": 1.9940729250922997e-07, + "loss": 0.2518, + "step": 834 + }, + { + "epoch": 0.22219265566790847, + "grad_norm": 0.2610786557197571, + "learning_rate": 1.9940545505657602e-07, + "loss": 0.2345, + "step": 835 + }, + { + "epoch": 0.22245875465673232, + "grad_norm": 0.509770393371582, + "learning_rate": 1.9940361476867648e-07, + "loss": 0.2504, + "step": 836 + }, + { + "epoch": 0.22272485364555614, + "grad_norm": 0.39076530933380127, + "learning_rate": 1.9940177164558389e-07, + "loss": 0.2543, + "step": 837 + }, + { + "epoch": 0.22299095263437999, + "grad_norm": 0.2955693304538727, + "learning_rate": 1.9939992568735074e-07, + "loss": 0.252, + "step": 838 + }, + { + "epoch": 0.22325705162320383, + "grad_norm": 0.3050330877304077, + "learning_rate": 1.9939807689402973e-07, + "loss": 0.2586, + "step": 839 + }, + { + "epoch": 0.22352315061202768, + "grad_norm": 0.30963486433029175, + "learning_rate": 1.993962252656736e-07, + "loss": 0.2395, + "step": 840 + }, + { + "epoch": 0.2237892496008515, + "grad_norm": 0.273254930973053, + "learning_rate": 1.993943708023351e-07, + "loss": 0.2338, + "step": 841 + }, + { + "epoch": 0.22405534858967535, + "grad_norm": 0.26140329241752625, + "learning_rate": 1.9939251350406721e-07, + "loss": 0.2338, + "step": 842 + }, + { + "epoch": 0.2243214475784992, + "grad_norm": 0.27460214495658875, + "learning_rate": 1.9939065337092282e-07, + "loss": 0.2422, + "step": 843 + }, + { + "epoch": 0.22458754656732305, + "grad_norm": 0.34315168857574463, + "learning_rate": 1.9938879040295506e-07, + "loss": 0.2367, + "step": 844 + }, + { + "epoch": 0.2248536455561469, + "grad_norm": 0.2748246192932129, + "learning_rate": 1.9938692460021702e-07, + "loss": 0.2565, + "step": 845 + }, + { + "epoch": 0.22511974454497072, + "grad_norm": 0.27762386202812195, + "learning_rate": 1.9938505596276192e-07, + "loss": 0.2299, + "step": 846 + }, + { + "epoch": 0.22538584353379457, + "grad_norm": 0.3016430735588074, + "learning_rate": 1.9938318449064305e-07, + "loss": 0.2523, + "step": 847 + }, + { + "epoch": 0.22565194252261842, + "grad_norm": 0.3355666399002075, + "learning_rate": 1.9938131018391382e-07, + "loss": 0.2431, + "step": 848 + }, + { + "epoch": 0.22591804151144226, + "grad_norm": 0.29486897587776184, + "learning_rate": 1.9937943304262768e-07, + "loss": 0.2308, + "step": 849 + }, + { + "epoch": 0.2261841405002661, + "grad_norm": 0.24810025095939636, + "learning_rate": 1.9937755306683815e-07, + "loss": 0.2351, + "step": 850 + }, + { + "epoch": 0.22645023948908993, + "grad_norm": 0.2890107035636902, + "learning_rate": 1.9937567025659888e-07, + "loss": 0.2395, + "step": 851 + }, + { + "epoch": 0.22671633847791378, + "grad_norm": 0.27450260519981384, + "learning_rate": 1.993737846119635e-07, + "loss": 0.2386, + "step": 852 + }, + { + "epoch": 0.22698243746673763, + "grad_norm": 0.39107877016067505, + "learning_rate": 1.993718961329859e-07, + "loss": 0.2447, + "step": 853 + }, + { + "epoch": 0.22724853645556148, + "grad_norm": 0.3704221248626709, + "learning_rate": 1.9937000481971987e-07, + "loss": 0.2544, + "step": 854 + }, + { + "epoch": 0.2275146354443853, + "grad_norm": 0.2870044708251953, + "learning_rate": 1.9936811067221937e-07, + "loss": 0.2682, + "step": 855 + }, + { + "epoch": 0.22778073443320915, + "grad_norm": 0.282552570104599, + "learning_rate": 1.9936621369053844e-07, + "loss": 0.2541, + "step": 856 + }, + { + "epoch": 0.228046833422033, + "grad_norm": 0.30462607741355896, + "learning_rate": 1.9936431387473114e-07, + "loss": 0.2574, + "step": 857 + }, + { + "epoch": 0.22831293241085684, + "grad_norm": 0.284768283367157, + "learning_rate": 1.9936241122485172e-07, + "loss": 0.2374, + "step": 858 + }, + { + "epoch": 0.2285790313996807, + "grad_norm": 0.2656472623348236, + "learning_rate": 1.9936050574095442e-07, + "loss": 0.2445, + "step": 859 + }, + { + "epoch": 0.22884513038850451, + "grad_norm": 0.3471359610557556, + "learning_rate": 1.9935859742309357e-07, + "loss": 0.2552, + "step": 860 + }, + { + "epoch": 0.22911122937732836, + "grad_norm": 0.3710417151451111, + "learning_rate": 1.9935668627132363e-07, + "loss": 0.2749, + "step": 861 + }, + { + "epoch": 0.2293773283661522, + "grad_norm": 0.38368576765060425, + "learning_rate": 1.9935477228569907e-07, + "loss": 0.2468, + "step": 862 + }, + { + "epoch": 0.22964342735497606, + "grad_norm": 0.27314090728759766, + "learning_rate": 1.9935285546627454e-07, + "loss": 0.2667, + "step": 863 + }, + { + "epoch": 0.22990952634379988, + "grad_norm": 0.24558879435062408, + "learning_rate": 1.9935093581310464e-07, + "loss": 0.2347, + "step": 864 + }, + { + "epoch": 0.23017562533262373, + "grad_norm": 0.31700149178504944, + "learning_rate": 1.9934901332624418e-07, + "loss": 0.2663, + "step": 865 + }, + { + "epoch": 0.23044172432144758, + "grad_norm": 0.2694562077522278, + "learning_rate": 1.99347088005748e-07, + "loss": 0.2393, + "step": 866 + }, + { + "epoch": 0.23070782331027143, + "grad_norm": 0.4120158851146698, + "learning_rate": 1.9934515985167094e-07, + "loss": 0.2748, + "step": 867 + }, + { + "epoch": 0.23097392229909527, + "grad_norm": 0.2991825342178345, + "learning_rate": 1.9934322886406807e-07, + "loss": 0.2572, + "step": 868 + }, + { + "epoch": 0.2312400212879191, + "grad_norm": 0.2756354808807373, + "learning_rate": 1.993412950429944e-07, + "loss": 0.2348, + "step": 869 + }, + { + "epoch": 0.23150612027674294, + "grad_norm": 0.3253782093524933, + "learning_rate": 1.993393583885052e-07, + "loss": 0.2623, + "step": 870 + }, + { + "epoch": 0.2317722192655668, + "grad_norm": 0.2694564163684845, + "learning_rate": 1.9933741890065557e-07, + "loss": 0.2432, + "step": 871 + }, + { + "epoch": 0.23203831825439064, + "grad_norm": 0.282968133687973, + "learning_rate": 1.993354765795009e-07, + "loss": 0.2319, + "step": 872 + }, + { + "epoch": 0.2323044172432145, + "grad_norm": 0.33134496212005615, + "learning_rate": 1.9933353142509658e-07, + "loss": 0.2413, + "step": 873 + }, + { + "epoch": 0.2325705162320383, + "grad_norm": 0.23985326290130615, + "learning_rate": 1.993315834374981e-07, + "loss": 0.2197, + "step": 874 + }, + { + "epoch": 0.23283661522086216, + "grad_norm": 0.395430326461792, + "learning_rate": 1.9932963261676102e-07, + "loss": 0.2452, + "step": 875 + }, + { + "epoch": 0.233102714209686, + "grad_norm": 0.32083043456077576, + "learning_rate": 1.9932767896294093e-07, + "loss": 0.2359, + "step": 876 + }, + { + "epoch": 0.23336881319850986, + "grad_norm": 0.2866591215133667, + "learning_rate": 1.9932572247609363e-07, + "loss": 0.2459, + "step": 877 + }, + { + "epoch": 0.23363491218733368, + "grad_norm": 0.27823421359062195, + "learning_rate": 1.9932376315627488e-07, + "loss": 0.2636, + "step": 878 + }, + { + "epoch": 0.23390101117615753, + "grad_norm": 0.2870425879955292, + "learning_rate": 1.9932180100354055e-07, + "loss": 0.2526, + "step": 879 + }, + { + "epoch": 0.23416711016498137, + "grad_norm": 0.33135971426963806, + "learning_rate": 1.9931983601794662e-07, + "loss": 0.2367, + "step": 880 + }, + { + "epoch": 0.23443320915380522, + "grad_norm": 0.33267804980278015, + "learning_rate": 1.9931786819954917e-07, + "loss": 0.2397, + "step": 881 + }, + { + "epoch": 0.23469930814262907, + "grad_norm": 0.2880088686943054, + "learning_rate": 1.9931589754840426e-07, + "loss": 0.2621, + "step": 882 + }, + { + "epoch": 0.2349654071314529, + "grad_norm": 0.34121939539909363, + "learning_rate": 1.9931392406456818e-07, + "loss": 0.2558, + "step": 883 + }, + { + "epoch": 0.23523150612027674, + "grad_norm": 0.4095722734928131, + "learning_rate": 1.9931194774809711e-07, + "loss": 0.2436, + "step": 884 + }, + { + "epoch": 0.2354976051091006, + "grad_norm": 0.2789800465106964, + "learning_rate": 1.9930996859904753e-07, + "loss": 0.2476, + "step": 885 + }, + { + "epoch": 0.23576370409792444, + "grad_norm": 0.2827874422073364, + "learning_rate": 1.993079866174758e-07, + "loss": 0.2393, + "step": 886 + }, + { + "epoch": 0.23602980308674826, + "grad_norm": 0.2563740015029907, + "learning_rate": 1.9930600180343852e-07, + "loss": 0.2423, + "step": 887 + }, + { + "epoch": 0.2362959020755721, + "grad_norm": 0.25741058588027954, + "learning_rate": 1.9930401415699226e-07, + "loss": 0.2394, + "step": 888 + }, + { + "epoch": 0.23656200106439595, + "grad_norm": 0.2577243745326996, + "learning_rate": 1.993020236781937e-07, + "loss": 0.2298, + "step": 889 + }, + { + "epoch": 0.2368281000532198, + "grad_norm": 0.2773585021495819, + "learning_rate": 1.9930003036709965e-07, + "loss": 0.2379, + "step": 890 + }, + { + "epoch": 0.23709419904204365, + "grad_norm": 0.37622132897377014, + "learning_rate": 1.9929803422376695e-07, + "loss": 0.2637, + "step": 891 + }, + { + "epoch": 0.23736029803086747, + "grad_norm": 0.29460203647613525, + "learning_rate": 1.9929603524825253e-07, + "loss": 0.2358, + "step": 892 + }, + { + "epoch": 0.23762639701969132, + "grad_norm": 0.2692514657974243, + "learning_rate": 1.992940334406134e-07, + "loss": 0.2358, + "step": 893 + }, + { + "epoch": 0.23789249600851517, + "grad_norm": 0.319105863571167, + "learning_rate": 1.9929202880090666e-07, + "loss": 0.2619, + "step": 894 + }, + { + "epoch": 0.23815859499733902, + "grad_norm": 0.2673569321632385, + "learning_rate": 1.9929002132918948e-07, + "loss": 0.2396, + "step": 895 + }, + { + "epoch": 0.23842469398616284, + "grad_norm": 0.2809813916683197, + "learning_rate": 1.9928801102551916e-07, + "loss": 0.2615, + "step": 896 + }, + { + "epoch": 0.2386907929749867, + "grad_norm": 0.30797937512397766, + "learning_rate": 1.9928599788995294e-07, + "loss": 0.2515, + "step": 897 + }, + { + "epoch": 0.23895689196381054, + "grad_norm": 0.405829519033432, + "learning_rate": 1.9928398192254835e-07, + "loss": 0.2653, + "step": 898 + }, + { + "epoch": 0.23922299095263438, + "grad_norm": 0.30604761838912964, + "learning_rate": 1.9928196312336283e-07, + "loss": 0.2693, + "step": 899 + }, + { + "epoch": 0.23948908994145823, + "grad_norm": 0.2686694264411926, + "learning_rate": 1.9927994149245396e-07, + "loss": 0.2339, + "step": 900 + }, + { + "epoch": 0.23975518893028205, + "grad_norm": 0.3171556890010834, + "learning_rate": 1.9927791702987944e-07, + "loss": 0.252, + "step": 901 + }, + { + "epoch": 0.2400212879191059, + "grad_norm": 0.2840009033679962, + "learning_rate": 1.99275889735697e-07, + "loss": 0.2432, + "step": 902 + }, + { + "epoch": 0.24028738690792975, + "grad_norm": 0.2939351797103882, + "learning_rate": 1.992738596099644e-07, + "loss": 0.2563, + "step": 903 + }, + { + "epoch": 0.2405534858967536, + "grad_norm": 0.263685941696167, + "learning_rate": 1.992718266527396e-07, + "loss": 0.2408, + "step": 904 + }, + { + "epoch": 0.24081958488557745, + "grad_norm": 0.36283788084983826, + "learning_rate": 1.9926979086408059e-07, + "loss": 0.2607, + "step": 905 + }, + { + "epoch": 0.24108568387440127, + "grad_norm": 0.259039968252182, + "learning_rate": 1.9926775224404537e-07, + "loss": 0.2369, + "step": 906 + }, + { + "epoch": 0.24135178286322512, + "grad_norm": 0.3034122884273529, + "learning_rate": 1.992657107926922e-07, + "loss": 0.2593, + "step": 907 + }, + { + "epoch": 0.24161788185204897, + "grad_norm": 0.2852848768234253, + "learning_rate": 1.9926366651007917e-07, + "loss": 0.2274, + "step": 908 + }, + { + "epoch": 0.24188398084087281, + "grad_norm": 0.2901228070259094, + "learning_rate": 1.9926161939626472e-07, + "loss": 0.2274, + "step": 909 + }, + { + "epoch": 0.24215007982969664, + "grad_norm": 0.284610778093338, + "learning_rate": 1.9925956945130715e-07, + "loss": 0.2454, + "step": 910 + }, + { + "epoch": 0.24241617881852048, + "grad_norm": 0.2881953716278076, + "learning_rate": 1.9925751667526495e-07, + "loss": 0.248, + "step": 911 + }, + { + "epoch": 0.24268227780734433, + "grad_norm": 0.39585161209106445, + "learning_rate": 1.9925546106819669e-07, + "loss": 0.2798, + "step": 912 + }, + { + "epoch": 0.24294837679616818, + "grad_norm": 0.3008171021938324, + "learning_rate": 1.9925340263016098e-07, + "loss": 0.2493, + "step": 913 + }, + { + "epoch": 0.24321447578499203, + "grad_norm": 0.36570727825164795, + "learning_rate": 1.9925134136121652e-07, + "loss": 0.2809, + "step": 914 + }, + { + "epoch": 0.24348057477381585, + "grad_norm": 0.3720909655094147, + "learning_rate": 1.9924927726142212e-07, + "loss": 0.2398, + "step": 915 + }, + { + "epoch": 0.2437466737626397, + "grad_norm": 0.2846370339393616, + "learning_rate": 1.9924721033083667e-07, + "loss": 0.2575, + "step": 916 + }, + { + "epoch": 0.24401277275146355, + "grad_norm": 0.44456562399864197, + "learning_rate": 1.9924514056951905e-07, + "loss": 0.2442, + "step": 917 + }, + { + "epoch": 0.2442788717402874, + "grad_norm": 0.2809826135635376, + "learning_rate": 1.9924306797752842e-07, + "loss": 0.2499, + "step": 918 + }, + { + "epoch": 0.24454497072911122, + "grad_norm": 0.28263145685195923, + "learning_rate": 1.9924099255492376e-07, + "loss": 0.2569, + "step": 919 + }, + { + "epoch": 0.24481106971793506, + "grad_norm": 0.2605680525302887, + "learning_rate": 1.9923891430176436e-07, + "loss": 0.2385, + "step": 920 + }, + { + "epoch": 0.2450771687067589, + "grad_norm": 0.2898847162723541, + "learning_rate": 1.9923683321810945e-07, + "loss": 0.227, + "step": 921 + }, + { + "epoch": 0.24534326769558276, + "grad_norm": 0.3559992015361786, + "learning_rate": 1.9923474930401838e-07, + "loss": 0.2416, + "step": 922 + }, + { + "epoch": 0.2456093666844066, + "grad_norm": 0.2789672911167145, + "learning_rate": 1.992326625595506e-07, + "loss": 0.2375, + "step": 923 + }, + { + "epoch": 0.24587546567323043, + "grad_norm": 0.2505508363246918, + "learning_rate": 1.992305729847657e-07, + "loss": 0.2249, + "step": 924 + }, + { + "epoch": 0.24614156466205428, + "grad_norm": 0.3470998704433441, + "learning_rate": 1.9922848057972315e-07, + "loss": 0.2258, + "step": 925 + }, + { + "epoch": 0.24640766365087813, + "grad_norm": 0.3474298417568207, + "learning_rate": 1.992263853444827e-07, + "loss": 0.2585, + "step": 926 + }, + { + "epoch": 0.24667376263970198, + "grad_norm": 0.23795299232006073, + "learning_rate": 1.9922428727910413e-07, + "loss": 0.2136, + "step": 927 + }, + { + "epoch": 0.24693986162852583, + "grad_norm": 0.3459668755531311, + "learning_rate": 1.992221863836472e-07, + "loss": 0.2517, + "step": 928 + }, + { + "epoch": 0.24720596061734965, + "grad_norm": 0.4644218981266022, + "learning_rate": 1.9922008265817193e-07, + "loss": 0.2847, + "step": 929 + }, + { + "epoch": 0.2474720596061735, + "grad_norm": 0.42850929498672485, + "learning_rate": 1.9921797610273824e-07, + "loss": 0.2616, + "step": 930 + }, + { + "epoch": 0.24773815859499734, + "grad_norm": 0.29648324847221375, + "learning_rate": 1.992158667174063e-07, + "loss": 0.2698, + "step": 931 + }, + { + "epoch": 0.2480042575838212, + "grad_norm": 0.28132009506225586, + "learning_rate": 1.9921375450223618e-07, + "loss": 0.2248, + "step": 932 + }, + { + "epoch": 0.248270356572645, + "grad_norm": 0.2851416766643524, + "learning_rate": 1.9921163945728822e-07, + "loss": 0.2531, + "step": 933 + }, + { + "epoch": 0.24853645556146886, + "grad_norm": 0.28643369674682617, + "learning_rate": 1.9920952158262264e-07, + "loss": 0.2642, + "step": 934 + }, + { + "epoch": 0.2488025545502927, + "grad_norm": 0.34276825189590454, + "learning_rate": 1.9920740087829994e-07, + "loss": 0.2508, + "step": 935 + }, + { + "epoch": 0.24906865353911656, + "grad_norm": 0.2889997363090515, + "learning_rate": 1.9920527734438056e-07, + "loss": 0.2602, + "step": 936 + }, + { + "epoch": 0.2493347525279404, + "grad_norm": 0.26026439666748047, + "learning_rate": 1.9920315098092506e-07, + "loss": 0.2217, + "step": 937 + }, + { + "epoch": 0.24960085151676423, + "grad_norm": 0.3178027272224426, + "learning_rate": 1.9920102178799412e-07, + "loss": 0.239, + "step": 938 + }, + { + "epoch": 0.24986695050558808, + "grad_norm": 0.28510716557502747, + "learning_rate": 1.9919888976564844e-07, + "loss": 0.2409, + "step": 939 + }, + { + "epoch": 0.2501330494944119, + "grad_norm": 0.44172486662864685, + "learning_rate": 1.9919675491394888e-07, + "loss": 0.2785, + "step": 940 + }, + { + "epoch": 0.25039914848323575, + "grad_norm": 0.29704809188842773, + "learning_rate": 1.9919461723295626e-07, + "loss": 0.2778, + "step": 941 + }, + { + "epoch": 0.2506652474720596, + "grad_norm": 0.3210563361644745, + "learning_rate": 1.9919247672273156e-07, + "loss": 0.2288, + "step": 942 + }, + { + "epoch": 0.25093134646088344, + "grad_norm": 0.27060702443122864, + "learning_rate": 1.991903333833359e-07, + "loss": 0.2295, + "step": 943 + }, + { + "epoch": 0.2511974454497073, + "grad_norm": 0.28654518723487854, + "learning_rate": 1.9918818721483033e-07, + "loss": 0.2478, + "step": 944 + }, + { + "epoch": 0.25146354443853114, + "grad_norm": 0.2861230969429016, + "learning_rate": 1.9918603821727613e-07, + "loss": 0.2326, + "step": 945 + }, + { + "epoch": 0.251729643427355, + "grad_norm": 0.2991807460784912, + "learning_rate": 1.9918388639073453e-07, + "loss": 0.2614, + "step": 946 + }, + { + "epoch": 0.25199574241617884, + "grad_norm": 0.2638091444969177, + "learning_rate": 1.9918173173526694e-07, + "loss": 0.2441, + "step": 947 + }, + { + "epoch": 0.2522618414050027, + "grad_norm": 0.2837468683719635, + "learning_rate": 1.991795742509349e-07, + "loss": 0.2529, + "step": 948 + }, + { + "epoch": 0.2525279403938265, + "grad_norm": 0.26516032218933105, + "learning_rate": 1.9917741393779975e-07, + "loss": 0.2504, + "step": 949 + }, + { + "epoch": 0.2527940393826503, + "grad_norm": 0.2785818576812744, + "learning_rate": 1.9917525079592329e-07, + "loss": 0.2307, + "step": 950 + }, + { + "epoch": 0.2530601383714742, + "grad_norm": 0.2810152769088745, + "learning_rate": 1.9917308482536708e-07, + "loss": 0.2451, + "step": 951 + }, + { + "epoch": 0.253326237360298, + "grad_norm": 0.42845091223716736, + "learning_rate": 1.99170916026193e-07, + "loss": 0.25, + "step": 952 + }, + { + "epoch": 0.25359233634912187, + "grad_norm": 0.2771959900856018, + "learning_rate": 1.9916874439846288e-07, + "loss": 0.2432, + "step": 953 + }, + { + "epoch": 0.2538584353379457, + "grad_norm": 0.2796684801578522, + "learning_rate": 1.9916656994223863e-07, + "loss": 0.2322, + "step": 954 + }, + { + "epoch": 0.25412453432676957, + "grad_norm": 0.38984212279319763, + "learning_rate": 1.991643926575823e-07, + "loss": 0.2496, + "step": 955 + }, + { + "epoch": 0.2543906333155934, + "grad_norm": 0.25726065039634705, + "learning_rate": 1.9916221254455597e-07, + "loss": 0.2189, + "step": 956 + }, + { + "epoch": 0.25465673230441727, + "grad_norm": 0.26667508482933044, + "learning_rate": 1.9916002960322184e-07, + "loss": 0.2378, + "step": 957 + }, + { + "epoch": 0.25492283129324106, + "grad_norm": 0.35947397351264954, + "learning_rate": 1.9915784383364213e-07, + "loss": 0.2412, + "step": 958 + }, + { + "epoch": 0.2551889302820649, + "grad_norm": 0.3896760940551758, + "learning_rate": 1.9915565523587925e-07, + "loss": 0.2427, + "step": 959 + }, + { + "epoch": 0.25545502927088876, + "grad_norm": 0.2960081696510315, + "learning_rate": 1.991534638099956e-07, + "loss": 0.2445, + "step": 960 + }, + { + "epoch": 0.2557211282597126, + "grad_norm": 0.32988107204437256, + "learning_rate": 1.9915126955605363e-07, + "loss": 0.248, + "step": 961 + }, + { + "epoch": 0.25598722724853645, + "grad_norm": 0.29137200117111206, + "learning_rate": 1.99149072474116e-07, + "loss": 0.2485, + "step": 962 + }, + { + "epoch": 0.2562533262373603, + "grad_norm": 0.37254205346107483, + "learning_rate": 1.9914687256424535e-07, + "loss": 0.251, + "step": 963 + }, + { + "epoch": 0.25651942522618415, + "grad_norm": 0.27947887778282166, + "learning_rate": 1.991446698265044e-07, + "loss": 0.2344, + "step": 964 + }, + { + "epoch": 0.256785524215008, + "grad_norm": 0.43583324551582336, + "learning_rate": 1.99142464260956e-07, + "loss": 0.2556, + "step": 965 + }, + { + "epoch": 0.25705162320383185, + "grad_norm": 0.43384549021720886, + "learning_rate": 1.9914025586766306e-07, + "loss": 0.2676, + "step": 966 + }, + { + "epoch": 0.25731772219265564, + "grad_norm": 0.3103600740432739, + "learning_rate": 1.9913804464668853e-07, + "loss": 0.2529, + "step": 967 + }, + { + "epoch": 0.2575838211814795, + "grad_norm": 0.28094369173049927, + "learning_rate": 1.9913583059809553e-07, + "loss": 0.2392, + "step": 968 + }, + { + "epoch": 0.25784992017030334, + "grad_norm": 0.3024909198284149, + "learning_rate": 1.9913361372194719e-07, + "loss": 0.2651, + "step": 969 + }, + { + "epoch": 0.2581160191591272, + "grad_norm": 0.31036481261253357, + "learning_rate": 1.9913139401830672e-07, + "loss": 0.2596, + "step": 970 + }, + { + "epoch": 0.25838211814795103, + "grad_norm": 0.25491878390312195, + "learning_rate": 1.9912917148723749e-07, + "loss": 0.2261, + "step": 971 + }, + { + "epoch": 0.2586482171367749, + "grad_norm": 0.39177656173706055, + "learning_rate": 1.9912694612880282e-07, + "loss": 0.2679, + "step": 972 + }, + { + "epoch": 0.25891431612559873, + "grad_norm": 0.38983047008514404, + "learning_rate": 1.991247179430662e-07, + "loss": 0.2462, + "step": 973 + }, + { + "epoch": 0.2591804151144226, + "grad_norm": 0.4849991500377655, + "learning_rate": 1.991224869300912e-07, + "loss": 0.2421, + "step": 974 + }, + { + "epoch": 0.25944651410324643, + "grad_norm": 0.2819511890411377, + "learning_rate": 1.9912025308994144e-07, + "loss": 0.2567, + "step": 975 + }, + { + "epoch": 0.2597126130920703, + "grad_norm": 0.296725332736969, + "learning_rate": 1.9911801642268067e-07, + "loss": 0.2426, + "step": 976 + }, + { + "epoch": 0.25997871208089407, + "grad_norm": 0.3208054006099701, + "learning_rate": 1.9911577692837262e-07, + "loss": 0.2408, + "step": 977 + }, + { + "epoch": 0.2602448110697179, + "grad_norm": 0.25463271141052246, + "learning_rate": 1.9911353460708121e-07, + "loss": 0.2186, + "step": 978 + }, + { + "epoch": 0.26051091005854177, + "grad_norm": 0.37248602509498596, + "learning_rate": 1.9911128945887042e-07, + "loss": 0.2492, + "step": 979 + }, + { + "epoch": 0.2607770090473656, + "grad_norm": 0.3192874491214752, + "learning_rate": 1.9910904148380418e-07, + "loss": 0.256, + "step": 980 + }, + { + "epoch": 0.26104310803618946, + "grad_norm": 0.29471486806869507, + "learning_rate": 1.9910679068194675e-07, + "loss": 0.2382, + "step": 981 + }, + { + "epoch": 0.2613092070250133, + "grad_norm": 0.4337882697582245, + "learning_rate": 1.991045370533622e-07, + "loss": 0.2741, + "step": 982 + }, + { + "epoch": 0.26157530601383716, + "grad_norm": 0.2817007303237915, + "learning_rate": 1.991022805981149e-07, + "loss": 0.2544, + "step": 983 + }, + { + "epoch": 0.261841405002661, + "grad_norm": 0.27582797408103943, + "learning_rate": 1.9910002131626916e-07, + "loss": 0.2401, + "step": 984 + }, + { + "epoch": 0.26210750399148486, + "grad_norm": 0.33514779806137085, + "learning_rate": 1.9909775920788941e-07, + "loss": 0.2398, + "step": 985 + }, + { + "epoch": 0.26237360298030865, + "grad_norm": 0.39181673526763916, + "learning_rate": 1.990954942730402e-07, + "loss": 0.2438, + "step": 986 + }, + { + "epoch": 0.2626397019691325, + "grad_norm": 0.2938767373561859, + "learning_rate": 1.9909322651178614e-07, + "loss": 0.2438, + "step": 987 + }, + { + "epoch": 0.26290580095795635, + "grad_norm": 0.26392683386802673, + "learning_rate": 1.990909559241919e-07, + "loss": 0.2311, + "step": 988 + }, + { + "epoch": 0.2631718999467802, + "grad_norm": 0.30362942814826965, + "learning_rate": 1.9908868251032224e-07, + "loss": 0.2591, + "step": 989 + }, + { + "epoch": 0.26343799893560405, + "grad_norm": 0.34814777970314026, + "learning_rate": 1.9908640627024195e-07, + "loss": 0.2737, + "step": 990 + }, + { + "epoch": 0.2637040979244279, + "grad_norm": 0.2764028012752533, + "learning_rate": 1.99084127204016e-07, + "loss": 0.259, + "step": 991 + }, + { + "epoch": 0.26397019691325174, + "grad_norm": 0.2794129550457001, + "learning_rate": 1.9908184531170945e-07, + "loss": 0.2363, + "step": 992 + }, + { + "epoch": 0.2642362959020756, + "grad_norm": 0.38274744153022766, + "learning_rate": 1.990795605933873e-07, + "loss": 0.2544, + "step": 993 + }, + { + "epoch": 0.26450239489089944, + "grad_norm": 0.2865275740623474, + "learning_rate": 1.9907727304911472e-07, + "loss": 0.2529, + "step": 994 + }, + { + "epoch": 0.26476849387972323, + "grad_norm": 0.2817663252353668, + "learning_rate": 1.99074982678957e-07, + "loss": 0.2455, + "step": 995 + }, + { + "epoch": 0.2650345928685471, + "grad_norm": 0.26791873574256897, + "learning_rate": 1.9907268948297947e-07, + "loss": 0.2502, + "step": 996 + }, + { + "epoch": 0.26530069185737093, + "grad_norm": 0.2887971103191376, + "learning_rate": 1.9907039346124747e-07, + "loss": 0.2584, + "step": 997 + }, + { + "epoch": 0.2655667908461948, + "grad_norm": 0.28739675879478455, + "learning_rate": 1.9906809461382653e-07, + "loss": 0.2349, + "step": 998 + }, + { + "epoch": 0.2658328898350186, + "grad_norm": 0.4269833564758301, + "learning_rate": 1.9906579294078224e-07, + "loss": 0.2611, + "step": 999 + }, + { + "epoch": 0.2660989888238425, + "grad_norm": 0.28897422552108765, + "learning_rate": 1.990634884421802e-07, + "loss": 0.2509, + "step": 1000 + }, + { + "epoch": 0.2663650878126663, + "grad_norm": 0.2704593241214752, + "learning_rate": 1.990611811180862e-07, + "loss": 0.2407, + "step": 1001 + }, + { + "epoch": 0.26663118680149017, + "grad_norm": 0.28886815905570984, + "learning_rate": 1.9905887096856596e-07, + "loss": 0.2613, + "step": 1002 + }, + { + "epoch": 0.266897285790314, + "grad_norm": 0.39533570408821106, + "learning_rate": 1.9905655799368547e-07, + "loss": 0.255, + "step": 1003 + }, + { + "epoch": 0.2671633847791378, + "grad_norm": 0.40034371614456177, + "learning_rate": 1.990542421935106e-07, + "loss": 0.2437, + "step": 1004 + }, + { + "epoch": 0.26742948376796166, + "grad_norm": 0.26456382870674133, + "learning_rate": 1.990519235681075e-07, + "loss": 0.2363, + "step": 1005 + }, + { + "epoch": 0.2676955827567855, + "grad_norm": 0.4093647301197052, + "learning_rate": 1.9904960211754224e-07, + "loss": 0.2493, + "step": 1006 + }, + { + "epoch": 0.26796168174560936, + "grad_norm": 0.3216318190097809, + "learning_rate": 1.9904727784188105e-07, + "loss": 0.2693, + "step": 1007 + }, + { + "epoch": 0.2682277807344332, + "grad_norm": 0.27479955554008484, + "learning_rate": 1.9904495074119022e-07, + "loss": 0.2384, + "step": 1008 + }, + { + "epoch": 0.26849387972325706, + "grad_norm": 0.2972317337989807, + "learning_rate": 1.9904262081553612e-07, + "loss": 0.2569, + "step": 1009 + }, + { + "epoch": 0.2687599787120809, + "grad_norm": 0.2970508337020874, + "learning_rate": 1.9904028806498524e-07, + "loss": 0.2688, + "step": 1010 + }, + { + "epoch": 0.26902607770090475, + "grad_norm": 0.3678644597530365, + "learning_rate": 1.9903795248960407e-07, + "loss": 0.2487, + "step": 1011 + }, + { + "epoch": 0.2692921766897286, + "grad_norm": 0.3656746745109558, + "learning_rate": 1.990356140894592e-07, + "loss": 0.2694, + "step": 1012 + }, + { + "epoch": 0.2695582756785524, + "grad_norm": 0.3487870991230011, + "learning_rate": 1.9903327286461742e-07, + "loss": 0.2627, + "step": 1013 + }, + { + "epoch": 0.26982437466737624, + "grad_norm": 0.2627953588962555, + "learning_rate": 1.9903092881514545e-07, + "loss": 0.2199, + "step": 1014 + }, + { + "epoch": 0.2700904736562001, + "grad_norm": 0.302823007106781, + "learning_rate": 1.990285819411101e-07, + "loss": 0.2479, + "step": 1015 + }, + { + "epoch": 0.27035657264502394, + "grad_norm": 0.2843672037124634, + "learning_rate": 1.9902623224257842e-07, + "loss": 0.2497, + "step": 1016 + }, + { + "epoch": 0.2706226716338478, + "grad_norm": 0.3353700637817383, + "learning_rate": 1.990238797196173e-07, + "loss": 0.2695, + "step": 1017 + }, + { + "epoch": 0.27088877062267164, + "grad_norm": 0.35242921113967896, + "learning_rate": 1.9902152437229397e-07, + "loss": 0.2466, + "step": 1018 + }, + { + "epoch": 0.2711548696114955, + "grad_norm": 0.3977837860584259, + "learning_rate": 1.990191662006755e-07, + "loss": 0.264, + "step": 1019 + }, + { + "epoch": 0.27142096860031933, + "grad_norm": 0.2770446538925171, + "learning_rate": 1.990168052048292e-07, + "loss": 0.2426, + "step": 1020 + }, + { + "epoch": 0.2716870675891432, + "grad_norm": 0.27837538719177246, + "learning_rate": 1.9901444138482244e-07, + "loss": 0.2524, + "step": 1021 + }, + { + "epoch": 0.271953166577967, + "grad_norm": 0.28459715843200684, + "learning_rate": 1.9901207474072257e-07, + "loss": 0.245, + "step": 1022 + }, + { + "epoch": 0.2722192655667908, + "grad_norm": 0.33184945583343506, + "learning_rate": 1.9900970527259714e-07, + "loss": 0.246, + "step": 1023 + }, + { + "epoch": 0.2724853645556147, + "grad_norm": 0.27935367822647095, + "learning_rate": 1.990073329805137e-07, + "loss": 0.2625, + "step": 1024 + }, + { + "epoch": 0.2727514635444385, + "grad_norm": 0.3022770285606384, + "learning_rate": 1.9900495786453996e-07, + "loss": 0.2313, + "step": 1025 + }, + { + "epoch": 0.27301756253326237, + "grad_norm": 0.28502267599105835, + "learning_rate": 1.990025799247436e-07, + "loss": 0.2401, + "step": 1026 + }, + { + "epoch": 0.2732836615220862, + "grad_norm": 0.4865597188472748, + "learning_rate": 1.990001991611925e-07, + "loss": 0.2803, + "step": 1027 + }, + { + "epoch": 0.27354976051091007, + "grad_norm": 0.2665250897407532, + "learning_rate": 1.9899781557395454e-07, + "loss": 0.2357, + "step": 1028 + }, + { + "epoch": 0.2738158594997339, + "grad_norm": 0.270559698343277, + "learning_rate": 1.9899542916309771e-07, + "loss": 0.2408, + "step": 1029 + }, + { + "epoch": 0.27408195848855776, + "grad_norm": 0.26754871010780334, + "learning_rate": 1.9899303992869004e-07, + "loss": 0.2363, + "step": 1030 + }, + { + "epoch": 0.2743480574773816, + "grad_norm": 0.3351612091064453, + "learning_rate": 1.9899064787079974e-07, + "loss": 0.2376, + "step": 1031 + }, + { + "epoch": 0.2746141564662054, + "grad_norm": 0.4167374074459076, + "learning_rate": 1.9898825298949496e-07, + "loss": 0.2571, + "step": 1032 + }, + { + "epoch": 0.27488025545502925, + "grad_norm": 0.2795480787754059, + "learning_rate": 1.9898585528484411e-07, + "loss": 0.2683, + "step": 1033 + }, + { + "epoch": 0.2751463544438531, + "grad_norm": 0.40383830666542053, + "learning_rate": 1.9898345475691548e-07, + "loss": 0.2443, + "step": 1034 + }, + { + "epoch": 0.27541245343267695, + "grad_norm": 0.39139673113822937, + "learning_rate": 1.9898105140577762e-07, + "loss": 0.2346, + "step": 1035 + }, + { + "epoch": 0.2756785524215008, + "grad_norm": 0.3918757736682892, + "learning_rate": 1.9897864523149898e-07, + "loss": 0.2367, + "step": 1036 + }, + { + "epoch": 0.27594465141032465, + "grad_norm": 0.4421924352645874, + "learning_rate": 1.9897623623414829e-07, + "loss": 0.2449, + "step": 1037 + }, + { + "epoch": 0.2762107503991485, + "grad_norm": 0.2856324017047882, + "learning_rate": 1.989738244137942e-07, + "loss": 0.26, + "step": 1038 + }, + { + "epoch": 0.27647684938797235, + "grad_norm": 0.3339497745037079, + "learning_rate": 1.989714097705055e-07, + "loss": 0.2272, + "step": 1039 + }, + { + "epoch": 0.2767429483767962, + "grad_norm": 0.3285781145095825, + "learning_rate": 1.9896899230435107e-07, + "loss": 0.2461, + "step": 1040 + }, + { + "epoch": 0.27700904736562, + "grad_norm": 0.4388590455055237, + "learning_rate": 1.989665720153999e-07, + "loss": 0.2623, + "step": 1041 + }, + { + "epoch": 0.27727514635444384, + "grad_norm": 0.2731340229511261, + "learning_rate": 1.9896414890372096e-07, + "loss": 0.2471, + "step": 1042 + }, + { + "epoch": 0.2775412453432677, + "grad_norm": 0.35189107060432434, + "learning_rate": 1.989617229693834e-07, + "loss": 0.2456, + "step": 1043 + }, + { + "epoch": 0.27780734433209153, + "grad_norm": 0.38003209233283997, + "learning_rate": 1.9895929421245636e-07, + "loss": 0.2434, + "step": 1044 + }, + { + "epoch": 0.2780734433209154, + "grad_norm": 0.28535202145576477, + "learning_rate": 1.989568626330092e-07, + "loss": 0.2313, + "step": 1045 + }, + { + "epoch": 0.27833954230973923, + "grad_norm": 0.284348726272583, + "learning_rate": 1.989544282311112e-07, + "loss": 0.2182, + "step": 1046 + }, + { + "epoch": 0.2786056412985631, + "grad_norm": 0.46233558654785156, + "learning_rate": 1.9895199100683184e-07, + "loss": 0.2789, + "step": 1047 + }, + { + "epoch": 0.2788717402873869, + "grad_norm": 0.29136890172958374, + "learning_rate": 1.9894955096024062e-07, + "loss": 0.2228, + "step": 1048 + }, + { + "epoch": 0.2791378392762108, + "grad_norm": 0.38048720359802246, + "learning_rate": 1.9894710809140715e-07, + "loss": 0.2734, + "step": 1049 + }, + { + "epoch": 0.27940393826503457, + "grad_norm": 0.2637609839439392, + "learning_rate": 1.9894466240040103e-07, + "loss": 0.213, + "step": 1050 + }, + { + "epoch": 0.2796700372538584, + "grad_norm": 0.349360853433609, + "learning_rate": 1.989422138872921e-07, + "loss": 0.2446, + "step": 1051 + }, + { + "epoch": 0.27993613624268227, + "grad_norm": 0.31219759583473206, + "learning_rate": 1.9893976255215017e-07, + "loss": 0.2551, + "step": 1052 + }, + { + "epoch": 0.2802022352315061, + "grad_norm": 0.2642676830291748, + "learning_rate": 1.9893730839504516e-07, + "loss": 0.2367, + "step": 1053 + }, + { + "epoch": 0.28046833422032996, + "grad_norm": 0.27515485882759094, + "learning_rate": 1.9893485141604706e-07, + "loss": 0.2596, + "step": 1054 + }, + { + "epoch": 0.2807344332091538, + "grad_norm": 0.3807004690170288, + "learning_rate": 1.9893239161522596e-07, + "loss": 0.2704, + "step": 1055 + }, + { + "epoch": 0.28100053219797766, + "grad_norm": 0.3387758433818817, + "learning_rate": 1.98929928992652e-07, + "loss": 0.2591, + "step": 1056 + }, + { + "epoch": 0.2812666311868015, + "grad_norm": 0.2593544125556946, + "learning_rate": 1.9892746354839544e-07, + "loss": 0.2362, + "step": 1057 + }, + { + "epoch": 0.28153273017562536, + "grad_norm": 0.30490973591804504, + "learning_rate": 1.989249952825266e-07, + "loss": 0.2212, + "step": 1058 + }, + { + "epoch": 0.28179882916444915, + "grad_norm": 0.2600618004798889, + "learning_rate": 1.9892252419511585e-07, + "loss": 0.2335, + "step": 1059 + }, + { + "epoch": 0.282064928153273, + "grad_norm": 0.27362242341041565, + "learning_rate": 1.989200502862337e-07, + "loss": 0.2332, + "step": 1060 + }, + { + "epoch": 0.28233102714209685, + "grad_norm": 0.28175094723701477, + "learning_rate": 1.9891757355595068e-07, + "loss": 0.2265, + "step": 1061 + }, + { + "epoch": 0.2825971261309207, + "grad_norm": 0.39850422739982605, + "learning_rate": 1.9891509400433747e-07, + "loss": 0.2515, + "step": 1062 + }, + { + "epoch": 0.28286322511974454, + "grad_norm": 0.2637619376182556, + "learning_rate": 1.9891261163146478e-07, + "loss": 0.2223, + "step": 1063 + }, + { + "epoch": 0.2831293241085684, + "grad_norm": 0.28488636016845703, + "learning_rate": 1.989101264374034e-07, + "loss": 0.2523, + "step": 1064 + }, + { + "epoch": 0.28339542309739224, + "grad_norm": 0.32453325390815735, + "learning_rate": 1.9890763842222423e-07, + "loss": 0.2289, + "step": 1065 + }, + { + "epoch": 0.2836615220862161, + "grad_norm": 0.25465017557144165, + "learning_rate": 1.989051475859982e-07, + "loss": 0.217, + "step": 1066 + }, + { + "epoch": 0.28392762107503994, + "grad_norm": 0.2966037690639496, + "learning_rate": 1.989026539287964e-07, + "loss": 0.2473, + "step": 1067 + }, + { + "epoch": 0.28419372006386373, + "grad_norm": 0.33448290824890137, + "learning_rate": 1.989001574506899e-07, + "loss": 0.2283, + "step": 1068 + }, + { + "epoch": 0.2844598190526876, + "grad_norm": 0.2844521999359131, + "learning_rate": 1.9889765815174994e-07, + "loss": 0.2505, + "step": 1069 + }, + { + "epoch": 0.2847259180415114, + "grad_norm": 0.30147820711135864, + "learning_rate": 1.9889515603204783e-07, + "loss": 0.249, + "step": 1070 + }, + { + "epoch": 0.2849920170303353, + "grad_norm": 0.2872706651687622, + "learning_rate": 1.9889265109165488e-07, + "loss": 0.2449, + "step": 1071 + }, + { + "epoch": 0.2852581160191591, + "grad_norm": 0.27843016386032104, + "learning_rate": 1.9889014333064256e-07, + "loss": 0.2284, + "step": 1072 + }, + { + "epoch": 0.285524215007983, + "grad_norm": 0.29591119289398193, + "learning_rate": 1.988876327490824e-07, + "loss": 0.251, + "step": 1073 + }, + { + "epoch": 0.2857903139968068, + "grad_norm": 0.2585087716579437, + "learning_rate": 1.9888511934704603e-07, + "loss": 0.2272, + "step": 1074 + }, + { + "epoch": 0.28605641298563067, + "grad_norm": 0.3830554485321045, + "learning_rate": 1.988826031246051e-07, + "loss": 0.2574, + "step": 1075 + }, + { + "epoch": 0.2863225119744545, + "grad_norm": 0.2584632635116577, + "learning_rate": 1.9888008408183136e-07, + "loss": 0.2227, + "step": 1076 + }, + { + "epoch": 0.2865886109632783, + "grad_norm": 0.3509785830974579, + "learning_rate": 1.9887756221879673e-07, + "loss": 0.2391, + "step": 1077 + }, + { + "epoch": 0.28685470995210216, + "grad_norm": 0.39976945519447327, + "learning_rate": 1.9887503753557307e-07, + "loss": 0.2449, + "step": 1078 + }, + { + "epoch": 0.287120808940926, + "grad_norm": 0.37085631489753723, + "learning_rate": 1.9887251003223242e-07, + "loss": 0.2536, + "step": 1079 + }, + { + "epoch": 0.28738690792974986, + "grad_norm": 0.3498704135417938, + "learning_rate": 1.9886997970884686e-07, + "loss": 0.2449, + "step": 1080 + }, + { + "epoch": 0.2876530069185737, + "grad_norm": 0.29197415709495544, + "learning_rate": 1.9886744656548857e-07, + "loss": 0.2393, + "step": 1081 + }, + { + "epoch": 0.28791910590739755, + "grad_norm": 0.24352164566516876, + "learning_rate": 1.988649106022298e-07, + "loss": 0.2392, + "step": 1082 + }, + { + "epoch": 0.2881852048962214, + "grad_norm": 0.2701135277748108, + "learning_rate": 1.988623718191429e-07, + "loss": 0.2492, + "step": 1083 + }, + { + "epoch": 0.28845130388504525, + "grad_norm": 0.2516162097454071, + "learning_rate": 1.9885983021630023e-07, + "loss": 0.2261, + "step": 1084 + }, + { + "epoch": 0.2887174028738691, + "grad_norm": 0.5213042497634888, + "learning_rate": 1.988572857937743e-07, + "loss": 0.2418, + "step": 1085 + }, + { + "epoch": 0.28898350186269295, + "grad_norm": 0.28604787588119507, + "learning_rate": 1.988547385516377e-07, + "loss": 0.2467, + "step": 1086 + }, + { + "epoch": 0.28924960085151674, + "grad_norm": 0.37960880994796753, + "learning_rate": 1.988521884899631e-07, + "loss": 0.2488, + "step": 1087 + }, + { + "epoch": 0.2895156998403406, + "grad_norm": 0.29449301958084106, + "learning_rate": 1.9884963560882318e-07, + "loss": 0.2417, + "step": 1088 + }, + { + "epoch": 0.28978179882916444, + "grad_norm": 0.2663077116012573, + "learning_rate": 1.9884707990829077e-07, + "loss": 0.2327, + "step": 1089 + }, + { + "epoch": 0.2900478978179883, + "grad_norm": 0.30577293038368225, + "learning_rate": 1.988445213884388e-07, + "loss": 0.2344, + "step": 1090 + }, + { + "epoch": 0.29031399680681214, + "grad_norm": 0.3899892568588257, + "learning_rate": 1.988419600493402e-07, + "loss": 0.2636, + "step": 1091 + }, + { + "epoch": 0.290580095795636, + "grad_norm": 0.276457279920578, + "learning_rate": 1.9883939589106805e-07, + "loss": 0.2234, + "step": 1092 + }, + { + "epoch": 0.29084619478445983, + "grad_norm": 0.256149023771286, + "learning_rate": 1.9883682891369545e-07, + "loss": 0.2225, + "step": 1093 + }, + { + "epoch": 0.2911122937732837, + "grad_norm": 0.26826998591423035, + "learning_rate": 1.988342591172957e-07, + "loss": 0.2302, + "step": 1094 + }, + { + "epoch": 0.29137839276210753, + "grad_norm": 0.24989104270935059, + "learning_rate": 1.9883168650194203e-07, + "loss": 0.229, + "step": 1095 + }, + { + "epoch": 0.2916444917509313, + "grad_norm": 0.4090118408203125, + "learning_rate": 1.9882911106770778e-07, + "loss": 0.2514, + "step": 1096 + }, + { + "epoch": 0.29191059073975517, + "grad_norm": 0.2785210907459259, + "learning_rate": 1.9882653281466648e-07, + "loss": 0.2337, + "step": 1097 + }, + { + "epoch": 0.292176689728579, + "grad_norm": 0.5083312392234802, + "learning_rate": 1.9882395174289164e-07, + "loss": 0.2464, + "step": 1098 + }, + { + "epoch": 0.29244278871740287, + "grad_norm": 0.3242875933647156, + "learning_rate": 1.9882136785245688e-07, + "loss": 0.2434, + "step": 1099 + }, + { + "epoch": 0.2927088877062267, + "grad_norm": 0.2537277638912201, + "learning_rate": 1.988187811434359e-07, + "loss": 0.2214, + "step": 1100 + }, + { + "epoch": 0.29297498669505057, + "grad_norm": 0.35308974981307983, + "learning_rate": 1.9881619161590243e-07, + "loss": 0.2469, + "step": 1101 + }, + { + "epoch": 0.2932410856838744, + "grad_norm": 0.3564084470272064, + "learning_rate": 1.9881359926993042e-07, + "loss": 0.2571, + "step": 1102 + }, + { + "epoch": 0.29350718467269826, + "grad_norm": 0.3446555435657501, + "learning_rate": 1.9881100410559375e-07, + "loss": 0.2442, + "step": 1103 + }, + { + "epoch": 0.2937732836615221, + "grad_norm": 0.2850266993045807, + "learning_rate": 1.9880840612296643e-07, + "loss": 0.2515, + "step": 1104 + }, + { + "epoch": 0.2940393826503459, + "grad_norm": 0.25522270798683167, + "learning_rate": 1.988058053221226e-07, + "loss": 0.2253, + "step": 1105 + }, + { + "epoch": 0.29430548163916975, + "grad_norm": 0.2903016209602356, + "learning_rate": 1.9880320170313637e-07, + "loss": 0.2514, + "step": 1106 + }, + { + "epoch": 0.2945715806279936, + "grad_norm": 0.29639360308647156, + "learning_rate": 1.988005952660821e-07, + "loss": 0.2533, + "step": 1107 + }, + { + "epoch": 0.29483767961681745, + "grad_norm": 0.3241468667984009, + "learning_rate": 1.987979860110341e-07, + "loss": 0.2518, + "step": 1108 + }, + { + "epoch": 0.2951037786056413, + "grad_norm": 0.2846147119998932, + "learning_rate": 1.9879537393806673e-07, + "loss": 0.2605, + "step": 1109 + }, + { + "epoch": 0.29536987759446515, + "grad_norm": 0.2990715205669403, + "learning_rate": 1.9879275904725454e-07, + "loss": 0.2518, + "step": 1110 + }, + { + "epoch": 0.295635976583289, + "grad_norm": 0.4364882707595825, + "learning_rate": 1.9879014133867213e-07, + "loss": 0.2531, + "step": 1111 + }, + { + "epoch": 0.29590207557211284, + "grad_norm": 0.2937179505825043, + "learning_rate": 1.987875208123941e-07, + "loss": 0.2439, + "step": 1112 + }, + { + "epoch": 0.2961681745609367, + "grad_norm": 0.2704716920852661, + "learning_rate": 1.9878489746849526e-07, + "loss": 0.2242, + "step": 1113 + }, + { + "epoch": 0.2964342735497605, + "grad_norm": 0.3461718261241913, + "learning_rate": 1.9878227130705039e-07, + "loss": 0.2628, + "step": 1114 + }, + { + "epoch": 0.29670037253858433, + "grad_norm": 0.3081868886947632, + "learning_rate": 1.987796423281344e-07, + "loss": 0.2341, + "step": 1115 + }, + { + "epoch": 0.2969664715274082, + "grad_norm": 0.2797727882862091, + "learning_rate": 1.9877701053182228e-07, + "loss": 0.2461, + "step": 1116 + }, + { + "epoch": 0.29723257051623203, + "grad_norm": 0.3044676184654236, + "learning_rate": 1.987743759181891e-07, + "loss": 0.243, + "step": 1117 + }, + { + "epoch": 0.2974986695050559, + "grad_norm": 0.42542028427124023, + "learning_rate": 1.9877173848731005e-07, + "loss": 0.2471, + "step": 1118 + }, + { + "epoch": 0.29776476849387973, + "grad_norm": 0.2595873773097992, + "learning_rate": 1.9876909823926024e-07, + "loss": 0.2276, + "step": 1119 + }, + { + "epoch": 0.2980308674827036, + "grad_norm": 0.32811471819877625, + "learning_rate": 1.9876645517411506e-07, + "loss": 0.2348, + "step": 1120 + }, + { + "epoch": 0.2982969664715274, + "grad_norm": 0.3976075351238251, + "learning_rate": 1.9876380929194988e-07, + "loss": 0.2465, + "step": 1121 + }, + { + "epoch": 0.2985630654603513, + "grad_norm": 0.42125338315963745, + "learning_rate": 1.9876116059284015e-07, + "loss": 0.2573, + "step": 1122 + }, + { + "epoch": 0.29882916444917507, + "grad_norm": 0.32193294167518616, + "learning_rate": 1.9875850907686142e-07, + "loss": 0.2451, + "step": 1123 + }, + { + "epoch": 0.2990952634379989, + "grad_norm": 0.3108781576156616, + "learning_rate": 1.9875585474408934e-07, + "loss": 0.2067, + "step": 1124 + }, + { + "epoch": 0.29936136242682276, + "grad_norm": 0.40949511528015137, + "learning_rate": 1.987531975945996e-07, + "loss": 0.2564, + "step": 1125 + }, + { + "epoch": 0.2996274614156466, + "grad_norm": 0.346257746219635, + "learning_rate": 1.9875053762846797e-07, + "loss": 0.2715, + "step": 1126 + }, + { + "epoch": 0.29989356040447046, + "grad_norm": 0.2884940207004547, + "learning_rate": 1.9874787484577039e-07, + "loss": 0.24, + "step": 1127 + }, + { + "epoch": 0.3001596593932943, + "grad_norm": 0.2731996178627014, + "learning_rate": 1.9874520924658271e-07, + "loss": 0.2248, + "step": 1128 + }, + { + "epoch": 0.30042575838211816, + "grad_norm": 0.31430894136428833, + "learning_rate": 1.98742540830981e-07, + "loss": 0.2363, + "step": 1129 + }, + { + "epoch": 0.300691857370942, + "grad_norm": 0.2715678811073303, + "learning_rate": 1.9873986959904139e-07, + "loss": 0.2413, + "step": 1130 + }, + { + "epoch": 0.30095795635976585, + "grad_norm": 0.26191574335098267, + "learning_rate": 1.9873719555084004e-07, + "loss": 0.2311, + "step": 1131 + }, + { + "epoch": 0.30122405534858965, + "grad_norm": 0.29359009861946106, + "learning_rate": 1.9873451868645325e-07, + "loss": 0.2439, + "step": 1132 + }, + { + "epoch": 0.3014901543374135, + "grad_norm": 0.3338557183742523, + "learning_rate": 1.9873183900595732e-07, + "loss": 0.2513, + "step": 1133 + }, + { + "epoch": 0.30175625332623734, + "grad_norm": 0.2891707718372345, + "learning_rate": 1.987291565094287e-07, + "loss": 0.256, + "step": 1134 + }, + { + "epoch": 0.3020223523150612, + "grad_norm": 0.4443114697933197, + "learning_rate": 1.9872647119694394e-07, + "loss": 0.2401, + "step": 1135 + }, + { + "epoch": 0.30228845130388504, + "grad_norm": 0.27663522958755493, + "learning_rate": 1.987237830685796e-07, + "loss": 0.2292, + "step": 1136 + }, + { + "epoch": 0.3025545502927089, + "grad_norm": 0.2811214029788971, + "learning_rate": 1.9872109212441232e-07, + "loss": 0.2332, + "step": 1137 + }, + { + "epoch": 0.30282064928153274, + "grad_norm": 0.26474279165267944, + "learning_rate": 1.9871839836451892e-07, + "loss": 0.2209, + "step": 1138 + }, + { + "epoch": 0.3030867482703566, + "grad_norm": 0.31014472246170044, + "learning_rate": 1.9871570178897614e-07, + "loss": 0.2408, + "step": 1139 + }, + { + "epoch": 0.30335284725918044, + "grad_norm": 0.2939283847808838, + "learning_rate": 1.98713002397861e-07, + "loss": 0.2349, + "step": 1140 + }, + { + "epoch": 0.3036189462480043, + "grad_norm": 0.27223464846611023, + "learning_rate": 1.9871030019125042e-07, + "loss": 0.2327, + "step": 1141 + }, + { + "epoch": 0.3038850452368281, + "grad_norm": 0.2761884033679962, + "learning_rate": 1.9870759516922145e-07, + "loss": 0.2411, + "step": 1142 + }, + { + "epoch": 0.3041511442256519, + "grad_norm": 0.43909287452697754, + "learning_rate": 1.9870488733185135e-07, + "loss": 0.2517, + "step": 1143 + }, + { + "epoch": 0.3044172432144758, + "grad_norm": 0.2656424939632416, + "learning_rate": 1.9870217667921725e-07, + "loss": 0.2383, + "step": 1144 + }, + { + "epoch": 0.3046833422032996, + "grad_norm": 0.4072989225387573, + "learning_rate": 1.986994632113965e-07, + "loss": 0.2509, + "step": 1145 + }, + { + "epoch": 0.30494944119212347, + "grad_norm": 0.25848719477653503, + "learning_rate": 1.9869674692846648e-07, + "loss": 0.2357, + "step": 1146 + }, + { + "epoch": 0.3052155401809473, + "grad_norm": 0.3406834602355957, + "learning_rate": 1.986940278305047e-07, + "loss": 0.2689, + "step": 1147 + }, + { + "epoch": 0.30548163916977117, + "grad_norm": 0.40871942043304443, + "learning_rate": 1.9869130591758866e-07, + "loss": 0.251, + "step": 1148 + }, + { + "epoch": 0.305747738158595, + "grad_norm": 0.3423427939414978, + "learning_rate": 1.9868858118979608e-07, + "loss": 0.2468, + "step": 1149 + }, + { + "epoch": 0.30601383714741887, + "grad_norm": 0.37193742394447327, + "learning_rate": 1.9868585364720457e-07, + "loss": 0.2401, + "step": 1150 + }, + { + "epoch": 0.30627993613624266, + "grad_norm": 0.4089467525482178, + "learning_rate": 1.98683123289892e-07, + "loss": 0.2492, + "step": 1151 + }, + { + "epoch": 0.3065460351250665, + "grad_norm": 0.33883562684059143, + "learning_rate": 1.986803901179362e-07, + "loss": 0.2475, + "step": 1152 + }, + { + "epoch": 0.30681213411389036, + "grad_norm": 0.2692604064941406, + "learning_rate": 1.9867765413141514e-07, + "loss": 0.2321, + "step": 1153 + }, + { + "epoch": 0.3070782331027142, + "grad_norm": 0.2732444405555725, + "learning_rate": 1.986749153304069e-07, + "loss": 0.2322, + "step": 1154 + }, + { + "epoch": 0.30734433209153805, + "grad_norm": 0.2893897593021393, + "learning_rate": 1.9867217371498955e-07, + "loss": 0.2424, + "step": 1155 + }, + { + "epoch": 0.3076104310803619, + "grad_norm": 0.25848400592803955, + "learning_rate": 1.9866942928524127e-07, + "loss": 0.2438, + "step": 1156 + }, + { + "epoch": 0.30787653006918575, + "grad_norm": 0.2632853388786316, + "learning_rate": 1.9866668204124037e-07, + "loss": 0.2396, + "step": 1157 + }, + { + "epoch": 0.3081426290580096, + "grad_norm": 0.2933068871498108, + "learning_rate": 1.986639319830652e-07, + "loss": 0.2466, + "step": 1158 + }, + { + "epoch": 0.30840872804683345, + "grad_norm": 0.3404422700405121, + "learning_rate": 1.9866117911079415e-07, + "loss": 0.2531, + "step": 1159 + }, + { + "epoch": 0.30867482703565724, + "grad_norm": 0.45842599868774414, + "learning_rate": 1.9865842342450584e-07, + "loss": 0.2638, + "step": 1160 + }, + { + "epoch": 0.3089409260244811, + "grad_norm": 0.39002448320388794, + "learning_rate": 1.986556649242788e-07, + "loss": 0.2495, + "step": 1161 + }, + { + "epoch": 0.30920702501330494, + "grad_norm": 0.334973007440567, + "learning_rate": 1.9865290361019173e-07, + "loss": 0.2383, + "step": 1162 + }, + { + "epoch": 0.3094731240021288, + "grad_norm": 0.2802164554595947, + "learning_rate": 1.9865013948232337e-07, + "loss": 0.2419, + "step": 1163 + }, + { + "epoch": 0.30973922299095263, + "grad_norm": 0.34604015946388245, + "learning_rate": 1.9864737254075255e-07, + "loss": 0.2515, + "step": 1164 + }, + { + "epoch": 0.3100053219797765, + "grad_norm": 0.2399856597185135, + "learning_rate": 1.986446027855582e-07, + "loss": 0.2081, + "step": 1165 + }, + { + "epoch": 0.31027142096860033, + "grad_norm": 0.33351317048072815, + "learning_rate": 1.9864183021681934e-07, + "loss": 0.2507, + "step": 1166 + }, + { + "epoch": 0.3105375199574242, + "grad_norm": 0.3783053159713745, + "learning_rate": 1.9863905483461501e-07, + "loss": 0.2321, + "step": 1167 + }, + { + "epoch": 0.31080361894624803, + "grad_norm": 0.2589070796966553, + "learning_rate": 1.9863627663902442e-07, + "loss": 0.2209, + "step": 1168 + }, + { + "epoch": 0.3110697179350718, + "grad_norm": 0.3348328769207001, + "learning_rate": 1.9863349563012676e-07, + "loss": 0.237, + "step": 1169 + }, + { + "epoch": 0.31133581692389567, + "grad_norm": 0.38736069202423096, + "learning_rate": 1.986307118080014e-07, + "loss": 0.2437, + "step": 1170 + }, + { + "epoch": 0.3116019159127195, + "grad_norm": 0.3058469891548157, + "learning_rate": 1.986279251727277e-07, + "loss": 0.284, + "step": 1171 + }, + { + "epoch": 0.31186801490154337, + "grad_norm": 0.2809264063835144, + "learning_rate": 1.9862513572438515e-07, + "loss": 0.2583, + "step": 1172 + }, + { + "epoch": 0.3121341138903672, + "grad_norm": 0.3245799243450165, + "learning_rate": 1.9862234346305331e-07, + "loss": 0.2179, + "step": 1173 + }, + { + "epoch": 0.31240021287919106, + "grad_norm": 0.27183663845062256, + "learning_rate": 1.986195483888118e-07, + "loss": 0.24, + "step": 1174 + }, + { + "epoch": 0.3126663118680149, + "grad_norm": 0.2975230813026428, + "learning_rate": 1.986167505017404e-07, + "loss": 0.2555, + "step": 1175 + }, + { + "epoch": 0.31293241085683876, + "grad_norm": 0.343491792678833, + "learning_rate": 1.9861394980191887e-07, + "loss": 0.2414, + "step": 1176 + }, + { + "epoch": 0.3131985098456626, + "grad_norm": 0.26165151596069336, + "learning_rate": 1.986111462894271e-07, + "loss": 0.2439, + "step": 1177 + }, + { + "epoch": 0.3134646088344864, + "grad_norm": 0.26898881793022156, + "learning_rate": 1.9860833996434504e-07, + "loss": 0.2258, + "step": 1178 + }, + { + "epoch": 0.31373070782331025, + "grad_norm": 0.3008453845977783, + "learning_rate": 1.9860553082675274e-07, + "loss": 0.2158, + "step": 1179 + }, + { + "epoch": 0.3139968068121341, + "grad_norm": 0.2988293170928955, + "learning_rate": 1.9860271887673035e-07, + "loss": 0.2575, + "step": 1180 + }, + { + "epoch": 0.31426290580095795, + "grad_norm": 0.3814559578895569, + "learning_rate": 1.98599904114358e-07, + "loss": 0.2293, + "step": 1181 + }, + { + "epoch": 0.3145290047897818, + "grad_norm": 0.34655892848968506, + "learning_rate": 1.9859708653971606e-07, + "loss": 0.2437, + "step": 1182 + }, + { + "epoch": 0.31479510377860564, + "grad_norm": 0.35724177956581116, + "learning_rate": 1.9859426615288486e-07, + "loss": 0.254, + "step": 1183 + }, + { + "epoch": 0.3150612027674295, + "grad_norm": 0.28735587000846863, + "learning_rate": 1.985914429539448e-07, + "loss": 0.2306, + "step": 1184 + }, + { + "epoch": 0.31532730175625334, + "grad_norm": 0.31324246525764465, + "learning_rate": 1.9858861694297646e-07, + "loss": 0.2256, + "step": 1185 + }, + { + "epoch": 0.3155934007450772, + "grad_norm": 0.43429288268089294, + "learning_rate": 1.985857881200604e-07, + "loss": 0.2393, + "step": 1186 + }, + { + "epoch": 0.315859499733901, + "grad_norm": 0.2628338932991028, + "learning_rate": 1.9858295648527738e-07, + "loss": 0.2285, + "step": 1187 + }, + { + "epoch": 0.31612559872272483, + "grad_norm": 0.3281818628311157, + "learning_rate": 1.985801220387081e-07, + "loss": 0.242, + "step": 1188 + }, + { + "epoch": 0.3163916977115487, + "grad_norm": 0.27779731154441833, + "learning_rate": 1.9857728478043336e-07, + "loss": 0.2403, + "step": 1189 + }, + { + "epoch": 0.31665779670037253, + "grad_norm": 0.3537668287754059, + "learning_rate": 1.985744447105342e-07, + "loss": 0.2389, + "step": 1190 + }, + { + "epoch": 0.3169238956891964, + "grad_norm": 0.3461396396160126, + "learning_rate": 1.9857160182909151e-07, + "loss": 0.2516, + "step": 1191 + }, + { + "epoch": 0.3171899946780202, + "grad_norm": 0.41829103231430054, + "learning_rate": 1.9856875613618646e-07, + "loss": 0.2474, + "step": 1192 + }, + { + "epoch": 0.3174560936668441, + "grad_norm": 0.28589367866516113, + "learning_rate": 1.9856590763190017e-07, + "loss": 0.2285, + "step": 1193 + }, + { + "epoch": 0.3177221926556679, + "grad_norm": 0.2591603994369507, + "learning_rate": 1.985630563163139e-07, + "loss": 0.2292, + "step": 1194 + }, + { + "epoch": 0.31798829164449177, + "grad_norm": 0.2814132869243622, + "learning_rate": 1.9856020218950903e-07, + "loss": 0.2492, + "step": 1195 + }, + { + "epoch": 0.3182543906333156, + "grad_norm": 0.4022989869117737, + "learning_rate": 1.9855734525156686e-07, + "loss": 0.2498, + "step": 1196 + }, + { + "epoch": 0.3185204896221394, + "grad_norm": 0.28724825382232666, + "learning_rate": 1.9855448550256893e-07, + "loss": 0.236, + "step": 1197 + }, + { + "epoch": 0.31878658861096326, + "grad_norm": 0.3089914917945862, + "learning_rate": 1.985516229425968e-07, + "loss": 0.2284, + "step": 1198 + }, + { + "epoch": 0.3190526875997871, + "grad_norm": 0.36599433422088623, + "learning_rate": 1.9854875757173208e-07, + "loss": 0.2432, + "step": 1199 + }, + { + "epoch": 0.31931878658861096, + "grad_norm": 0.39699792861938477, + "learning_rate": 1.9854588939005659e-07, + "loss": 0.2517, + "step": 1200 + }, + { + "epoch": 0.3195848855774348, + "grad_norm": 0.34009531140327454, + "learning_rate": 1.9854301839765205e-07, + "loss": 0.2383, + "step": 1201 + }, + { + "epoch": 0.31985098456625866, + "grad_norm": 0.29752010107040405, + "learning_rate": 1.985401445946004e-07, + "loss": 0.2336, + "step": 1202 + }, + { + "epoch": 0.3201170835550825, + "grad_norm": 0.4611661732196808, + "learning_rate": 1.9853726798098355e-07, + "loss": 0.2345, + "step": 1203 + }, + { + "epoch": 0.32038318254390635, + "grad_norm": 1.182275414466858, + "learning_rate": 1.9853438855688364e-07, + "loss": 0.2355, + "step": 1204 + }, + { + "epoch": 0.3206492815327302, + "grad_norm": 0.28681719303131104, + "learning_rate": 1.9853150632238267e-07, + "loss": 0.2571, + "step": 1205 + }, + { + "epoch": 0.320915380521554, + "grad_norm": 0.3762682378292084, + "learning_rate": 1.9852862127756295e-07, + "loss": 0.2312, + "step": 1206 + }, + { + "epoch": 0.32118147951037784, + "grad_norm": 0.32047393918037415, + "learning_rate": 1.985257334225067e-07, + "loss": 0.2289, + "step": 1207 + }, + { + "epoch": 0.3214475784992017, + "grad_norm": 0.342648983001709, + "learning_rate": 1.9852284275729634e-07, + "loss": 0.2545, + "step": 1208 + }, + { + "epoch": 0.32171367748802554, + "grad_norm": 0.32587331533432007, + "learning_rate": 1.985199492820143e-07, + "loss": 0.2424, + "step": 1209 + }, + { + "epoch": 0.3219797764768494, + "grad_norm": 0.2663286626338959, + "learning_rate": 1.9851705299674312e-07, + "loss": 0.2204, + "step": 1210 + }, + { + "epoch": 0.32224587546567324, + "grad_norm": 0.2875593602657318, + "learning_rate": 1.9851415390156535e-07, + "loss": 0.2533, + "step": 1211 + }, + { + "epoch": 0.3225119744544971, + "grad_norm": 0.2678220868110657, + "learning_rate": 1.9851125199656376e-07, + "loss": 0.2498, + "step": 1212 + }, + { + "epoch": 0.32277807344332093, + "grad_norm": 0.3624131381511688, + "learning_rate": 1.9850834728182108e-07, + "loss": 0.2347, + "step": 1213 + }, + { + "epoch": 0.3230441724321448, + "grad_norm": 0.2754782438278198, + "learning_rate": 1.985054397574201e-07, + "loss": 0.2323, + "step": 1214 + }, + { + "epoch": 0.3233102714209686, + "grad_norm": 0.5733137726783752, + "learning_rate": 1.9850252942344387e-07, + "loss": 0.2596, + "step": 1215 + }, + { + "epoch": 0.3235763704097924, + "grad_norm": 0.2754276990890503, + "learning_rate": 1.984996162799753e-07, + "loss": 0.2299, + "step": 1216 + }, + { + "epoch": 0.3238424693986163, + "grad_norm": 0.3660556972026825, + "learning_rate": 1.984967003270975e-07, + "loss": 0.2623, + "step": 1217 + }, + { + "epoch": 0.3241085683874401, + "grad_norm": 0.30321964621543884, + "learning_rate": 1.984937815648937e-07, + "loss": 0.2578, + "step": 1218 + }, + { + "epoch": 0.32437466737626397, + "grad_norm": 0.2787325084209442, + "learning_rate": 1.9849085999344708e-07, + "loss": 0.212, + "step": 1219 + }, + { + "epoch": 0.3246407663650878, + "grad_norm": 0.3626425266265869, + "learning_rate": 1.9848793561284096e-07, + "loss": 0.2395, + "step": 1220 + }, + { + "epoch": 0.32490686535391167, + "grad_norm": 0.26095202565193176, + "learning_rate": 1.984850084231588e-07, + "loss": 0.2274, + "step": 1221 + }, + { + "epoch": 0.3251729643427355, + "grad_norm": 0.28032010793685913, + "learning_rate": 1.984820784244841e-07, + "loss": 0.2301, + "step": 1222 + }, + { + "epoch": 0.32543906333155936, + "grad_norm": 0.3426423966884613, + "learning_rate": 1.9847914561690038e-07, + "loss": 0.2347, + "step": 1223 + }, + { + "epoch": 0.32570516232038316, + "grad_norm": 0.34698575735092163, + "learning_rate": 1.9847621000049128e-07, + "loss": 0.2309, + "step": 1224 + }, + { + "epoch": 0.325971261309207, + "grad_norm": 0.37095773220062256, + "learning_rate": 1.9847327157534058e-07, + "loss": 0.2386, + "step": 1225 + }, + { + "epoch": 0.32623736029803085, + "grad_norm": 0.2585175037384033, + "learning_rate": 1.9847033034153203e-07, + "loss": 0.2356, + "step": 1226 + }, + { + "epoch": 0.3265034592868547, + "grad_norm": 0.2615545094013214, + "learning_rate": 1.9846738629914964e-07, + "loss": 0.2253, + "step": 1227 + }, + { + "epoch": 0.32676955827567855, + "grad_norm": 0.3090428113937378, + "learning_rate": 1.9846443944827723e-07, + "loss": 0.2281, + "step": 1228 + }, + { + "epoch": 0.3270356572645024, + "grad_norm": 0.28001368045806885, + "learning_rate": 1.9846148978899895e-07, + "loss": 0.2403, + "step": 1229 + }, + { + "epoch": 0.32730175625332625, + "grad_norm": 0.28535735607147217, + "learning_rate": 1.9845853732139885e-07, + "loss": 0.2294, + "step": 1230 + }, + { + "epoch": 0.3275678552421501, + "grad_norm": 0.40839967131614685, + "learning_rate": 1.9845558204556125e-07, + "loss": 0.27, + "step": 1231 + }, + { + "epoch": 0.32783395423097395, + "grad_norm": 0.30204492807388306, + "learning_rate": 1.9845262396157035e-07, + "loss": 0.2384, + "step": 1232 + }, + { + "epoch": 0.32810005321979774, + "grad_norm": 0.272452175617218, + "learning_rate": 1.9844966306951054e-07, + "loss": 0.2168, + "step": 1233 + }, + { + "epoch": 0.3283661522086216, + "grad_norm": 0.5890095829963684, + "learning_rate": 1.984466993694663e-07, + "loss": 0.2369, + "step": 1234 + }, + { + "epoch": 0.32863225119744544, + "grad_norm": 0.375940203666687, + "learning_rate": 1.9844373286152214e-07, + "loss": 0.2497, + "step": 1235 + }, + { + "epoch": 0.3288983501862693, + "grad_norm": 0.23878799378871918, + "learning_rate": 1.9844076354576263e-07, + "loss": 0.2139, + "step": 1236 + }, + { + "epoch": 0.32916444917509313, + "grad_norm": 0.38738298416137695, + "learning_rate": 1.9843779142227256e-07, + "loss": 0.2451, + "step": 1237 + }, + { + "epoch": 0.329430548163917, + "grad_norm": 0.3244592547416687, + "learning_rate": 1.984348164911366e-07, + "loss": 0.2486, + "step": 1238 + }, + { + "epoch": 0.32969664715274083, + "grad_norm": 0.6486718654632568, + "learning_rate": 1.9843183875243968e-07, + "loss": 0.2359, + "step": 1239 + }, + { + "epoch": 0.3299627461415647, + "grad_norm": 0.3045386075973511, + "learning_rate": 1.9842885820626668e-07, + "loss": 0.2293, + "step": 1240 + }, + { + "epoch": 0.3302288451303885, + "grad_norm": 0.3315276503562927, + "learning_rate": 1.9842587485270261e-07, + "loss": 0.2393, + "step": 1241 + }, + { + "epoch": 0.3304949441192123, + "grad_norm": 0.33492380380630493, + "learning_rate": 1.984228886918326e-07, + "loss": 0.2266, + "step": 1242 + }, + { + "epoch": 0.33076104310803617, + "grad_norm": 0.2655009627342224, + "learning_rate": 1.984198997237418e-07, + "loss": 0.2178, + "step": 1243 + }, + { + "epoch": 0.33102714209686, + "grad_norm": 0.39611390233039856, + "learning_rate": 1.9841690794851544e-07, + "loss": 0.2511, + "step": 1244 + }, + { + "epoch": 0.33129324108568386, + "grad_norm": 0.2692622244358063, + "learning_rate": 1.9841391336623888e-07, + "loss": 0.2355, + "step": 1245 + }, + { + "epoch": 0.3315593400745077, + "grad_norm": 0.39325591921806335, + "learning_rate": 1.9841091597699755e-07, + "loss": 0.2404, + "step": 1246 + }, + { + "epoch": 0.33182543906333156, + "grad_norm": 0.2930662930011749, + "learning_rate": 1.984079157808769e-07, + "loss": 0.2301, + "step": 1247 + }, + { + "epoch": 0.3320915380521554, + "grad_norm": 0.5139453411102295, + "learning_rate": 1.9840491277796253e-07, + "loss": 0.262, + "step": 1248 + }, + { + "epoch": 0.33235763704097926, + "grad_norm": 0.25215426087379456, + "learning_rate": 1.9840190696834004e-07, + "loss": 0.2135, + "step": 1249 + }, + { + "epoch": 0.3326237360298031, + "grad_norm": 0.38293230533599854, + "learning_rate": 1.9839889835209521e-07, + "loss": 0.2401, + "step": 1250 + }, + { + "epoch": 0.33288983501862696, + "grad_norm": 0.2997799515724182, + "learning_rate": 1.9839588692931387e-07, + "loss": 0.2375, + "step": 1251 + }, + { + "epoch": 0.33315593400745075, + "grad_norm": 0.37920552492141724, + "learning_rate": 1.9839287270008187e-07, + "loss": 0.2343, + "step": 1252 + }, + { + "epoch": 0.3334220329962746, + "grad_norm": 0.26824167370796204, + "learning_rate": 1.9838985566448522e-07, + "loss": 0.2234, + "step": 1253 + }, + { + "epoch": 0.33368813198509845, + "grad_norm": 0.30228057503700256, + "learning_rate": 1.9838683582260992e-07, + "loss": 0.265, + "step": 1254 + }, + { + "epoch": 0.3339542309739223, + "grad_norm": 0.422921746969223, + "learning_rate": 1.9838381317454215e-07, + "loss": 0.2441, + "step": 1255 + }, + { + "epoch": 0.33422032996274614, + "grad_norm": 0.26567646861076355, + "learning_rate": 1.983807877203681e-07, + "loss": 0.2415, + "step": 1256 + }, + { + "epoch": 0.33448642895157, + "grad_norm": 0.4637302756309509, + "learning_rate": 1.9837775946017403e-07, + "loss": 0.2611, + "step": 1257 + }, + { + "epoch": 0.33475252794039384, + "grad_norm": 0.2576817572116852, + "learning_rate": 1.9837472839404636e-07, + "loss": 0.2306, + "step": 1258 + }, + { + "epoch": 0.3350186269292177, + "grad_norm": 0.2782202959060669, + "learning_rate": 1.9837169452207156e-07, + "loss": 0.2211, + "step": 1259 + }, + { + "epoch": 0.33528472591804154, + "grad_norm": 0.2807655930519104, + "learning_rate": 1.9836865784433612e-07, + "loss": 0.2139, + "step": 1260 + }, + { + "epoch": 0.33555082490686533, + "grad_norm": 0.36444398760795593, + "learning_rate": 1.9836561836092665e-07, + "loss": 0.2584, + "step": 1261 + }, + { + "epoch": 0.3358169238956892, + "grad_norm": 0.2655319273471832, + "learning_rate": 1.9836257607192986e-07, + "loss": 0.2331, + "step": 1262 + }, + { + "epoch": 0.336083022884513, + "grad_norm": 0.2779974639415741, + "learning_rate": 1.9835953097743253e-07, + "loss": 0.2404, + "step": 1263 + }, + { + "epoch": 0.3363491218733369, + "grad_norm": 0.2753720283508301, + "learning_rate": 1.983564830775215e-07, + "loss": 0.2481, + "step": 1264 + }, + { + "epoch": 0.3366152208621607, + "grad_norm": 0.5397790670394897, + "learning_rate": 1.9835343237228367e-07, + "loss": 0.2621, + "step": 1265 + }, + { + "epoch": 0.3368813198509846, + "grad_norm": 0.27078601717948914, + "learning_rate": 1.9835037886180612e-07, + "loss": 0.2499, + "step": 1266 + }, + { + "epoch": 0.3371474188398084, + "grad_norm": 0.34134441614151, + "learning_rate": 1.983473225461759e-07, + "loss": 0.2428, + "step": 1267 + }, + { + "epoch": 0.33741351782863227, + "grad_norm": 0.38357624411582947, + "learning_rate": 1.9834426342548018e-07, + "loss": 0.2567, + "step": 1268 + }, + { + "epoch": 0.3376796168174561, + "grad_norm": 0.3238860070705414, + "learning_rate": 1.9834120149980622e-07, + "loss": 0.2443, + "step": 1269 + }, + { + "epoch": 0.3379457158062799, + "grad_norm": 0.4528186619281769, + "learning_rate": 1.983381367692414e-07, + "loss": 0.2272, + "step": 1270 + }, + { + "epoch": 0.33821181479510376, + "grad_norm": 0.3183150291442871, + "learning_rate": 1.9833506923387302e-07, + "loss": 0.2177, + "step": 1271 + }, + { + "epoch": 0.3384779137839276, + "grad_norm": 0.33499211072921753, + "learning_rate": 1.983319988937887e-07, + "loss": 0.2538, + "step": 1272 + }, + { + "epoch": 0.33874401277275146, + "grad_norm": 0.35164976119995117, + "learning_rate": 1.9832892574907588e-07, + "loss": 0.2363, + "step": 1273 + }, + { + "epoch": 0.3390101117615753, + "grad_norm": 0.34409791231155396, + "learning_rate": 1.9832584979982233e-07, + "loss": 0.2674, + "step": 1274 + }, + { + "epoch": 0.33927621075039915, + "grad_norm": 0.2808719575405121, + "learning_rate": 1.9832277104611571e-07, + "loss": 0.2323, + "step": 1275 + }, + { + "epoch": 0.339542309739223, + "grad_norm": 0.3820178210735321, + "learning_rate": 1.983196894880439e-07, + "loss": 0.2478, + "step": 1276 + }, + { + "epoch": 0.33980840872804685, + "grad_norm": 0.3464067280292511, + "learning_rate": 1.9831660512569472e-07, + "loss": 0.2301, + "step": 1277 + }, + { + "epoch": 0.3400745077168707, + "grad_norm": 0.300513356924057, + "learning_rate": 1.983135179591562e-07, + "loss": 0.2318, + "step": 1278 + }, + { + "epoch": 0.3403406067056945, + "grad_norm": 0.2634001672267914, + "learning_rate": 1.9831042798851635e-07, + "loss": 0.2588, + "step": 1279 + }, + { + "epoch": 0.34060670569451834, + "grad_norm": 0.4254695177078247, + "learning_rate": 1.983073352138633e-07, + "loss": 0.2452, + "step": 1280 + }, + { + "epoch": 0.3408728046833422, + "grad_norm": 0.3022894859313965, + "learning_rate": 1.983042396352853e-07, + "loss": 0.2416, + "step": 1281 + }, + { + "epoch": 0.34113890367216604, + "grad_norm": 0.29397931694984436, + "learning_rate": 1.983011412528706e-07, + "loss": 0.2409, + "step": 1282 + }, + { + "epoch": 0.3414050026609899, + "grad_norm": 0.2816607654094696, + "learning_rate": 1.9829804006670764e-07, + "loss": 0.2285, + "step": 1283 + }, + { + "epoch": 0.34167110164981374, + "grad_norm": 0.32810118794441223, + "learning_rate": 1.9829493607688478e-07, + "loss": 0.2272, + "step": 1284 + }, + { + "epoch": 0.3419372006386376, + "grad_norm": 0.2933768332004547, + "learning_rate": 1.9829182928349063e-07, + "loss": 0.2261, + "step": 1285 + }, + { + "epoch": 0.34220329962746143, + "grad_norm": 0.2598637342453003, + "learning_rate": 1.9828871968661374e-07, + "loss": 0.2203, + "step": 1286 + }, + { + "epoch": 0.3424693986162853, + "grad_norm": 1.1478180885314941, + "learning_rate": 1.9828560728634286e-07, + "loss": 0.2419, + "step": 1287 + }, + { + "epoch": 0.3427354976051091, + "grad_norm": 0.27756497263908386, + "learning_rate": 1.982824920827667e-07, + "loss": 0.2477, + "step": 1288 + }, + { + "epoch": 0.3430015965939329, + "grad_norm": 0.45165324211120605, + "learning_rate": 1.9827937407597418e-07, + "loss": 0.2301, + "step": 1289 + }, + { + "epoch": 0.34326769558275677, + "grad_norm": 0.37473976612091064, + "learning_rate": 1.982762532660542e-07, + "loss": 0.2464, + "step": 1290 + }, + { + "epoch": 0.3435337945715806, + "grad_norm": 0.31027883291244507, + "learning_rate": 1.9827312965309574e-07, + "loss": 0.2311, + "step": 1291 + }, + { + "epoch": 0.34379989356040447, + "grad_norm": 0.29478588700294495, + "learning_rate": 1.982700032371879e-07, + "loss": 0.2431, + "step": 1292 + }, + { + "epoch": 0.3440659925492283, + "grad_norm": 0.32737600803375244, + "learning_rate": 1.9826687401841995e-07, + "loss": 0.2375, + "step": 1293 + }, + { + "epoch": 0.34433209153805217, + "grad_norm": 0.3316459357738495, + "learning_rate": 1.9826374199688104e-07, + "loss": 0.2543, + "step": 1294 + }, + { + "epoch": 0.344598190526876, + "grad_norm": 0.35218504071235657, + "learning_rate": 1.982606071726605e-07, + "loss": 0.2347, + "step": 1295 + }, + { + "epoch": 0.34486428951569986, + "grad_norm": 0.2726024091243744, + "learning_rate": 1.9825746954584776e-07, + "loss": 0.2323, + "step": 1296 + }, + { + "epoch": 0.34513038850452366, + "grad_norm": 0.3437001705169678, + "learning_rate": 1.9825432911653236e-07, + "loss": 0.2561, + "step": 1297 + }, + { + "epoch": 0.3453964874933475, + "grad_norm": 0.27894675731658936, + "learning_rate": 1.9825118588480382e-07, + "loss": 0.2082, + "step": 1298 + }, + { + "epoch": 0.34566258648217135, + "grad_norm": 0.3811212182044983, + "learning_rate": 1.982480398507518e-07, + "loss": 0.2301, + "step": 1299 + }, + { + "epoch": 0.3459286854709952, + "grad_norm": 0.28547513484954834, + "learning_rate": 1.98244891014466e-07, + "loss": 0.236, + "step": 1300 + }, + { + "epoch": 0.34619478445981905, + "grad_norm": 0.2657378613948822, + "learning_rate": 1.982417393760363e-07, + "loss": 0.2204, + "step": 1301 + }, + { + "epoch": 0.3464608834486429, + "grad_norm": 0.2622591555118561, + "learning_rate": 1.9823858493555255e-07, + "loss": 0.2229, + "step": 1302 + }, + { + "epoch": 0.34672698243746675, + "grad_norm": 0.28940483927726746, + "learning_rate": 1.9823542769310474e-07, + "loss": 0.2304, + "step": 1303 + }, + { + "epoch": 0.3469930814262906, + "grad_norm": 0.3294830620288849, + "learning_rate": 1.9823226764878286e-07, + "loss": 0.2186, + "step": 1304 + }, + { + "epoch": 0.34725918041511444, + "grad_norm": 0.2635875940322876, + "learning_rate": 1.9822910480267714e-07, + "loss": 0.2101, + "step": 1305 + }, + { + "epoch": 0.3475252794039383, + "grad_norm": 0.26488327980041504, + "learning_rate": 1.9822593915487773e-07, + "loss": 0.222, + "step": 1306 + }, + { + "epoch": 0.3477913783927621, + "grad_norm": 0.27648839354515076, + "learning_rate": 1.9822277070547492e-07, + "loss": 0.2311, + "step": 1307 + }, + { + "epoch": 0.34805747738158593, + "grad_norm": 0.2645310163497925, + "learning_rate": 1.982195994545591e-07, + "loss": 0.2184, + "step": 1308 + }, + { + "epoch": 0.3483235763704098, + "grad_norm": 0.25879886746406555, + "learning_rate": 1.982164254022207e-07, + "loss": 0.216, + "step": 1309 + }, + { + "epoch": 0.34858967535923363, + "grad_norm": 0.4077329933643341, + "learning_rate": 1.982132485485503e-07, + "loss": 0.2598, + "step": 1310 + }, + { + "epoch": 0.3488557743480575, + "grad_norm": 0.30274271965026855, + "learning_rate": 1.9821006889363843e-07, + "loss": 0.228, + "step": 1311 + }, + { + "epoch": 0.3491218733368813, + "grad_norm": 0.2562258541584015, + "learning_rate": 1.9820688643757585e-07, + "loss": 0.2139, + "step": 1312 + }, + { + "epoch": 0.3493879723257052, + "grad_norm": 0.39195337891578674, + "learning_rate": 1.9820370118045327e-07, + "loss": 0.266, + "step": 1313 + }, + { + "epoch": 0.349654071314529, + "grad_norm": 0.28212788701057434, + "learning_rate": 1.9820051312236158e-07, + "loss": 0.2229, + "step": 1314 + }, + { + "epoch": 0.3499201703033529, + "grad_norm": 0.388368159532547, + "learning_rate": 1.981973222633917e-07, + "loss": 0.2447, + "step": 1315 + }, + { + "epoch": 0.35018626929217667, + "grad_norm": 0.3029737174510956, + "learning_rate": 1.9819412860363466e-07, + "loss": 0.2415, + "step": 1316 + }, + { + "epoch": 0.3504523682810005, + "grad_norm": 0.39146852493286133, + "learning_rate": 1.9819093214318155e-07, + "loss": 0.245, + "step": 1317 + }, + { + "epoch": 0.35071846726982436, + "grad_norm": 0.3702529966831207, + "learning_rate": 1.981877328821235e-07, + "loss": 0.2502, + "step": 1318 + }, + { + "epoch": 0.3509845662586482, + "grad_norm": 0.3212684392929077, + "learning_rate": 1.9818453082055177e-07, + "loss": 0.2412, + "step": 1319 + }, + { + "epoch": 0.35125066524747206, + "grad_norm": 0.24741926789283752, + "learning_rate": 1.9818132595855773e-07, + "loss": 0.2207, + "step": 1320 + }, + { + "epoch": 0.3515167642362959, + "grad_norm": 0.2710859179496765, + "learning_rate": 1.9817811829623272e-07, + "loss": 0.2359, + "step": 1321 + }, + { + "epoch": 0.35178286322511976, + "grad_norm": 0.40043905377388, + "learning_rate": 1.981749078336683e-07, + "loss": 0.2728, + "step": 1322 + }, + { + "epoch": 0.3520489622139436, + "grad_norm": 0.275113582611084, + "learning_rate": 1.98171694570956e-07, + "loss": 0.2452, + "step": 1323 + }, + { + "epoch": 0.35231506120276745, + "grad_norm": 0.3497631847858429, + "learning_rate": 1.9816847850818744e-07, + "loss": 0.2402, + "step": 1324 + }, + { + "epoch": 0.35258116019159125, + "grad_norm": 0.2555312216281891, + "learning_rate": 1.9816525964545445e-07, + "loss": 0.2117, + "step": 1325 + }, + { + "epoch": 0.3528472591804151, + "grad_norm": 0.2773703634738922, + "learning_rate": 1.9816203798284875e-07, + "loss": 0.2381, + "step": 1326 + }, + { + "epoch": 0.35311335816923894, + "grad_norm": 0.3359493315219879, + "learning_rate": 1.9815881352046223e-07, + "loss": 0.2335, + "step": 1327 + }, + { + "epoch": 0.3533794571580628, + "grad_norm": 0.3962811529636383, + "learning_rate": 1.981555862583869e-07, + "loss": 0.2526, + "step": 1328 + }, + { + "epoch": 0.35364555614688664, + "grad_norm": 0.3317287862300873, + "learning_rate": 1.981523561967148e-07, + "loss": 0.2466, + "step": 1329 + }, + { + "epoch": 0.3539116551357105, + "grad_norm": 0.2931465208530426, + "learning_rate": 1.9814912333553803e-07, + "loss": 0.2181, + "step": 1330 + }, + { + "epoch": 0.35417775412453434, + "grad_norm": 0.32501715421676636, + "learning_rate": 1.981458876749488e-07, + "loss": 0.2277, + "step": 1331 + }, + { + "epoch": 0.3544438531133582, + "grad_norm": 0.28398922085762024, + "learning_rate": 1.9814264921503943e-07, + "loss": 0.2383, + "step": 1332 + }, + { + "epoch": 0.35470995210218204, + "grad_norm": 0.2624034881591797, + "learning_rate": 1.9813940795590224e-07, + "loss": 0.2199, + "step": 1333 + }, + { + "epoch": 0.35497605109100583, + "grad_norm": 0.35913515090942383, + "learning_rate": 1.9813616389762975e-07, + "loss": 0.2341, + "step": 1334 + }, + { + "epoch": 0.3552421500798297, + "grad_norm": 0.46384355425834656, + "learning_rate": 1.981329170403144e-07, + "loss": 0.2479, + "step": 1335 + }, + { + "epoch": 0.3555082490686535, + "grad_norm": 0.29348024725914, + "learning_rate": 1.981296673840489e-07, + "loss": 0.233, + "step": 1336 + }, + { + "epoch": 0.3557743480574774, + "grad_norm": 0.45135805010795593, + "learning_rate": 1.9812641492892582e-07, + "loss": 0.2665, + "step": 1337 + }, + { + "epoch": 0.3560404470463012, + "grad_norm": 0.42650020122528076, + "learning_rate": 1.98123159675038e-07, + "loss": 0.2472, + "step": 1338 + }, + { + "epoch": 0.35630654603512507, + "grad_norm": 0.2602451741695404, + "learning_rate": 1.9811990162247828e-07, + "loss": 0.2158, + "step": 1339 + }, + { + "epoch": 0.3565726450239489, + "grad_norm": 0.2627120614051819, + "learning_rate": 1.9811664077133959e-07, + "loss": 0.2287, + "step": 1340 + }, + { + "epoch": 0.35683874401277277, + "grad_norm": 0.27313047647476196, + "learning_rate": 1.981133771217149e-07, + "loss": 0.2212, + "step": 1341 + }, + { + "epoch": 0.3571048430015966, + "grad_norm": 0.2868640124797821, + "learning_rate": 1.981101106736973e-07, + "loss": 0.2404, + "step": 1342 + }, + { + "epoch": 0.3573709419904204, + "grad_norm": 0.3203345835208893, + "learning_rate": 1.9810684142738002e-07, + "loss": 0.2346, + "step": 1343 + }, + { + "epoch": 0.35763704097924426, + "grad_norm": 0.37953487038612366, + "learning_rate": 1.9810356938285623e-07, + "loss": 0.2294, + "step": 1344 + }, + { + "epoch": 0.3579031399680681, + "grad_norm": 0.27638110518455505, + "learning_rate": 1.981002945402193e-07, + "loss": 0.2276, + "step": 1345 + }, + { + "epoch": 0.35816923895689196, + "grad_norm": 0.36310556530952454, + "learning_rate": 1.980970168995626e-07, + "loss": 0.2313, + "step": 1346 + }, + { + "epoch": 0.3584353379457158, + "grad_norm": 0.27418121695518494, + "learning_rate": 1.9809373646097965e-07, + "loss": 0.2243, + "step": 1347 + }, + { + "epoch": 0.35870143693453965, + "grad_norm": 0.3804631233215332, + "learning_rate": 1.98090453224564e-07, + "loss": 0.222, + "step": 1348 + }, + { + "epoch": 0.3589675359233635, + "grad_norm": 0.27658185362815857, + "learning_rate": 1.9808716719040926e-07, + "loss": 0.2147, + "step": 1349 + }, + { + "epoch": 0.35923363491218735, + "grad_norm": 0.35779520869255066, + "learning_rate": 1.9808387835860924e-07, + "loss": 0.2287, + "step": 1350 + }, + { + "epoch": 0.3594997339010112, + "grad_norm": 0.2884373962879181, + "learning_rate": 1.980805867292577e-07, + "loss": 0.2265, + "step": 1351 + }, + { + "epoch": 0.359765832889835, + "grad_norm": 0.28592416644096375, + "learning_rate": 1.9807729230244847e-07, + "loss": 0.229, + "step": 1352 + }, + { + "epoch": 0.36003193187865884, + "grad_norm": 0.24059073626995087, + "learning_rate": 1.9807399507827557e-07, + "loss": 0.2156, + "step": 1353 + }, + { + "epoch": 0.3602980308674827, + "grad_norm": 0.3775005340576172, + "learning_rate": 1.9807069505683306e-07, + "loss": 0.2355, + "step": 1354 + }, + { + "epoch": 0.36056412985630654, + "grad_norm": 0.2823440432548523, + "learning_rate": 1.9806739223821503e-07, + "loss": 0.2393, + "step": 1355 + }, + { + "epoch": 0.3608302288451304, + "grad_norm": 0.2558285892009735, + "learning_rate": 1.980640866225157e-07, + "loss": 0.2201, + "step": 1356 + }, + { + "epoch": 0.36109632783395423, + "grad_norm": 0.2821640074253082, + "learning_rate": 1.980607782098293e-07, + "loss": 0.23, + "step": 1357 + }, + { + "epoch": 0.3613624268227781, + "grad_norm": 0.28017666935920715, + "learning_rate": 1.980574670002503e-07, + "loss": 0.2212, + "step": 1358 + }, + { + "epoch": 0.36162852581160193, + "grad_norm": 0.3399125039577484, + "learning_rate": 1.9805415299387302e-07, + "loss": 0.2602, + "step": 1359 + }, + { + "epoch": 0.3618946248004258, + "grad_norm": 0.2889019846916199, + "learning_rate": 1.9805083619079207e-07, + "loss": 0.242, + "step": 1360 + }, + { + "epoch": 0.36216072378924963, + "grad_norm": 0.3677397072315216, + "learning_rate": 1.9804751659110201e-07, + "loss": 0.248, + "step": 1361 + }, + { + "epoch": 0.3624268227780734, + "grad_norm": 0.3214714825153351, + "learning_rate": 1.9804419419489756e-07, + "loss": 0.2231, + "step": 1362 + }, + { + "epoch": 0.36269292176689727, + "grad_norm": 0.33044546842575073, + "learning_rate": 1.9804086900227346e-07, + "loss": 0.2412, + "step": 1363 + }, + { + "epoch": 0.3629590207557211, + "grad_norm": 0.48057636618614197, + "learning_rate": 1.980375410133245e-07, + "loss": 0.2606, + "step": 1364 + }, + { + "epoch": 0.36322511974454497, + "grad_norm": 0.27926552295684814, + "learning_rate": 1.9803421022814568e-07, + "loss": 0.2154, + "step": 1365 + }, + { + "epoch": 0.3634912187333688, + "grad_norm": 0.28678402304649353, + "learning_rate": 1.9803087664683198e-07, + "loss": 0.2169, + "step": 1366 + }, + { + "epoch": 0.36375731772219266, + "grad_norm": 0.2697899043560028, + "learning_rate": 1.9802754026947843e-07, + "loss": 0.2265, + "step": 1367 + }, + { + "epoch": 0.3640234167110165, + "grad_norm": 0.2769899368286133, + "learning_rate": 1.980242010961803e-07, + "loss": 0.2174, + "step": 1368 + }, + { + "epoch": 0.36428951569984036, + "grad_norm": 0.2687085270881653, + "learning_rate": 1.980208591270327e-07, + "loss": 0.2376, + "step": 1369 + }, + { + "epoch": 0.3645556146886642, + "grad_norm": 0.36727774143218994, + "learning_rate": 1.9801751436213102e-07, + "loss": 0.2411, + "step": 1370 + }, + { + "epoch": 0.364821713677488, + "grad_norm": 0.29223671555519104, + "learning_rate": 1.9801416680157063e-07, + "loss": 0.2447, + "step": 1371 + }, + { + "epoch": 0.36508781266631185, + "grad_norm": 0.49249759316444397, + "learning_rate": 1.980108164454471e-07, + "loss": 0.2387, + "step": 1372 + }, + { + "epoch": 0.3653539116551357, + "grad_norm": 0.30140241980552673, + "learning_rate": 1.9800746329385584e-07, + "loss": 0.2358, + "step": 1373 + }, + { + "epoch": 0.36562001064395955, + "grad_norm": 0.3660133481025696, + "learning_rate": 1.980041073468926e-07, + "loss": 0.2462, + "step": 1374 + }, + { + "epoch": 0.3658861096327834, + "grad_norm": 0.2764596939086914, + "learning_rate": 1.9800074860465305e-07, + "loss": 0.217, + "step": 1375 + }, + { + "epoch": 0.36615220862160724, + "grad_norm": 0.4017556607723236, + "learning_rate": 1.9799738706723303e-07, + "loss": 0.2515, + "step": 1376 + }, + { + "epoch": 0.3664183076104311, + "grad_norm": 0.27334854006767273, + "learning_rate": 1.9799402273472836e-07, + "loss": 0.2279, + "step": 1377 + }, + { + "epoch": 0.36668440659925494, + "grad_norm": 0.2667626142501831, + "learning_rate": 1.9799065560723504e-07, + "loss": 0.251, + "step": 1378 + }, + { + "epoch": 0.3669505055880788, + "grad_norm": 0.26055535674095154, + "learning_rate": 1.979872856848491e-07, + "loss": 0.2297, + "step": 1379 + }, + { + "epoch": 0.3672166045769026, + "grad_norm": 0.27589598298072815, + "learning_rate": 1.9798391296766667e-07, + "loss": 0.2205, + "step": 1380 + }, + { + "epoch": 0.36748270356572643, + "grad_norm": 0.300843209028244, + "learning_rate": 1.9798053745578392e-07, + "loss": 0.2352, + "step": 1381 + }, + { + "epoch": 0.3677488025545503, + "grad_norm": 0.2744441330432892, + "learning_rate": 1.9797715914929712e-07, + "loss": 0.231, + "step": 1382 + }, + { + "epoch": 0.36801490154337413, + "grad_norm": 0.3585168421268463, + "learning_rate": 1.9797377804830265e-07, + "loss": 0.2506, + "step": 1383 + }, + { + "epoch": 0.368281000532198, + "grad_norm": 0.2579617500305176, + "learning_rate": 1.979703941528969e-07, + "loss": 0.2263, + "step": 1384 + }, + { + "epoch": 0.3685470995210218, + "grad_norm": 0.282884806394577, + "learning_rate": 1.9796700746317648e-07, + "loss": 0.2446, + "step": 1385 + }, + { + "epoch": 0.3688131985098457, + "grad_norm": 0.4982525706291199, + "learning_rate": 1.979636179792379e-07, + "loss": 0.2642, + "step": 1386 + }, + { + "epoch": 0.3690792974986695, + "grad_norm": 0.2864317297935486, + "learning_rate": 1.9796022570117787e-07, + "loss": 0.2315, + "step": 1387 + }, + { + "epoch": 0.36934539648749337, + "grad_norm": 0.2747011184692383, + "learning_rate": 1.9795683062909317e-07, + "loss": 0.2288, + "step": 1388 + }, + { + "epoch": 0.36961149547631716, + "grad_norm": 0.26753970980644226, + "learning_rate": 1.9795343276308055e-07, + "loss": 0.2359, + "step": 1389 + }, + { + "epoch": 0.369877594465141, + "grad_norm": 0.40127649903297424, + "learning_rate": 1.97950032103237e-07, + "loss": 0.244, + "step": 1390 + }, + { + "epoch": 0.37014369345396486, + "grad_norm": 0.371024489402771, + "learning_rate": 1.979466286496595e-07, + "loss": 0.2349, + "step": 1391 + }, + { + "epoch": 0.3704097924427887, + "grad_norm": 0.2810652256011963, + "learning_rate": 1.979432224024451e-07, + "loss": 0.2299, + "step": 1392 + }, + { + "epoch": 0.37067589143161256, + "grad_norm": 0.2931922376155853, + "learning_rate": 1.9793981336169098e-07, + "loss": 0.2371, + "step": 1393 + }, + { + "epoch": 0.3709419904204364, + "grad_norm": 0.27741217613220215, + "learning_rate": 1.9793640152749433e-07, + "loss": 0.2449, + "step": 1394 + }, + { + "epoch": 0.37120808940926026, + "grad_norm": 0.28559088706970215, + "learning_rate": 1.979329868999525e-07, + "loss": 0.2165, + "step": 1395 + }, + { + "epoch": 0.3714741883980841, + "grad_norm": 0.4087582230567932, + "learning_rate": 1.979295694791629e-07, + "loss": 0.2252, + "step": 1396 + }, + { + "epoch": 0.37174028738690795, + "grad_norm": 0.3350978493690491, + "learning_rate": 1.9792614926522296e-07, + "loss": 0.2322, + "step": 1397 + }, + { + "epoch": 0.37200638637573175, + "grad_norm": 0.2685418128967285, + "learning_rate": 1.9792272625823025e-07, + "loss": 0.1999, + "step": 1398 + }, + { + "epoch": 0.3722724853645556, + "grad_norm": 0.3903714716434479, + "learning_rate": 1.979193004582824e-07, + "loss": 0.252, + "step": 1399 + }, + { + "epoch": 0.37253858435337944, + "grad_norm": 0.36344945430755615, + "learning_rate": 1.979158718654771e-07, + "loss": 0.2635, + "step": 1400 + }, + { + "epoch": 0.3728046833422033, + "grad_norm": 0.29663729667663574, + "learning_rate": 1.979124404799122e-07, + "loss": 0.2272, + "step": 1401 + }, + { + "epoch": 0.37307078233102714, + "grad_norm": 0.3372038006782532, + "learning_rate": 1.979090063016855e-07, + "loss": 0.2387, + "step": 1402 + }, + { + "epoch": 0.373336881319851, + "grad_norm": 0.25227242708206177, + "learning_rate": 1.97905569330895e-07, + "loss": 0.2205, + "step": 1403 + }, + { + "epoch": 0.37360298030867484, + "grad_norm": 0.512021541595459, + "learning_rate": 1.979021295676387e-07, + "loss": 0.2411, + "step": 1404 + }, + { + "epoch": 0.3738690792974987, + "grad_norm": 0.36611539125442505, + "learning_rate": 1.978986870120147e-07, + "loss": 0.25, + "step": 1405 + }, + { + "epoch": 0.37413517828632253, + "grad_norm": 0.27595463395118713, + "learning_rate": 1.9789524166412123e-07, + "loss": 0.2334, + "step": 1406 + }, + { + "epoch": 0.3744012772751463, + "grad_norm": 0.25958871841430664, + "learning_rate": 1.9789179352405653e-07, + "loss": 0.2403, + "step": 1407 + }, + { + "epoch": 0.3746673762639702, + "grad_norm": 0.2782961130142212, + "learning_rate": 1.9788834259191893e-07, + "loss": 0.2344, + "step": 1408 + }, + { + "epoch": 0.374933475252794, + "grad_norm": 0.40265557169914246, + "learning_rate": 1.9788488886780693e-07, + "loss": 0.2256, + "step": 1409 + }, + { + "epoch": 0.3751995742416179, + "grad_norm": 0.3099413216114044, + "learning_rate": 1.9788143235181892e-07, + "loss": 0.243, + "step": 1410 + }, + { + "epoch": 0.3754656732304417, + "grad_norm": 0.24233072996139526, + "learning_rate": 1.9787797304405363e-07, + "loss": 0.2238, + "step": 1411 + }, + { + "epoch": 0.37573177221926557, + "grad_norm": 0.27088451385498047, + "learning_rate": 1.9787451094460962e-07, + "loss": 0.2344, + "step": 1412 + }, + { + "epoch": 0.3759978712080894, + "grad_norm": 0.3632282316684723, + "learning_rate": 1.9787104605358565e-07, + "loss": 0.2469, + "step": 1413 + }, + { + "epoch": 0.37626397019691327, + "grad_norm": 0.373450368642807, + "learning_rate": 1.9786757837108057e-07, + "loss": 0.2331, + "step": 1414 + }, + { + "epoch": 0.3765300691857371, + "grad_norm": 0.3992008566856384, + "learning_rate": 1.9786410789719328e-07, + "loss": 0.2549, + "step": 1415 + }, + { + "epoch": 0.37679616817456096, + "grad_norm": 0.2831254005432129, + "learning_rate": 1.9786063463202276e-07, + "loss": 0.2484, + "step": 1416 + }, + { + "epoch": 0.37706226716338476, + "grad_norm": 0.3413431942462921, + "learning_rate": 1.978571585756681e-07, + "loss": 0.2263, + "step": 1417 + }, + { + "epoch": 0.3773283661522086, + "grad_norm": 0.2877960801124573, + "learning_rate": 1.978536797282284e-07, + "loss": 0.231, + "step": 1418 + }, + { + "epoch": 0.37759446514103245, + "grad_norm": 0.28038379549980164, + "learning_rate": 1.978501980898029e-07, + "loss": 0.223, + "step": 1419 + }, + { + "epoch": 0.3778605641298563, + "grad_norm": 0.28110718727111816, + "learning_rate": 1.9784671366049094e-07, + "loss": 0.2377, + "step": 1420 + }, + { + "epoch": 0.37812666311868015, + "grad_norm": 0.4383774399757385, + "learning_rate": 1.9784322644039184e-07, + "loss": 0.2251, + "step": 1421 + }, + { + "epoch": 0.378392762107504, + "grad_norm": 0.3269578516483307, + "learning_rate": 1.9783973642960513e-07, + "loss": 0.2217, + "step": 1422 + }, + { + "epoch": 0.37865886109632785, + "grad_norm": 0.28129884600639343, + "learning_rate": 1.978362436282303e-07, + "loss": 0.2246, + "step": 1423 + }, + { + "epoch": 0.3789249600851517, + "grad_norm": 0.34410223364830017, + "learning_rate": 1.9783274803636697e-07, + "loss": 0.2593, + "step": 1424 + }, + { + "epoch": 0.37919105907397554, + "grad_norm": 0.3310176432132721, + "learning_rate": 1.9782924965411486e-07, + "loss": 0.2431, + "step": 1425 + }, + { + "epoch": 0.37945715806279934, + "grad_norm": 0.28137776255607605, + "learning_rate": 1.9782574848157378e-07, + "loss": 0.2247, + "step": 1426 + }, + { + "epoch": 0.3797232570516232, + "grad_norm": 0.25492778420448303, + "learning_rate": 1.9782224451884354e-07, + "loss": 0.2139, + "step": 1427 + }, + { + "epoch": 0.37998935604044703, + "grad_norm": 0.4214039146900177, + "learning_rate": 1.9781873776602412e-07, + "loss": 0.2244, + "step": 1428 + }, + { + "epoch": 0.3802554550292709, + "grad_norm": 0.2695170044898987, + "learning_rate": 1.978152282232155e-07, + "loss": 0.2316, + "step": 1429 + }, + { + "epoch": 0.38052155401809473, + "grad_norm": 0.4874190092086792, + "learning_rate": 1.9781171589051782e-07, + "loss": 0.25, + "step": 1430 + }, + { + "epoch": 0.3807876530069186, + "grad_norm": 0.2504897117614746, + "learning_rate": 1.9780820076803125e-07, + "loss": 0.2147, + "step": 1431 + }, + { + "epoch": 0.38105375199574243, + "grad_norm": 0.2687884569168091, + "learning_rate": 1.9780468285585601e-07, + "loss": 0.2294, + "step": 1432 + }, + { + "epoch": 0.3813198509845663, + "grad_norm": 0.38651716709136963, + "learning_rate": 1.978011621540925e-07, + "loss": 0.2439, + "step": 1433 + }, + { + "epoch": 0.3815859499733901, + "grad_norm": 0.2569604516029358, + "learning_rate": 1.9779763866284105e-07, + "loss": 0.22, + "step": 1434 + }, + { + "epoch": 0.3818520489622139, + "grad_norm": 0.3783814311027527, + "learning_rate": 1.9779411238220224e-07, + "loss": 0.2634, + "step": 1435 + }, + { + "epoch": 0.38211814795103777, + "grad_norm": 0.2555271089076996, + "learning_rate": 1.9779058331227664e-07, + "loss": 0.2198, + "step": 1436 + }, + { + "epoch": 0.3823842469398616, + "grad_norm": 0.37164920568466187, + "learning_rate": 1.9778705145316485e-07, + "loss": 0.2389, + "step": 1437 + }, + { + "epoch": 0.38265034592868546, + "grad_norm": 0.28532275557518005, + "learning_rate": 1.9778351680496766e-07, + "loss": 0.2473, + "step": 1438 + }, + { + "epoch": 0.3829164449175093, + "grad_norm": 0.27687016129493713, + "learning_rate": 1.9777997936778585e-07, + "loss": 0.2233, + "step": 1439 + }, + { + "epoch": 0.38318254390633316, + "grad_norm": 0.3953215777873993, + "learning_rate": 1.9777643914172035e-07, + "loss": 0.2329, + "step": 1440 + }, + { + "epoch": 0.383448642895157, + "grad_norm": 0.3999815285205841, + "learning_rate": 1.977728961268721e-07, + "loss": 0.2281, + "step": 1441 + }, + { + "epoch": 0.38371474188398086, + "grad_norm": 0.35406869649887085, + "learning_rate": 1.9776935032334218e-07, + "loss": 0.2533, + "step": 1442 + }, + { + "epoch": 0.3839808408728047, + "grad_norm": 0.4003659188747406, + "learning_rate": 1.977658017312317e-07, + "loss": 0.2389, + "step": 1443 + }, + { + "epoch": 0.3842469398616285, + "grad_norm": 0.2872161269187927, + "learning_rate": 1.977622503506419e-07, + "loss": 0.2421, + "step": 1444 + }, + { + "epoch": 0.38451303885045235, + "grad_norm": 0.25905317068099976, + "learning_rate": 1.9775869618167404e-07, + "loss": 0.2192, + "step": 1445 + }, + { + "epoch": 0.3847791378392762, + "grad_norm": 0.2645016312599182, + "learning_rate": 1.9775513922442952e-07, + "loss": 0.2314, + "step": 1446 + }, + { + "epoch": 0.38504523682810005, + "grad_norm": 0.31536024808883667, + "learning_rate": 1.9775157947900975e-07, + "loss": 0.2362, + "step": 1447 + }, + { + "epoch": 0.3853113358169239, + "grad_norm": 0.3990001082420349, + "learning_rate": 1.9774801694551634e-07, + "loss": 0.2191, + "step": 1448 + }, + { + "epoch": 0.38557743480574774, + "grad_norm": 0.2776584327220917, + "learning_rate": 1.9774445162405083e-07, + "loss": 0.2203, + "step": 1449 + }, + { + "epoch": 0.3858435337945716, + "grad_norm": 0.24737094342708588, + "learning_rate": 1.9774088351471495e-07, + "loss": 0.2182, + "step": 1450 + }, + { + "epoch": 0.38610963278339544, + "grad_norm": 0.2799822986125946, + "learning_rate": 1.977373126176104e-07, + "loss": 0.2273, + "step": 1451 + }, + { + "epoch": 0.3863757317722193, + "grad_norm": 0.3752997815608978, + "learning_rate": 1.9773373893283915e-07, + "loss": 0.2464, + "step": 1452 + }, + { + "epoch": 0.3866418307610431, + "grad_norm": 0.2851467430591583, + "learning_rate": 1.9773016246050306e-07, + "loss": 0.2282, + "step": 1453 + }, + { + "epoch": 0.38690792974986693, + "grad_norm": 0.2834787368774414, + "learning_rate": 1.977265832007041e-07, + "loss": 0.2332, + "step": 1454 + }, + { + "epoch": 0.3871740287386908, + "grad_norm": 0.37263575196266174, + "learning_rate": 1.9772300115354442e-07, + "loss": 0.2418, + "step": 1455 + }, + { + "epoch": 0.3874401277275146, + "grad_norm": 0.23412460088729858, + "learning_rate": 1.9771941631912615e-07, + "loss": 0.2024, + "step": 1456 + }, + { + "epoch": 0.3877062267163385, + "grad_norm": 0.3644157648086548, + "learning_rate": 1.9771582869755154e-07, + "loss": 0.2439, + "step": 1457 + }, + { + "epoch": 0.3879723257051623, + "grad_norm": 0.2987091541290283, + "learning_rate": 1.9771223828892297e-07, + "loss": 0.2275, + "step": 1458 + }, + { + "epoch": 0.3882384246939862, + "grad_norm": 0.34569454193115234, + "learning_rate": 1.9770864509334276e-07, + "loss": 0.2433, + "step": 1459 + }, + { + "epoch": 0.38850452368281, + "grad_norm": 0.44253402948379517, + "learning_rate": 1.9770504911091343e-07, + "loss": 0.2468, + "step": 1460 + }, + { + "epoch": 0.38877062267163387, + "grad_norm": 0.332347571849823, + "learning_rate": 1.977014503417376e-07, + "loss": 0.2164, + "step": 1461 + }, + { + "epoch": 0.38903672166045766, + "grad_norm": 0.33636829257011414, + "learning_rate": 1.9769784878591783e-07, + "loss": 0.2209, + "step": 1462 + }, + { + "epoch": 0.3893028206492815, + "grad_norm": 0.3876345753669739, + "learning_rate": 1.976942444435569e-07, + "loss": 0.2378, + "step": 1463 + }, + { + "epoch": 0.38956891963810536, + "grad_norm": 0.29092496633529663, + "learning_rate": 1.976906373147576e-07, + "loss": 0.2227, + "step": 1464 + }, + { + "epoch": 0.3898350186269292, + "grad_norm": 0.2581065893173218, + "learning_rate": 1.976870273996228e-07, + "loss": 0.2221, + "step": 1465 + }, + { + "epoch": 0.39010111761575306, + "grad_norm": 0.35909807682037354, + "learning_rate": 1.9768341469825548e-07, + "loss": 0.2419, + "step": 1466 + }, + { + "epoch": 0.3903672166045769, + "grad_norm": 0.42693954706192017, + "learning_rate": 1.9767979921075865e-07, + "loss": 0.2391, + "step": 1467 + }, + { + "epoch": 0.39063331559340075, + "grad_norm": 0.36610063910484314, + "learning_rate": 1.9767618093723546e-07, + "loss": 0.2457, + "step": 1468 + }, + { + "epoch": 0.3908994145822246, + "grad_norm": 0.6602458357810974, + "learning_rate": 1.976725598777891e-07, + "loss": 0.2461, + "step": 1469 + }, + { + "epoch": 0.39116551357104845, + "grad_norm": 0.3386934697628021, + "learning_rate": 1.9766893603252286e-07, + "loss": 0.2366, + "step": 1470 + }, + { + "epoch": 0.3914316125598723, + "grad_norm": 0.289343923330307, + "learning_rate": 1.976653094015401e-07, + "loss": 0.225, + "step": 1471 + }, + { + "epoch": 0.3916977115486961, + "grad_norm": 0.2560144066810608, + "learning_rate": 1.9766167998494422e-07, + "loss": 0.2142, + "step": 1472 + }, + { + "epoch": 0.39196381053751994, + "grad_norm": 0.762561023235321, + "learning_rate": 1.9765804778283878e-07, + "loss": 0.2389, + "step": 1473 + }, + { + "epoch": 0.3922299095263438, + "grad_norm": 0.35351940989494324, + "learning_rate": 1.9765441279532737e-07, + "loss": 0.2308, + "step": 1474 + }, + { + "epoch": 0.39249600851516764, + "grad_norm": 0.295230895280838, + "learning_rate": 1.9765077502251366e-07, + "loss": 0.2311, + "step": 1475 + }, + { + "epoch": 0.3927621075039915, + "grad_norm": 0.2976970374584198, + "learning_rate": 1.9764713446450144e-07, + "loss": 0.2353, + "step": 1476 + }, + { + "epoch": 0.39302820649281534, + "grad_norm": 0.26408475637435913, + "learning_rate": 1.9764349112139448e-07, + "loss": 0.1855, + "step": 1477 + }, + { + "epoch": 0.3932943054816392, + "grad_norm": 0.30430784821510315, + "learning_rate": 1.9763984499329672e-07, + "loss": 0.2459, + "step": 1478 + }, + { + "epoch": 0.39356040447046303, + "grad_norm": 0.330452024936676, + "learning_rate": 1.976361960803122e-07, + "loss": 0.2669, + "step": 1479 + }, + { + "epoch": 0.3938265034592869, + "grad_norm": 0.36993345618247986, + "learning_rate": 1.9763254438254498e-07, + "loss": 0.2373, + "step": 1480 + }, + { + "epoch": 0.3940926024481107, + "grad_norm": 0.28949010372161865, + "learning_rate": 1.9762888990009916e-07, + "loss": 0.2464, + "step": 1481 + }, + { + "epoch": 0.3943587014369345, + "grad_norm": 0.3769736588001251, + "learning_rate": 1.97625232633079e-07, + "loss": 0.2433, + "step": 1482 + }, + { + "epoch": 0.39462480042575837, + "grad_norm": 0.27104535698890686, + "learning_rate": 1.9762157258158884e-07, + "loss": 0.2168, + "step": 1483 + }, + { + "epoch": 0.3948908994145822, + "grad_norm": 0.445472776889801, + "learning_rate": 1.9761790974573308e-07, + "loss": 0.2464, + "step": 1484 + }, + { + "epoch": 0.39515699840340607, + "grad_norm": 0.25592777132987976, + "learning_rate": 1.9761424412561612e-07, + "loss": 0.208, + "step": 1485 + }, + { + "epoch": 0.3954230973922299, + "grad_norm": 0.3312970995903015, + "learning_rate": 1.9761057572134258e-07, + "loss": 0.2313, + "step": 1486 + }, + { + "epoch": 0.39568919638105376, + "grad_norm": 0.28130534291267395, + "learning_rate": 1.9760690453301705e-07, + "loss": 0.2295, + "step": 1487 + }, + { + "epoch": 0.3959552953698776, + "grad_norm": 0.3407542407512665, + "learning_rate": 1.9760323056074427e-07, + "loss": 0.24, + "step": 1488 + }, + { + "epoch": 0.39622139435870146, + "grad_norm": 0.2686028480529785, + "learning_rate": 1.97599553804629e-07, + "loss": 0.215, + "step": 1489 + }, + { + "epoch": 0.39648749334752525, + "grad_norm": 0.2936232388019562, + "learning_rate": 1.9759587426477613e-07, + "loss": 0.2295, + "step": 1490 + }, + { + "epoch": 0.3967535923363491, + "grad_norm": 0.28139829635620117, + "learning_rate": 1.9759219194129063e-07, + "loss": 0.2253, + "step": 1491 + }, + { + "epoch": 0.39701969132517295, + "grad_norm": 0.36895841360092163, + "learning_rate": 1.9758850683427747e-07, + "loss": 0.2678, + "step": 1492 + }, + { + "epoch": 0.3972857903139968, + "grad_norm": 0.27207571268081665, + "learning_rate": 1.9758481894384178e-07, + "loss": 0.2249, + "step": 1493 + }, + { + "epoch": 0.39755188930282065, + "grad_norm": 0.3230326771736145, + "learning_rate": 1.9758112827008878e-07, + "loss": 0.2344, + "step": 1494 + }, + { + "epoch": 0.3978179882916445, + "grad_norm": 0.3382130265235901, + "learning_rate": 1.9757743481312366e-07, + "loss": 0.2443, + "step": 1495 + }, + { + "epoch": 0.39808408728046835, + "grad_norm": 0.3121512830257416, + "learning_rate": 1.9757373857305185e-07, + "loss": 0.2183, + "step": 1496 + }, + { + "epoch": 0.3983501862692922, + "grad_norm": 0.3481309711933136, + "learning_rate": 1.9757003954997873e-07, + "loss": 0.238, + "step": 1497 + }, + { + "epoch": 0.39861628525811604, + "grad_norm": 0.33920323848724365, + "learning_rate": 1.9756633774400983e-07, + "loss": 0.244, + "step": 1498 + }, + { + "epoch": 0.39888238424693984, + "grad_norm": 0.31791165471076965, + "learning_rate": 1.9756263315525068e-07, + "loss": 0.2374, + "step": 1499 + }, + { + "epoch": 0.3991484832357637, + "grad_norm": 0.2700265944004059, + "learning_rate": 1.9755892578380696e-07, + "loss": 0.2243, + "step": 1500 + }, + { + "epoch": 0.39941458222458753, + "grad_norm": 0.29030242562294006, + "learning_rate": 1.9755521562978446e-07, + "loss": 0.2131, + "step": 1501 + }, + { + "epoch": 0.3996806812134114, + "grad_norm": 0.26225847005844116, + "learning_rate": 1.9755150269328896e-07, + "loss": 0.2355, + "step": 1502 + }, + { + "epoch": 0.39994678020223523, + "grad_norm": 0.3869900703430176, + "learning_rate": 1.9754778697442637e-07, + "loss": 0.2357, + "step": 1503 + }, + { + "epoch": 0.4002128791910591, + "grad_norm": 0.2751007378101349, + "learning_rate": 1.9754406847330268e-07, + "loss": 0.2361, + "step": 1504 + }, + { + "epoch": 0.4004789781798829, + "grad_norm": 0.4249611496925354, + "learning_rate": 1.9754034719002392e-07, + "loss": 0.2243, + "step": 1505 + }, + { + "epoch": 0.4007450771687068, + "grad_norm": 0.29668882489204407, + "learning_rate": 1.9753662312469629e-07, + "loss": 0.2319, + "step": 1506 + }, + { + "epoch": 0.4010111761575306, + "grad_norm": 0.30094113945961, + "learning_rate": 1.9753289627742591e-07, + "loss": 0.2357, + "step": 1507 + }, + { + "epoch": 0.4012772751463544, + "grad_norm": 0.28665634989738464, + "learning_rate": 1.9752916664831916e-07, + "loss": 0.243, + "step": 1508 + }, + { + "epoch": 0.40154337413517827, + "grad_norm": 0.3446057438850403, + "learning_rate": 1.975254342374824e-07, + "loss": 0.2609, + "step": 1509 + }, + { + "epoch": 0.4018094731240021, + "grad_norm": 0.3589448928833008, + "learning_rate": 1.9752169904502206e-07, + "loss": 0.2184, + "step": 1510 + }, + { + "epoch": 0.40207557211282596, + "grad_norm": 0.262887179851532, + "learning_rate": 1.9751796107104468e-07, + "loss": 0.2324, + "step": 1511 + }, + { + "epoch": 0.4023416711016498, + "grad_norm": 0.3989064395427704, + "learning_rate": 1.9751422031565687e-07, + "loss": 0.2478, + "step": 1512 + }, + { + "epoch": 0.40260777009047366, + "grad_norm": 0.27404364943504333, + "learning_rate": 1.9751047677896536e-07, + "loss": 0.227, + "step": 1513 + }, + { + "epoch": 0.4028738690792975, + "grad_norm": 0.26236262917518616, + "learning_rate": 1.975067304610769e-07, + "loss": 0.2005, + "step": 1514 + }, + { + "epoch": 0.40313996806812136, + "grad_norm": 0.25944000482559204, + "learning_rate": 1.9750298136209836e-07, + "loss": 0.2096, + "step": 1515 + }, + { + "epoch": 0.4034060670569452, + "grad_norm": 0.4183819591999054, + "learning_rate": 1.9749922948213663e-07, + "loss": 0.2687, + "step": 1516 + }, + { + "epoch": 0.403672166045769, + "grad_norm": 0.2812224328517914, + "learning_rate": 1.9749547482129874e-07, + "loss": 0.2074, + "step": 1517 + }, + { + "epoch": 0.40393826503459285, + "grad_norm": 0.3151700496673584, + "learning_rate": 1.9749171737969182e-07, + "loss": 0.2241, + "step": 1518 + }, + { + "epoch": 0.4042043640234167, + "grad_norm": 0.3291691541671753, + "learning_rate": 1.9748795715742298e-07, + "loss": 0.2351, + "step": 1519 + }, + { + "epoch": 0.40447046301224054, + "grad_norm": 0.2876807153224945, + "learning_rate": 1.974841941545995e-07, + "loss": 0.2546, + "step": 1520 + }, + { + "epoch": 0.4047365620010644, + "grad_norm": 0.443215012550354, + "learning_rate": 1.9748042837132872e-07, + "loss": 0.2474, + "step": 1521 + }, + { + "epoch": 0.40500266098988824, + "grad_norm": 0.2700797915458679, + "learning_rate": 1.9747665980771804e-07, + "loss": 0.2298, + "step": 1522 + }, + { + "epoch": 0.4052687599787121, + "grad_norm": 0.26020342111587524, + "learning_rate": 1.9747288846387492e-07, + "loss": 0.2334, + "step": 1523 + }, + { + "epoch": 0.40553485896753594, + "grad_norm": 0.33626601099967957, + "learning_rate": 1.9746911433990696e-07, + "loss": 0.2416, + "step": 1524 + }, + { + "epoch": 0.4058009579563598, + "grad_norm": 0.2725061774253845, + "learning_rate": 1.9746533743592178e-07, + "loss": 0.2415, + "step": 1525 + }, + { + "epoch": 0.40606705694518364, + "grad_norm": 0.5408943295478821, + "learning_rate": 1.9746155775202712e-07, + "loss": 0.2792, + "step": 1526 + }, + { + "epoch": 0.40633315593400743, + "grad_norm": 0.26040035486221313, + "learning_rate": 1.974577752883308e-07, + "loss": 0.2122, + "step": 1527 + }, + { + "epoch": 0.4065992549228313, + "grad_norm": 0.2799723446369171, + "learning_rate": 1.9745399004494066e-07, + "loss": 0.2088, + "step": 1528 + }, + { + "epoch": 0.4068653539116551, + "grad_norm": 0.26558005809783936, + "learning_rate": 1.974502020219647e-07, + "loss": 0.2333, + "step": 1529 + }, + { + "epoch": 0.407131452900479, + "grad_norm": 0.3709883987903595, + "learning_rate": 1.9744641121951096e-07, + "loss": 0.2313, + "step": 1530 + }, + { + "epoch": 0.4073975518893028, + "grad_norm": 0.2762540280818939, + "learning_rate": 1.9744261763768755e-07, + "loss": 0.2405, + "step": 1531 + }, + { + "epoch": 0.40766365087812667, + "grad_norm": 0.3538624048233032, + "learning_rate": 1.9743882127660264e-07, + "loss": 0.2363, + "step": 1532 + }, + { + "epoch": 0.4079297498669505, + "grad_norm": 0.39772605895996094, + "learning_rate": 1.9743502213636457e-07, + "loss": 0.2543, + "step": 1533 + }, + { + "epoch": 0.40819584885577437, + "grad_norm": 0.2889617383480072, + "learning_rate": 1.9743122021708167e-07, + "loss": 0.211, + "step": 1534 + }, + { + "epoch": 0.4084619478445982, + "grad_norm": 0.33491209149360657, + "learning_rate": 1.9742741551886237e-07, + "loss": 0.2302, + "step": 1535 + }, + { + "epoch": 0.408728046833422, + "grad_norm": 0.2742573916912079, + "learning_rate": 1.974236080418152e-07, + "loss": 0.2141, + "step": 1536 + }, + { + "epoch": 0.40899414582224586, + "grad_norm": 0.2915595471858978, + "learning_rate": 1.9741979778604875e-07, + "loss": 0.2371, + "step": 1537 + }, + { + "epoch": 0.4092602448110697, + "grad_norm": 0.23564524948596954, + "learning_rate": 1.9741598475167172e-07, + "loss": 0.1947, + "step": 1538 + }, + { + "epoch": 0.40952634379989356, + "grad_norm": 0.27215877175331116, + "learning_rate": 1.9741216893879282e-07, + "loss": 0.2262, + "step": 1539 + }, + { + "epoch": 0.4097924427887174, + "grad_norm": 0.3344111442565918, + "learning_rate": 1.9740835034752093e-07, + "loss": 0.2253, + "step": 1540 + }, + { + "epoch": 0.41005854177754125, + "grad_norm": 0.33900928497314453, + "learning_rate": 1.9740452897796494e-07, + "loss": 0.2373, + "step": 1541 + }, + { + "epoch": 0.4103246407663651, + "grad_norm": 0.28146183490753174, + "learning_rate": 1.9740070483023384e-07, + "loss": 0.2419, + "step": 1542 + }, + { + "epoch": 0.41059073975518895, + "grad_norm": 0.2703622579574585, + "learning_rate": 1.9739687790443672e-07, + "loss": 0.2048, + "step": 1543 + }, + { + "epoch": 0.4108568387440128, + "grad_norm": 0.2698519825935364, + "learning_rate": 1.9739304820068274e-07, + "loss": 0.2185, + "step": 1544 + }, + { + "epoch": 0.4111229377328366, + "grad_norm": 0.39611414074897766, + "learning_rate": 1.9738921571908106e-07, + "loss": 0.2382, + "step": 1545 + }, + { + "epoch": 0.41138903672166044, + "grad_norm": 0.31891748309135437, + "learning_rate": 1.973853804597411e-07, + "loss": 0.218, + "step": 1546 + }, + { + "epoch": 0.4116551357104843, + "grad_norm": 0.3007456362247467, + "learning_rate": 1.973815424227722e-07, + "loss": 0.2317, + "step": 1547 + }, + { + "epoch": 0.41192123469930814, + "grad_norm": 0.41045475006103516, + "learning_rate": 1.9737770160828378e-07, + "loss": 0.2354, + "step": 1548 + }, + { + "epoch": 0.412187333688132, + "grad_norm": 0.312750905752182, + "learning_rate": 1.9737385801638542e-07, + "loss": 0.2375, + "step": 1549 + }, + { + "epoch": 0.41245343267695583, + "grad_norm": 0.29503393173217773, + "learning_rate": 1.9737001164718679e-07, + "loss": 0.2399, + "step": 1550 + }, + { + "epoch": 0.4127195316657797, + "grad_norm": 0.2977735996246338, + "learning_rate": 1.9736616250079754e-07, + "loss": 0.2165, + "step": 1551 + }, + { + "epoch": 0.41298563065460353, + "grad_norm": 0.29754337668418884, + "learning_rate": 1.973623105773275e-07, + "loss": 0.2427, + "step": 1552 + }, + { + "epoch": 0.4132517296434274, + "grad_norm": 0.2802336812019348, + "learning_rate": 1.9735845587688649e-07, + "loss": 0.2265, + "step": 1553 + }, + { + "epoch": 0.41351782863225117, + "grad_norm": 0.2630413770675659, + "learning_rate": 1.973545983995845e-07, + "loss": 0.2084, + "step": 1554 + }, + { + "epoch": 0.413783927621075, + "grad_norm": 0.27598148584365845, + "learning_rate": 1.973507381455315e-07, + "loss": 0.2308, + "step": 1555 + }, + { + "epoch": 0.41405002660989887, + "grad_norm": 0.26001954078674316, + "learning_rate": 1.9734687511483763e-07, + "loss": 0.2027, + "step": 1556 + }, + { + "epoch": 0.4143161255987227, + "grad_norm": 0.5655499696731567, + "learning_rate": 1.9734300930761307e-07, + "loss": 0.2124, + "step": 1557 + }, + { + "epoch": 0.41458222458754657, + "grad_norm": 0.2678808867931366, + "learning_rate": 1.9733914072396808e-07, + "loss": 0.2284, + "step": 1558 + }, + { + "epoch": 0.4148483235763704, + "grad_norm": 0.45039913058280945, + "learning_rate": 1.97335269364013e-07, + "loss": 0.2359, + "step": 1559 + }, + { + "epoch": 0.41511442256519426, + "grad_norm": 0.33954352140426636, + "learning_rate": 1.9733139522785823e-07, + "loss": 0.2186, + "step": 1560 + }, + { + "epoch": 0.4153805215540181, + "grad_norm": 0.2713543474674225, + "learning_rate": 1.9732751831561427e-07, + "loss": 0.23, + "step": 1561 + }, + { + "epoch": 0.41564662054284196, + "grad_norm": 0.4331529438495636, + "learning_rate": 1.9732363862739173e-07, + "loss": 0.2585, + "step": 1562 + }, + { + "epoch": 0.41591271953166575, + "grad_norm": 0.27582499384880066, + "learning_rate": 1.9731975616330124e-07, + "loss": 0.2308, + "step": 1563 + }, + { + "epoch": 0.4161788185204896, + "grad_norm": 0.2766045033931732, + "learning_rate": 1.9731587092345353e-07, + "loss": 0.2017, + "step": 1564 + }, + { + "epoch": 0.41644491750931345, + "grad_norm": 0.3018302619457245, + "learning_rate": 1.9731198290795942e-07, + "loss": 0.2376, + "step": 1565 + }, + { + "epoch": 0.4167110164981373, + "grad_norm": 0.3521026074886322, + "learning_rate": 1.9730809211692985e-07, + "loss": 0.2199, + "step": 1566 + }, + { + "epoch": 0.41697711548696115, + "grad_norm": 0.4206453859806061, + "learning_rate": 1.973041985504757e-07, + "loss": 0.2408, + "step": 1567 + }, + { + "epoch": 0.417243214475785, + "grad_norm": 0.24573253095149994, + "learning_rate": 1.9730030220870813e-07, + "loss": 0.2104, + "step": 1568 + }, + { + "epoch": 0.41750931346460884, + "grad_norm": 0.2712368071079254, + "learning_rate": 1.972964030917382e-07, + "loss": 0.2147, + "step": 1569 + }, + { + "epoch": 0.4177754124534327, + "grad_norm": 0.34585386514663696, + "learning_rate": 1.9729250119967712e-07, + "loss": 0.229, + "step": 1570 + }, + { + "epoch": 0.41804151144225654, + "grad_norm": 0.29427972435951233, + "learning_rate": 1.972885965326362e-07, + "loss": 0.2348, + "step": 1571 + }, + { + "epoch": 0.41830761043108033, + "grad_norm": 0.25535666942596436, + "learning_rate": 1.972846890907268e-07, + "loss": 0.2139, + "step": 1572 + }, + { + "epoch": 0.4185737094199042, + "grad_norm": 0.3072420656681061, + "learning_rate": 1.9728077887406039e-07, + "loss": 0.2481, + "step": 1573 + }, + { + "epoch": 0.41883980840872803, + "grad_norm": 0.27835118770599365, + "learning_rate": 1.9727686588274848e-07, + "loss": 0.2238, + "step": 1574 + }, + { + "epoch": 0.4191059073975519, + "grad_norm": 0.3563469648361206, + "learning_rate": 1.9727295011690267e-07, + "loss": 0.2454, + "step": 1575 + }, + { + "epoch": 0.41937200638637573, + "grad_norm": 0.2510879337787628, + "learning_rate": 1.9726903157663467e-07, + "loss": 0.2093, + "step": 1576 + }, + { + "epoch": 0.4196381053751996, + "grad_norm": 0.28718316555023193, + "learning_rate": 1.9726511026205623e-07, + "loss": 0.2466, + "step": 1577 + }, + { + "epoch": 0.4199042043640234, + "grad_norm": 0.3351322412490845, + "learning_rate": 1.972611861732792e-07, + "loss": 0.2313, + "step": 1578 + }, + { + "epoch": 0.4201703033528473, + "grad_norm": 0.36949604749679565, + "learning_rate": 1.972572593104155e-07, + "loss": 0.2457, + "step": 1579 + }, + { + "epoch": 0.4204364023416711, + "grad_norm": 0.4014890491962433, + "learning_rate": 1.972533296735771e-07, + "loss": 0.2522, + "step": 1580 + }, + { + "epoch": 0.42070250133049497, + "grad_norm": 0.26496371626853943, + "learning_rate": 1.972493972628761e-07, + "loss": 0.2282, + "step": 1581 + }, + { + "epoch": 0.42096860031931876, + "grad_norm": 0.2723938226699829, + "learning_rate": 1.972454620784247e-07, + "loss": 0.2217, + "step": 1582 + }, + { + "epoch": 0.4212346993081426, + "grad_norm": 0.29972341656684875, + "learning_rate": 1.972415241203351e-07, + "loss": 0.2477, + "step": 1583 + }, + { + "epoch": 0.42150079829696646, + "grad_norm": 0.34958142042160034, + "learning_rate": 1.9723758338871962e-07, + "loss": 0.2421, + "step": 1584 + }, + { + "epoch": 0.4217668972857903, + "grad_norm": 0.3205435574054718, + "learning_rate": 1.9723363988369067e-07, + "loss": 0.2389, + "step": 1585 + }, + { + "epoch": 0.42203299627461416, + "grad_norm": 0.28366324305534363, + "learning_rate": 1.9722969360536076e-07, + "loss": 0.216, + "step": 1586 + }, + { + "epoch": 0.422299095263438, + "grad_norm": 0.3423897325992584, + "learning_rate": 1.9722574455384236e-07, + "loss": 0.2499, + "step": 1587 + }, + { + "epoch": 0.42256519425226186, + "grad_norm": 0.358970046043396, + "learning_rate": 1.9722179272924815e-07, + "loss": 0.2275, + "step": 1588 + }, + { + "epoch": 0.4228312932410857, + "grad_norm": 0.31522104144096375, + "learning_rate": 1.972178381316909e-07, + "loss": 0.2448, + "step": 1589 + }, + { + "epoch": 0.42309739222990955, + "grad_norm": 0.28606194257736206, + "learning_rate": 1.972138807612833e-07, + "loss": 0.2328, + "step": 1590 + }, + { + "epoch": 0.42336349121873335, + "grad_norm": 0.25281277298927307, + "learning_rate": 1.9720992061813829e-07, + "loss": 0.2121, + "step": 1591 + }, + { + "epoch": 0.4236295902075572, + "grad_norm": 0.37541279196739197, + "learning_rate": 1.972059577023688e-07, + "loss": 0.2241, + "step": 1592 + }, + { + "epoch": 0.42389568919638104, + "grad_norm": 0.24879620969295502, + "learning_rate": 1.9720199201408784e-07, + "loss": 0.2051, + "step": 1593 + }, + { + "epoch": 0.4241617881852049, + "grad_norm": 0.28103819489479065, + "learning_rate": 1.9719802355340857e-07, + "loss": 0.226, + "step": 1594 + }, + { + "epoch": 0.42442788717402874, + "grad_norm": 0.2989526093006134, + "learning_rate": 1.9719405232044417e-07, + "loss": 0.2482, + "step": 1595 + }, + { + "epoch": 0.4246939861628526, + "grad_norm": 0.2689415514469147, + "learning_rate": 1.9719007831530788e-07, + "loss": 0.2152, + "step": 1596 + }, + { + "epoch": 0.42496008515167644, + "grad_norm": 0.259412944316864, + "learning_rate": 1.9718610153811304e-07, + "loss": 0.2043, + "step": 1597 + }, + { + "epoch": 0.4252261841405003, + "grad_norm": 0.26880955696105957, + "learning_rate": 1.971821219889731e-07, + "loss": 0.2177, + "step": 1598 + }, + { + "epoch": 0.42549228312932413, + "grad_norm": 0.38873863220214844, + "learning_rate": 1.9717813966800157e-07, + "loss": 0.2218, + "step": 1599 + }, + { + "epoch": 0.4257583821181479, + "grad_norm": 0.28082025051116943, + "learning_rate": 1.9717415457531202e-07, + "loss": 0.2361, + "step": 1600 + }, + { + "epoch": 0.4260244811069718, + "grad_norm": 0.2762211859226227, + "learning_rate": 1.9717016671101807e-07, + "loss": 0.2159, + "step": 1601 + }, + { + "epoch": 0.4262905800957956, + "grad_norm": 0.2657347321510315, + "learning_rate": 1.9716617607523356e-07, + "loss": 0.2311, + "step": 1602 + }, + { + "epoch": 0.42655667908461947, + "grad_norm": 0.2964862585067749, + "learning_rate": 1.9716218266807222e-07, + "loss": 0.231, + "step": 1603 + }, + { + "epoch": 0.4268227780734433, + "grad_norm": 0.3198148310184479, + "learning_rate": 1.9715818648964803e-07, + "loss": 0.2164, + "step": 1604 + }, + { + "epoch": 0.42708887706226717, + "grad_norm": 0.27163994312286377, + "learning_rate": 1.971541875400749e-07, + "loss": 0.2089, + "step": 1605 + }, + { + "epoch": 0.427354976051091, + "grad_norm": 0.36036422848701477, + "learning_rate": 1.971501858194669e-07, + "loss": 0.2261, + "step": 1606 + }, + { + "epoch": 0.42762107503991487, + "grad_norm": 0.4939109683036804, + "learning_rate": 1.971461813279382e-07, + "loss": 0.2412, + "step": 1607 + }, + { + "epoch": 0.4278871740287387, + "grad_norm": 0.2460053712129593, + "learning_rate": 1.97142174065603e-07, + "loss": 0.2209, + "step": 1608 + }, + { + "epoch": 0.4281532730175625, + "grad_norm": 0.34200531244277954, + "learning_rate": 1.971381640325756e-07, + "loss": 0.2446, + "step": 1609 + }, + { + "epoch": 0.42841937200638636, + "grad_norm": 0.3946037292480469, + "learning_rate": 1.9713415122897036e-07, + "loss": 0.2427, + "step": 1610 + }, + { + "epoch": 0.4286854709952102, + "grad_norm": 0.26617175340652466, + "learning_rate": 1.9713013565490174e-07, + "loss": 0.2301, + "step": 1611 + }, + { + "epoch": 0.42895156998403405, + "grad_norm": 0.27276650071144104, + "learning_rate": 1.9712611731048426e-07, + "loss": 0.2176, + "step": 1612 + }, + { + "epoch": 0.4292176689728579, + "grad_norm": 0.27885308861732483, + "learning_rate": 1.9712209619583255e-07, + "loss": 0.245, + "step": 1613 + }, + { + "epoch": 0.42948376796168175, + "grad_norm": 0.5062170624732971, + "learning_rate": 1.971180723110613e-07, + "loss": 0.2306, + "step": 1614 + }, + { + "epoch": 0.4297498669505056, + "grad_norm": 0.29938381910324097, + "learning_rate": 1.971140456562853e-07, + "loss": 0.2378, + "step": 1615 + }, + { + "epoch": 0.43001596593932945, + "grad_norm": 0.2388211190700531, + "learning_rate": 1.971100162316193e-07, + "loss": 0.2051, + "step": 1616 + }, + { + "epoch": 0.4302820649281533, + "grad_norm": 0.4295382499694824, + "learning_rate": 1.9710598403717838e-07, + "loss": 0.2195, + "step": 1617 + }, + { + "epoch": 0.4305481639169771, + "grad_norm": 0.24767690896987915, + "learning_rate": 1.971019490730774e-07, + "loss": 0.1987, + "step": 1618 + }, + { + "epoch": 0.43081426290580094, + "grad_norm": 0.26312583684921265, + "learning_rate": 1.970979113394316e-07, + "loss": 0.2406, + "step": 1619 + }, + { + "epoch": 0.4310803618946248, + "grad_norm": 0.2699165344238281, + "learning_rate": 1.9709387083635597e-07, + "loss": 0.2253, + "step": 1620 + }, + { + "epoch": 0.43134646088344863, + "grad_norm": 0.2718670666217804, + "learning_rate": 1.9708982756396587e-07, + "loss": 0.2244, + "step": 1621 + }, + { + "epoch": 0.4316125598722725, + "grad_norm": 0.3460484743118286, + "learning_rate": 1.970857815223766e-07, + "loss": 0.2296, + "step": 1622 + }, + { + "epoch": 0.43187865886109633, + "grad_norm": 0.4351239800453186, + "learning_rate": 1.9708173271170355e-07, + "loss": 0.2462, + "step": 1623 + }, + { + "epoch": 0.4321447578499202, + "grad_norm": 0.3093509376049042, + "learning_rate": 1.970776811320622e-07, + "loss": 0.2099, + "step": 1624 + }, + { + "epoch": 0.43241085683874403, + "grad_norm": 0.3225369453430176, + "learning_rate": 1.970736267835681e-07, + "loss": 0.2406, + "step": 1625 + }, + { + "epoch": 0.4326769558275679, + "grad_norm": 0.2768619656562805, + "learning_rate": 1.9706956966633692e-07, + "loss": 0.2252, + "step": 1626 + }, + { + "epoch": 0.43294305481639167, + "grad_norm": 0.3394775688648224, + "learning_rate": 1.9706550978048434e-07, + "loss": 0.2551, + "step": 1627 + }, + { + "epoch": 0.4332091538052155, + "grad_norm": 0.3780362606048584, + "learning_rate": 1.970614471261262e-07, + "loss": 0.2227, + "step": 1628 + }, + { + "epoch": 0.43347525279403937, + "grad_norm": 0.31495121121406555, + "learning_rate": 1.9705738170337835e-07, + "loss": 0.2467, + "step": 1629 + }, + { + "epoch": 0.4337413517828632, + "grad_norm": 0.32234862446784973, + "learning_rate": 1.970533135123567e-07, + "loss": 0.2362, + "step": 1630 + }, + { + "epoch": 0.43400745077168706, + "grad_norm": 0.3218546211719513, + "learning_rate": 1.9704924255317738e-07, + "loss": 0.2306, + "step": 1631 + }, + { + "epoch": 0.4342735497605109, + "grad_norm": 0.37706103920936584, + "learning_rate": 1.9704516882595641e-07, + "loss": 0.2182, + "step": 1632 + }, + { + "epoch": 0.43453964874933476, + "grad_norm": 0.3934604227542877, + "learning_rate": 1.9704109233081005e-07, + "loss": 0.2336, + "step": 1633 + }, + { + "epoch": 0.4348057477381586, + "grad_norm": 0.3575260043144226, + "learning_rate": 1.9703701306785453e-07, + "loss": 0.2479, + "step": 1634 + }, + { + "epoch": 0.43507184672698246, + "grad_norm": 0.28413936495780945, + "learning_rate": 1.9703293103720622e-07, + "loss": 0.2132, + "step": 1635 + }, + { + "epoch": 0.4353379457158063, + "grad_norm": 0.316815584897995, + "learning_rate": 1.970288462389815e-07, + "loss": 0.2175, + "step": 1636 + }, + { + "epoch": 0.4356040447046301, + "grad_norm": 0.2803954780101776, + "learning_rate": 1.9702475867329694e-07, + "loss": 0.2271, + "step": 1637 + }, + { + "epoch": 0.43587014369345395, + "grad_norm": 0.779708743095398, + "learning_rate": 1.9702066834026908e-07, + "loss": 0.2519, + "step": 1638 + }, + { + "epoch": 0.4361362426822778, + "grad_norm": 0.27297696471214294, + "learning_rate": 1.9701657524001464e-07, + "loss": 0.2359, + "step": 1639 + }, + { + "epoch": 0.43640234167110165, + "grad_norm": 0.3315456509590149, + "learning_rate": 1.9701247937265027e-07, + "loss": 0.2383, + "step": 1640 + }, + { + "epoch": 0.4366684406599255, + "grad_norm": 0.29130446910858154, + "learning_rate": 1.970083807382929e-07, + "loss": 0.2308, + "step": 1641 + }, + { + "epoch": 0.43693453964874934, + "grad_norm": 0.3489105701446533, + "learning_rate": 1.9700427933705936e-07, + "loss": 0.2215, + "step": 1642 + }, + { + "epoch": 0.4372006386375732, + "grad_norm": 0.27545008063316345, + "learning_rate": 1.9700017516906667e-07, + "loss": 0.2281, + "step": 1643 + }, + { + "epoch": 0.43746673762639704, + "grad_norm": 0.27236104011535645, + "learning_rate": 1.9699606823443183e-07, + "loss": 0.2182, + "step": 1644 + }, + { + "epoch": 0.4377328366152209, + "grad_norm": 0.28362932801246643, + "learning_rate": 1.9699195853327207e-07, + "loss": 0.2359, + "step": 1645 + }, + { + "epoch": 0.4379989356040447, + "grad_norm": 0.32757073640823364, + "learning_rate": 1.9698784606570454e-07, + "loss": 0.2259, + "step": 1646 + }, + { + "epoch": 0.43826503459286853, + "grad_norm": 0.2754421830177307, + "learning_rate": 1.9698373083184653e-07, + "loss": 0.2381, + "step": 1647 + }, + { + "epoch": 0.4385311335816924, + "grad_norm": 0.5770829319953918, + "learning_rate": 1.9697961283181542e-07, + "loss": 0.2457, + "step": 1648 + }, + { + "epoch": 0.4387972325705162, + "grad_norm": 0.2695469558238983, + "learning_rate": 1.969754920657287e-07, + "loss": 0.2309, + "step": 1649 + }, + { + "epoch": 0.4390633315593401, + "grad_norm": 0.38192856311798096, + "learning_rate": 1.969713685337039e-07, + "loss": 0.2451, + "step": 1650 + }, + { + "epoch": 0.4393294305481639, + "grad_norm": 0.28335893154144287, + "learning_rate": 1.9696724223585862e-07, + "loss": 0.2363, + "step": 1651 + }, + { + "epoch": 0.4395955295369878, + "grad_norm": 0.36533239483833313, + "learning_rate": 1.9696311317231054e-07, + "loss": 0.2469, + "step": 1652 + }, + { + "epoch": 0.4398616285258116, + "grad_norm": 0.28613847494125366, + "learning_rate": 1.9695898134317743e-07, + "loss": 0.2009, + "step": 1653 + }, + { + "epoch": 0.44012772751463547, + "grad_norm": 0.4204244613647461, + "learning_rate": 1.9695484674857712e-07, + "loss": 0.2103, + "step": 1654 + }, + { + "epoch": 0.44039382650345926, + "grad_norm": 0.28690803050994873, + "learning_rate": 1.9695070938862756e-07, + "loss": 0.2258, + "step": 1655 + }, + { + "epoch": 0.4406599254922831, + "grad_norm": 0.3900402784347534, + "learning_rate": 1.969465692634468e-07, + "loss": 0.2296, + "step": 1656 + }, + { + "epoch": 0.44092602448110696, + "grad_norm": 0.28999653458595276, + "learning_rate": 1.9694242637315285e-07, + "loss": 0.2185, + "step": 1657 + }, + { + "epoch": 0.4411921234699308, + "grad_norm": 0.2647063136100769, + "learning_rate": 1.9693828071786389e-07, + "loss": 0.2249, + "step": 1658 + }, + { + "epoch": 0.44145822245875466, + "grad_norm": 0.4054471254348755, + "learning_rate": 1.9693413229769818e-07, + "loss": 0.2415, + "step": 1659 + }, + { + "epoch": 0.4417243214475785, + "grad_norm": 0.2882966101169586, + "learning_rate": 1.9692998111277403e-07, + "loss": 0.2365, + "step": 1660 + }, + { + "epoch": 0.44199042043640235, + "grad_norm": 0.2701032757759094, + "learning_rate": 1.969258271632099e-07, + "loss": 0.2092, + "step": 1661 + }, + { + "epoch": 0.4422565194252262, + "grad_norm": 0.2840425968170166, + "learning_rate": 1.9692167044912415e-07, + "loss": 0.2234, + "step": 1662 + }, + { + "epoch": 0.44252261841405005, + "grad_norm": 0.3218797743320465, + "learning_rate": 1.9691751097063545e-07, + "loss": 0.2363, + "step": 1663 + }, + { + "epoch": 0.44278871740287384, + "grad_norm": 0.3112090229988098, + "learning_rate": 1.9691334872786238e-07, + "loss": 0.2132, + "step": 1664 + }, + { + "epoch": 0.4430548163916977, + "grad_norm": 0.26553353667259216, + "learning_rate": 1.9690918372092367e-07, + "loss": 0.2039, + "step": 1665 + }, + { + "epoch": 0.44332091538052154, + "grad_norm": 0.2571031451225281, + "learning_rate": 1.9690501594993808e-07, + "loss": 0.2049, + "step": 1666 + }, + { + "epoch": 0.4435870143693454, + "grad_norm": 0.3651712238788605, + "learning_rate": 1.9690084541502454e-07, + "loss": 0.2434, + "step": 1667 + }, + { + "epoch": 0.44385311335816924, + "grad_norm": 0.33113884925842285, + "learning_rate": 1.9689667211630199e-07, + "loss": 0.2351, + "step": 1668 + }, + { + "epoch": 0.4441192123469931, + "grad_norm": 0.2984287142753601, + "learning_rate": 1.9689249605388941e-07, + "loss": 0.2346, + "step": 1669 + }, + { + "epoch": 0.44438531133581693, + "grad_norm": 0.29145553708076477, + "learning_rate": 1.96888317227906e-07, + "loss": 0.2302, + "step": 1670 + }, + { + "epoch": 0.4446514103246408, + "grad_norm": 0.3688400983810425, + "learning_rate": 1.9688413563847085e-07, + "loss": 0.2284, + "step": 1671 + }, + { + "epoch": 0.44491750931346463, + "grad_norm": 0.33607226610183716, + "learning_rate": 1.968799512857033e-07, + "loss": 0.2195, + "step": 1672 + }, + { + "epoch": 0.4451836083022884, + "grad_norm": 0.36537015438079834, + "learning_rate": 1.9687576416972265e-07, + "loss": 0.2647, + "step": 1673 + }, + { + "epoch": 0.4454497072911123, + "grad_norm": 0.29497140645980835, + "learning_rate": 1.9687157429064835e-07, + "loss": 0.2301, + "step": 1674 + }, + { + "epoch": 0.4457158062799361, + "grad_norm": 0.37132447957992554, + "learning_rate": 1.968673816485999e-07, + "loss": 0.2283, + "step": 1675 + }, + { + "epoch": 0.44598190526875997, + "grad_norm": 0.25592419505119324, + "learning_rate": 1.9686318624369688e-07, + "loss": 0.2134, + "step": 1676 + }, + { + "epoch": 0.4462480042575838, + "grad_norm": 0.37029218673706055, + "learning_rate": 1.96858988076059e-07, + "loss": 0.2298, + "step": 1677 + }, + { + "epoch": 0.44651410324640767, + "grad_norm": 0.3146544396877289, + "learning_rate": 1.968547871458059e-07, + "loss": 0.2486, + "step": 1678 + }, + { + "epoch": 0.4467802022352315, + "grad_norm": 0.2570297122001648, + "learning_rate": 1.9685058345305745e-07, + "loss": 0.2209, + "step": 1679 + }, + { + "epoch": 0.44704630122405536, + "grad_norm": 0.3135516345500946, + "learning_rate": 1.9684637699793356e-07, + "loss": 0.2597, + "step": 1680 + }, + { + "epoch": 0.4473124002128792, + "grad_norm": 0.3956271708011627, + "learning_rate": 1.968421677805542e-07, + "loss": 0.2143, + "step": 1681 + }, + { + "epoch": 0.447578499201703, + "grad_norm": 0.34842994809150696, + "learning_rate": 1.9683795580103943e-07, + "loss": 0.2312, + "step": 1682 + }, + { + "epoch": 0.44784459819052685, + "grad_norm": 0.32372722029685974, + "learning_rate": 1.9683374105950937e-07, + "loss": 0.221, + "step": 1683 + }, + { + "epoch": 0.4481106971793507, + "grad_norm": 0.35460755228996277, + "learning_rate": 1.968295235560842e-07, + "loss": 0.2568, + "step": 1684 + }, + { + "epoch": 0.44837679616817455, + "grad_norm": 0.3489168584346771, + "learning_rate": 1.9682530329088433e-07, + "loss": 0.2394, + "step": 1685 + }, + { + "epoch": 0.4486428951569984, + "grad_norm": 0.257916122674942, + "learning_rate": 1.9682108026403e-07, + "loss": 0.1966, + "step": 1686 + }, + { + "epoch": 0.44890899414582225, + "grad_norm": 0.44601836800575256, + "learning_rate": 1.9681685447564172e-07, + "loss": 0.2262, + "step": 1687 + }, + { + "epoch": 0.4491750931346461, + "grad_norm": 0.399024099111557, + "learning_rate": 1.9681262592584004e-07, + "loss": 0.2194, + "step": 1688 + }, + { + "epoch": 0.44944119212346995, + "grad_norm": 0.26346221566200256, + "learning_rate": 1.968083946147455e-07, + "loss": 0.2234, + "step": 1689 + }, + { + "epoch": 0.4497072911122938, + "grad_norm": 0.3729410171508789, + "learning_rate": 1.968041605424788e-07, + "loss": 0.2311, + "step": 1690 + }, + { + "epoch": 0.44997339010111764, + "grad_norm": 0.28513625264167786, + "learning_rate": 1.9679992370916076e-07, + "loss": 0.246, + "step": 1691 + }, + { + "epoch": 0.45023948908994144, + "grad_norm": 0.25965842604637146, + "learning_rate": 1.967956841149122e-07, + "loss": 0.2118, + "step": 1692 + }, + { + "epoch": 0.4505055880787653, + "grad_norm": 0.34391656517982483, + "learning_rate": 1.96791441759854e-07, + "loss": 0.2253, + "step": 1693 + }, + { + "epoch": 0.45077168706758913, + "grad_norm": 0.28163400292396545, + "learning_rate": 1.9678719664410722e-07, + "loss": 0.2263, + "step": 1694 + }, + { + "epoch": 0.451037786056413, + "grad_norm": 0.3479425609111786, + "learning_rate": 1.967829487677929e-07, + "loss": 0.2402, + "step": 1695 + }, + { + "epoch": 0.45130388504523683, + "grad_norm": 0.2745039761066437, + "learning_rate": 1.967786981310322e-07, + "loss": 0.2136, + "step": 1696 + }, + { + "epoch": 0.4515699840340607, + "grad_norm": 0.26810041069984436, + "learning_rate": 1.9677444473394638e-07, + "loss": 0.2167, + "step": 1697 + }, + { + "epoch": 0.4518360830228845, + "grad_norm": 0.3496895134449005, + "learning_rate": 1.967701885766567e-07, + "loss": 0.244, + "step": 1698 + }, + { + "epoch": 0.4521021820117084, + "grad_norm": 0.269308477640152, + "learning_rate": 1.9676592965928464e-07, + "loss": 0.2148, + "step": 1699 + }, + { + "epoch": 0.4523682810005322, + "grad_norm": 0.38697144389152527, + "learning_rate": 1.9676166798195162e-07, + "loss": 0.2409, + "step": 1700 + }, + { + "epoch": 0.452634379989356, + "grad_norm": 0.29200345277786255, + "learning_rate": 1.9675740354477922e-07, + "loss": 0.2408, + "step": 1701 + }, + { + "epoch": 0.45290047897817987, + "grad_norm": 0.2676614820957184, + "learning_rate": 1.96753136347889e-07, + "loss": 0.201, + "step": 1702 + }, + { + "epoch": 0.4531665779670037, + "grad_norm": 0.2786303758621216, + "learning_rate": 1.9674886639140276e-07, + "loss": 0.2167, + "step": 1703 + }, + { + "epoch": 0.45343267695582756, + "grad_norm": 0.3508264422416687, + "learning_rate": 1.9674459367544222e-07, + "loss": 0.2345, + "step": 1704 + }, + { + "epoch": 0.4536987759446514, + "grad_norm": 0.33923107385635376, + "learning_rate": 1.9674031820012928e-07, + "loss": 0.2137, + "step": 1705 + }, + { + "epoch": 0.45396487493347526, + "grad_norm": 0.2746260166168213, + "learning_rate": 1.967360399655859e-07, + "loss": 0.231, + "step": 1706 + }, + { + "epoch": 0.4542309739222991, + "grad_norm": 0.279002845287323, + "learning_rate": 1.9673175897193406e-07, + "loss": 0.2173, + "step": 1707 + }, + { + "epoch": 0.45449707291112296, + "grad_norm": 0.4202900230884552, + "learning_rate": 1.9672747521929592e-07, + "loss": 0.2627, + "step": 1708 + }, + { + "epoch": 0.4547631718999468, + "grad_norm": 0.24246391654014587, + "learning_rate": 1.967231887077936e-07, + "loss": 0.2171, + "step": 1709 + }, + { + "epoch": 0.4550292708887706, + "grad_norm": 0.2914736568927765, + "learning_rate": 1.9671889943754935e-07, + "loss": 0.2342, + "step": 1710 + }, + { + "epoch": 0.45529536987759445, + "grad_norm": 0.3567613363265991, + "learning_rate": 1.9671460740868562e-07, + "loss": 0.226, + "step": 1711 + }, + { + "epoch": 0.4555614688664183, + "grad_norm": 0.33564096689224243, + "learning_rate": 1.9671031262132473e-07, + "loss": 0.2511, + "step": 1712 + }, + { + "epoch": 0.45582756785524214, + "grad_norm": 0.43331822752952576, + "learning_rate": 1.9670601507558918e-07, + "loss": 0.213, + "step": 1713 + }, + { + "epoch": 0.456093666844066, + "grad_norm": 0.28834518790245056, + "learning_rate": 1.9670171477160154e-07, + "loss": 0.2204, + "step": 1714 + }, + { + "epoch": 0.45635976583288984, + "grad_norm": 0.30802077054977417, + "learning_rate": 1.9669741170948452e-07, + "loss": 0.2346, + "step": 1715 + }, + { + "epoch": 0.4566258648217137, + "grad_norm": 0.4528670608997345, + "learning_rate": 1.9669310588936083e-07, + "loss": 0.2354, + "step": 1716 + }, + { + "epoch": 0.45689196381053754, + "grad_norm": 0.2760123014450073, + "learning_rate": 1.9668879731135327e-07, + "loss": 0.2174, + "step": 1717 + }, + { + "epoch": 0.4571580627993614, + "grad_norm": 0.24306243658065796, + "learning_rate": 1.966844859755847e-07, + "loss": 0.2178, + "step": 1718 + }, + { + "epoch": 0.4574241617881852, + "grad_norm": 0.28812023997306824, + "learning_rate": 1.9668017188217814e-07, + "loss": 0.2375, + "step": 1719 + }, + { + "epoch": 0.45769026077700903, + "grad_norm": 0.2828085720539093, + "learning_rate": 1.966758550312566e-07, + "loss": 0.216, + "step": 1720 + }, + { + "epoch": 0.4579563597658329, + "grad_norm": 0.3331604301929474, + "learning_rate": 1.9667153542294325e-07, + "loss": 0.255, + "step": 1721 + }, + { + "epoch": 0.4582224587546567, + "grad_norm": 0.26787883043289185, + "learning_rate": 1.9666721305736126e-07, + "loss": 0.217, + "step": 1722 + }, + { + "epoch": 0.4584885577434806, + "grad_norm": 0.27457842230796814, + "learning_rate": 1.9666288793463387e-07, + "loss": 0.2322, + "step": 1723 + }, + { + "epoch": 0.4587546567323044, + "grad_norm": 0.33070242404937744, + "learning_rate": 1.9665856005488454e-07, + "loss": 0.2092, + "step": 1724 + }, + { + "epoch": 0.45902075572112827, + "grad_norm": 0.28674060106277466, + "learning_rate": 1.9665422941823664e-07, + "loss": 0.2429, + "step": 1725 + }, + { + "epoch": 0.4592868547099521, + "grad_norm": 0.2446848452091217, + "learning_rate": 1.966498960248137e-07, + "loss": 0.1931, + "step": 1726 + }, + { + "epoch": 0.45955295369877597, + "grad_norm": 0.2563767731189728, + "learning_rate": 1.9664555987473933e-07, + "loss": 0.2089, + "step": 1727 + }, + { + "epoch": 0.45981905268759976, + "grad_norm": 0.2836669981479645, + "learning_rate": 1.966412209681372e-07, + "loss": 0.2199, + "step": 1728 + }, + { + "epoch": 0.4600851516764236, + "grad_norm": 0.2653895914554596, + "learning_rate": 1.9663687930513105e-07, + "loss": 0.2116, + "step": 1729 + }, + { + "epoch": 0.46035125066524746, + "grad_norm": 0.26542046666145325, + "learning_rate": 1.9663253488584473e-07, + "loss": 0.2178, + "step": 1730 + }, + { + "epoch": 0.4606173496540713, + "grad_norm": 0.2698568105697632, + "learning_rate": 1.9662818771040216e-07, + "loss": 0.2064, + "step": 1731 + }, + { + "epoch": 0.46088344864289515, + "grad_norm": 0.29418954253196716, + "learning_rate": 1.966238377789273e-07, + "loss": 0.2167, + "step": 1732 + }, + { + "epoch": 0.461149547631719, + "grad_norm": 0.3466700613498688, + "learning_rate": 1.9661948509154426e-07, + "loss": 0.2314, + "step": 1733 + }, + { + "epoch": 0.46141564662054285, + "grad_norm": 0.2862553596496582, + "learning_rate": 1.9661512964837715e-07, + "loss": 0.1939, + "step": 1734 + }, + { + "epoch": 0.4616817456093667, + "grad_norm": 0.5404584407806396, + "learning_rate": 1.9661077144955022e-07, + "loss": 0.2343, + "step": 1735 + }, + { + "epoch": 0.46194784459819055, + "grad_norm": 0.3768211901187897, + "learning_rate": 1.9660641049518775e-07, + "loss": 0.241, + "step": 1736 + }, + { + "epoch": 0.46221394358701434, + "grad_norm": 0.3334949016571045, + "learning_rate": 1.9660204678541414e-07, + "loss": 0.2055, + "step": 1737 + }, + { + "epoch": 0.4624800425758382, + "grad_norm": 0.3399241864681244, + "learning_rate": 1.9659768032035387e-07, + "loss": 0.2195, + "step": 1738 + }, + { + "epoch": 0.46274614156466204, + "grad_norm": 0.27603957056999207, + "learning_rate": 1.9659331110013145e-07, + "loss": 0.2308, + "step": 1739 + }, + { + "epoch": 0.4630122405534859, + "grad_norm": 0.2873268723487854, + "learning_rate": 1.9658893912487153e-07, + "loss": 0.2422, + "step": 1740 + }, + { + "epoch": 0.46327833954230974, + "grad_norm": 0.33888646960258484, + "learning_rate": 1.9658456439469876e-07, + "loss": 0.2346, + "step": 1741 + }, + { + "epoch": 0.4635444385311336, + "grad_norm": 0.2658902108669281, + "learning_rate": 1.9658018690973797e-07, + "loss": 0.227, + "step": 1742 + }, + { + "epoch": 0.46381053751995743, + "grad_norm": 0.31430384516716003, + "learning_rate": 1.9657580667011398e-07, + "loss": 0.2436, + "step": 1743 + }, + { + "epoch": 0.4640766365087813, + "grad_norm": 0.3317306637763977, + "learning_rate": 1.965714236759517e-07, + "loss": 0.2379, + "step": 1744 + }, + { + "epoch": 0.46434273549760513, + "grad_norm": 0.29465270042419434, + "learning_rate": 1.965670379273762e-07, + "loss": 0.2466, + "step": 1745 + }, + { + "epoch": 0.464608834486429, + "grad_norm": 0.2887181341648102, + "learning_rate": 1.9656264942451254e-07, + "loss": 0.2269, + "step": 1746 + }, + { + "epoch": 0.46487493347525277, + "grad_norm": 0.2728267312049866, + "learning_rate": 1.9655825816748593e-07, + "loss": 0.2024, + "step": 1747 + }, + { + "epoch": 0.4651410324640766, + "grad_norm": 0.33768197894096375, + "learning_rate": 1.9655386415642153e-07, + "loss": 0.2346, + "step": 1748 + }, + { + "epoch": 0.46540713145290047, + "grad_norm": 0.39458638429641724, + "learning_rate": 1.9654946739144473e-07, + "loss": 0.212, + "step": 1749 + }, + { + "epoch": 0.4656732304417243, + "grad_norm": 0.2710312008857727, + "learning_rate": 1.9654506787268094e-07, + "loss": 0.228, + "step": 1750 + }, + { + "epoch": 0.46593932943054817, + "grad_norm": 0.3266092538833618, + "learning_rate": 1.9654066560025563e-07, + "loss": 0.2236, + "step": 1751 + }, + { + "epoch": 0.466205428419372, + "grad_norm": 0.3464217782020569, + "learning_rate": 1.9653626057429434e-07, + "loss": 0.2238, + "step": 1752 + }, + { + "epoch": 0.46647152740819586, + "grad_norm": 0.3505083918571472, + "learning_rate": 1.9653185279492273e-07, + "loss": 0.2314, + "step": 1753 + }, + { + "epoch": 0.4667376263970197, + "grad_norm": 0.4859958589076996, + "learning_rate": 1.965274422622665e-07, + "loss": 0.2451, + "step": 1754 + }, + { + "epoch": 0.46700372538584356, + "grad_norm": 0.28720876574516296, + "learning_rate": 1.965230289764515e-07, + "loss": 0.2157, + "step": 1755 + }, + { + "epoch": 0.46726982437466735, + "grad_norm": 0.30628883838653564, + "learning_rate": 1.9651861293760354e-07, + "loss": 0.2395, + "step": 1756 + }, + { + "epoch": 0.4675359233634912, + "grad_norm": 0.3177613317966461, + "learning_rate": 1.9651419414584862e-07, + "loss": 0.2354, + "step": 1757 + }, + { + "epoch": 0.46780202235231505, + "grad_norm": 0.2661770284175873, + "learning_rate": 1.9650977260131274e-07, + "loss": 0.2325, + "step": 1758 + }, + { + "epoch": 0.4680681213411389, + "grad_norm": 0.276431143283844, + "learning_rate": 1.9650534830412203e-07, + "loss": 0.2089, + "step": 1759 + }, + { + "epoch": 0.46833422032996275, + "grad_norm": 0.27750757336616516, + "learning_rate": 1.965009212544027e-07, + "loss": 0.2269, + "step": 1760 + }, + { + "epoch": 0.4686003193187866, + "grad_norm": 0.2794908285140991, + "learning_rate": 1.96496491452281e-07, + "loss": 0.2316, + "step": 1761 + }, + { + "epoch": 0.46886641830761044, + "grad_norm": 0.2561020255088806, + "learning_rate": 1.9649205889788326e-07, + "loss": 0.2218, + "step": 1762 + }, + { + "epoch": 0.4691325172964343, + "grad_norm": 0.3615921437740326, + "learning_rate": 1.9648762359133587e-07, + "loss": 0.237, + "step": 1763 + }, + { + "epoch": 0.46939861628525814, + "grad_norm": 0.29209664463996887, + "learning_rate": 1.9648318553276547e-07, + "loss": 0.2135, + "step": 1764 + }, + { + "epoch": 0.46966471527408193, + "grad_norm": 0.2649155855178833, + "learning_rate": 1.964787447222985e-07, + "loss": 0.2079, + "step": 1765 + }, + { + "epoch": 0.4699308142629058, + "grad_norm": 0.35027289390563965, + "learning_rate": 1.9647430116006167e-07, + "loss": 0.2441, + "step": 1766 + }, + { + "epoch": 0.47019691325172963, + "grad_norm": 0.3416731655597687, + "learning_rate": 1.9646985484618177e-07, + "loss": 0.237, + "step": 1767 + }, + { + "epoch": 0.4704630122405535, + "grad_norm": 0.2830830216407776, + "learning_rate": 1.9646540578078554e-07, + "loss": 0.2247, + "step": 1768 + }, + { + "epoch": 0.47072911122937733, + "grad_norm": 0.4543875455856323, + "learning_rate": 1.964609539639999e-07, + "loss": 0.2248, + "step": 1769 + }, + { + "epoch": 0.4709952102182012, + "grad_norm": 0.346659392118454, + "learning_rate": 1.9645649939595188e-07, + "loss": 0.2341, + "step": 1770 + }, + { + "epoch": 0.471261309207025, + "grad_norm": 0.2571704685688019, + "learning_rate": 1.9645204207676846e-07, + "loss": 0.2131, + "step": 1771 + }, + { + "epoch": 0.4715274081958489, + "grad_norm": 0.2693014442920685, + "learning_rate": 1.9644758200657682e-07, + "loss": 0.2171, + "step": 1772 + }, + { + "epoch": 0.4717935071846727, + "grad_norm": 0.27858248353004456, + "learning_rate": 1.9644311918550412e-07, + "loss": 0.2207, + "step": 1773 + }, + { + "epoch": 0.4720596061734965, + "grad_norm": 0.36964908242225647, + "learning_rate": 1.964386536136777e-07, + "loss": 0.2311, + "step": 1774 + }, + { + "epoch": 0.47232570516232036, + "grad_norm": 0.307213693857193, + "learning_rate": 1.964341852912249e-07, + "loss": 0.2218, + "step": 1775 + }, + { + "epoch": 0.4725918041511442, + "grad_norm": 0.36618006229400635, + "learning_rate": 1.9642971421827315e-07, + "loss": 0.2399, + "step": 1776 + }, + { + "epoch": 0.47285790313996806, + "grad_norm": 0.2887413203716278, + "learning_rate": 1.9642524039495e-07, + "loss": 0.2385, + "step": 1777 + }, + { + "epoch": 0.4731240021287919, + "grad_norm": 0.3699083626270294, + "learning_rate": 1.964207638213831e-07, + "loss": 0.2242, + "step": 1778 + }, + { + "epoch": 0.47339010111761576, + "grad_norm": 0.2877765893936157, + "learning_rate": 1.9641628449770002e-07, + "loss": 0.2239, + "step": 1779 + }, + { + "epoch": 0.4736562001064396, + "grad_norm": 0.27503708004951477, + "learning_rate": 1.964118024240286e-07, + "loss": 0.24, + "step": 1780 + }, + { + "epoch": 0.47392229909526346, + "grad_norm": 0.3104378283023834, + "learning_rate": 1.9640731760049668e-07, + "loss": 0.23, + "step": 1781 + }, + { + "epoch": 0.4741883980840873, + "grad_norm": 0.27289292216300964, + "learning_rate": 1.9640283002723212e-07, + "loss": 0.211, + "step": 1782 + }, + { + "epoch": 0.4744544970729111, + "grad_norm": 0.3365730941295624, + "learning_rate": 1.9639833970436297e-07, + "loss": 0.2422, + "step": 1783 + }, + { + "epoch": 0.47472059606173495, + "grad_norm": 0.4149131178855896, + "learning_rate": 1.963938466320173e-07, + "loss": 0.2377, + "step": 1784 + }, + { + "epoch": 0.4749866950505588, + "grad_norm": 0.2605113089084625, + "learning_rate": 1.963893508103232e-07, + "loss": 0.2122, + "step": 1785 + }, + { + "epoch": 0.47525279403938264, + "grad_norm": 0.28558027744293213, + "learning_rate": 1.9638485223940893e-07, + "loss": 0.2161, + "step": 1786 + }, + { + "epoch": 0.4755188930282065, + "grad_norm": 0.3466443121433258, + "learning_rate": 1.9638035091940285e-07, + "loss": 0.2247, + "step": 1787 + }, + { + "epoch": 0.47578499201703034, + "grad_norm": 0.3555475175380707, + "learning_rate": 1.963758468504333e-07, + "loss": 0.2046, + "step": 1788 + }, + { + "epoch": 0.4760510910058542, + "grad_norm": 0.35343942046165466, + "learning_rate": 1.9637134003262874e-07, + "loss": 0.2211, + "step": 1789 + }, + { + "epoch": 0.47631718999467804, + "grad_norm": 0.2951878607273102, + "learning_rate": 1.9636683046611772e-07, + "loss": 0.2375, + "step": 1790 + }, + { + "epoch": 0.4765832889835019, + "grad_norm": 0.2793707549571991, + "learning_rate": 1.963623181510289e-07, + "loss": 0.2299, + "step": 1791 + }, + { + "epoch": 0.4768493879723257, + "grad_norm": 0.42717719078063965, + "learning_rate": 1.9635780308749092e-07, + "loss": 0.2397, + "step": 1792 + }, + { + "epoch": 0.4771154869611495, + "grad_norm": 0.36314642429351807, + "learning_rate": 1.963532852756326e-07, + "loss": 0.229, + "step": 1793 + }, + { + "epoch": 0.4773815859499734, + "grad_norm": 0.3464435338973999, + "learning_rate": 1.9634876471558278e-07, + "loss": 0.2449, + "step": 1794 + }, + { + "epoch": 0.4776476849387972, + "grad_norm": 0.3210110366344452, + "learning_rate": 1.963442414074704e-07, + "loss": 0.249, + "step": 1795 + }, + { + "epoch": 0.47791378392762107, + "grad_norm": 0.2804130017757416, + "learning_rate": 1.963397153514245e-07, + "loss": 0.2272, + "step": 1796 + }, + { + "epoch": 0.4781798829164449, + "grad_norm": 0.36155834794044495, + "learning_rate": 1.9633518654757411e-07, + "loss": 0.2328, + "step": 1797 + }, + { + "epoch": 0.47844598190526877, + "grad_norm": 0.34839892387390137, + "learning_rate": 1.9633065499604844e-07, + "loss": 0.2203, + "step": 1798 + }, + { + "epoch": 0.4787120808940926, + "grad_norm": 0.3753220736980438, + "learning_rate": 1.9632612069697677e-07, + "loss": 0.2385, + "step": 1799 + }, + { + "epoch": 0.47897817988291647, + "grad_norm": 0.3055884540081024, + "learning_rate": 1.9632158365048835e-07, + "loss": 0.2262, + "step": 1800 + }, + { + "epoch": 0.4792442788717403, + "grad_norm": 0.33094051480293274, + "learning_rate": 1.9631704385671265e-07, + "loss": 0.2301, + "step": 1801 + }, + { + "epoch": 0.4795103778605641, + "grad_norm": 0.41712668538093567, + "learning_rate": 1.9631250131577913e-07, + "loss": 0.2207, + "step": 1802 + }, + { + "epoch": 0.47977647684938796, + "grad_norm": 0.42261242866516113, + "learning_rate": 1.9630795602781737e-07, + "loss": 0.231, + "step": 1803 + }, + { + "epoch": 0.4800425758382118, + "grad_norm": 0.27475783228874207, + "learning_rate": 1.96303407992957e-07, + "loss": 0.201, + "step": 1804 + }, + { + "epoch": 0.48030867482703565, + "grad_norm": 0.2866148054599762, + "learning_rate": 1.9629885721132772e-07, + "loss": 0.2268, + "step": 1805 + }, + { + "epoch": 0.4805747738158595, + "grad_norm": 0.26646947860717773, + "learning_rate": 1.9629430368305932e-07, + "loss": 0.2275, + "step": 1806 + }, + { + "epoch": 0.48084087280468335, + "grad_norm": 0.3570164740085602, + "learning_rate": 1.9628974740828171e-07, + "loss": 0.2365, + "step": 1807 + }, + { + "epoch": 0.4811069717935072, + "grad_norm": 0.3161805272102356, + "learning_rate": 1.9628518838712488e-07, + "loss": 0.2265, + "step": 1808 + }, + { + "epoch": 0.48137307078233105, + "grad_norm": 0.2625880539417267, + "learning_rate": 1.962806266197188e-07, + "loss": 0.2167, + "step": 1809 + }, + { + "epoch": 0.4816391697711549, + "grad_norm": 0.2774200141429901, + "learning_rate": 1.9627606210619357e-07, + "loss": 0.2117, + "step": 1810 + }, + { + "epoch": 0.4819052687599787, + "grad_norm": 0.26908430457115173, + "learning_rate": 1.962714948466794e-07, + "loss": 0.2191, + "step": 1811 + }, + { + "epoch": 0.48217136774880254, + "grad_norm": 0.2598683834075928, + "learning_rate": 1.962669248413066e-07, + "loss": 0.2069, + "step": 1812 + }, + { + "epoch": 0.4824374667376264, + "grad_norm": 0.270634263753891, + "learning_rate": 1.9626235209020546e-07, + "loss": 0.2111, + "step": 1813 + }, + { + "epoch": 0.48270356572645023, + "grad_norm": 0.45944276452064514, + "learning_rate": 1.9625777659350642e-07, + "loss": 0.2454, + "step": 1814 + }, + { + "epoch": 0.4829696647152741, + "grad_norm": 0.28161516785621643, + "learning_rate": 1.9625319835134e-07, + "loss": 0.2221, + "step": 1815 + }, + { + "epoch": 0.48323576370409793, + "grad_norm": 0.4790109097957611, + "learning_rate": 1.9624861736383677e-07, + "loss": 0.2514, + "step": 1816 + }, + { + "epoch": 0.4835018626929218, + "grad_norm": 0.3789800703525543, + "learning_rate": 1.9624403363112738e-07, + "loss": 0.2158, + "step": 1817 + }, + { + "epoch": 0.48376796168174563, + "grad_norm": 0.2836802005767822, + "learning_rate": 1.9623944715334258e-07, + "loss": 0.2184, + "step": 1818 + }, + { + "epoch": 0.4840340606705695, + "grad_norm": 0.36240196228027344, + "learning_rate": 1.9623485793061315e-07, + "loss": 0.2407, + "step": 1819 + }, + { + "epoch": 0.48430015965939327, + "grad_norm": 0.29129308462142944, + "learning_rate": 1.9623026596307004e-07, + "loss": 0.2487, + "step": 1820 + }, + { + "epoch": 0.4845662586482171, + "grad_norm": 0.2525905966758728, + "learning_rate": 1.962256712508442e-07, + "loss": 0.2048, + "step": 1821 + }, + { + "epoch": 0.48483235763704097, + "grad_norm": 0.27428969740867615, + "learning_rate": 1.9622107379406665e-07, + "loss": 0.2217, + "step": 1822 + }, + { + "epoch": 0.4850984566258648, + "grad_norm": 0.2817106246948242, + "learning_rate": 1.9621647359286855e-07, + "loss": 0.2146, + "step": 1823 + }, + { + "epoch": 0.48536455561468866, + "grad_norm": 0.27411773800849915, + "learning_rate": 1.9621187064738113e-07, + "loss": 0.2324, + "step": 1824 + }, + { + "epoch": 0.4856306546035125, + "grad_norm": 0.35649412870407104, + "learning_rate": 1.9620726495773564e-07, + "loss": 0.2224, + "step": 1825 + }, + { + "epoch": 0.48589675359233636, + "grad_norm": 0.274800568819046, + "learning_rate": 1.9620265652406344e-07, + "loss": 0.2454, + "step": 1826 + }, + { + "epoch": 0.4861628525811602, + "grad_norm": 0.26062530279159546, + "learning_rate": 1.9619804534649598e-07, + "loss": 0.2266, + "step": 1827 + }, + { + "epoch": 0.48642895156998406, + "grad_norm": 0.36359846591949463, + "learning_rate": 1.9619343142516476e-07, + "loss": 0.2209, + "step": 1828 + }, + { + "epoch": 0.48669505055880785, + "grad_norm": 0.2658989429473877, + "learning_rate": 1.9618881476020143e-07, + "loss": 0.2144, + "step": 1829 + }, + { + "epoch": 0.4869611495476317, + "grad_norm": 0.263414204120636, + "learning_rate": 1.9618419535173763e-07, + "loss": 0.2184, + "step": 1830 + }, + { + "epoch": 0.48722724853645555, + "grad_norm": 0.26800525188446045, + "learning_rate": 1.9617957319990514e-07, + "loss": 0.23, + "step": 1831 + }, + { + "epoch": 0.4874933475252794, + "grad_norm": 0.2749759256839752, + "learning_rate": 1.9617494830483577e-07, + "loss": 0.2284, + "step": 1832 + }, + { + "epoch": 0.48775944651410325, + "grad_norm": 0.34696274995803833, + "learning_rate": 1.9617032066666145e-07, + "loss": 0.1968, + "step": 1833 + }, + { + "epoch": 0.4880255455029271, + "grad_norm": 0.40410417318344116, + "learning_rate": 1.961656902855141e-07, + "loss": 0.2465, + "step": 1834 + }, + { + "epoch": 0.48829164449175094, + "grad_norm": 0.2748878300189972, + "learning_rate": 1.961610571615259e-07, + "loss": 0.2069, + "step": 1835 + }, + { + "epoch": 0.4885577434805748, + "grad_norm": 0.2738482356071472, + "learning_rate": 1.9615642129482892e-07, + "loss": 0.2284, + "step": 1836 + }, + { + "epoch": 0.48882384246939864, + "grad_norm": 0.2719925343990326, + "learning_rate": 1.961517826855554e-07, + "loss": 0.2176, + "step": 1837 + }, + { + "epoch": 0.48908994145822243, + "grad_norm": 0.26422882080078125, + "learning_rate": 1.9614714133383768e-07, + "loss": 0.2169, + "step": 1838 + }, + { + "epoch": 0.4893560404470463, + "grad_norm": 0.24565623700618744, + "learning_rate": 1.9614249723980806e-07, + "loss": 0.2073, + "step": 1839 + }, + { + "epoch": 0.48962213943587013, + "grad_norm": 0.2591991126537323, + "learning_rate": 1.961378504035991e-07, + "loss": 0.2108, + "step": 1840 + }, + { + "epoch": 0.489888238424694, + "grad_norm": 0.3902721703052521, + "learning_rate": 1.9613320082534325e-07, + "loss": 0.2303, + "step": 1841 + }, + { + "epoch": 0.4901543374135178, + "grad_norm": 0.3104109764099121, + "learning_rate": 1.9612854850517317e-07, + "loss": 0.2486, + "step": 1842 + }, + { + "epoch": 0.4904204364023417, + "grad_norm": 0.31254974007606506, + "learning_rate": 1.9612389344322153e-07, + "loss": 0.2393, + "step": 1843 + }, + { + "epoch": 0.4906865353911655, + "grad_norm": 0.27249565720558167, + "learning_rate": 1.9611923563962117e-07, + "loss": 0.2374, + "step": 1844 + }, + { + "epoch": 0.49095263437998937, + "grad_norm": 0.3624659776687622, + "learning_rate": 1.9611457509450482e-07, + "loss": 0.2197, + "step": 1845 + }, + { + "epoch": 0.4912187333688132, + "grad_norm": 0.2704516053199768, + "learning_rate": 1.961099118080055e-07, + "loss": 0.2222, + "step": 1846 + }, + { + "epoch": 0.491484832357637, + "grad_norm": 0.27170124650001526, + "learning_rate": 1.961052457802562e-07, + "loss": 0.218, + "step": 1847 + }, + { + "epoch": 0.49175093134646086, + "grad_norm": 0.3528658449649811, + "learning_rate": 1.9610057701139e-07, + "loss": 0.2184, + "step": 1848 + }, + { + "epoch": 0.4920170303352847, + "grad_norm": 0.2728787660598755, + "learning_rate": 1.9609590550154002e-07, + "loss": 0.2102, + "step": 1849 + }, + { + "epoch": 0.49228312932410856, + "grad_norm": 0.25112253427505493, + "learning_rate": 1.9609123125083954e-07, + "loss": 0.225, + "step": 1850 + }, + { + "epoch": 0.4925492283129324, + "grad_norm": 0.2711484432220459, + "learning_rate": 1.9608655425942194e-07, + "loss": 0.225, + "step": 1851 + }, + { + "epoch": 0.49281532730175626, + "grad_norm": 0.2632373869419098, + "learning_rate": 1.960818745274205e-07, + "loss": 0.2101, + "step": 1852 + }, + { + "epoch": 0.4930814262905801, + "grad_norm": 0.3617382049560547, + "learning_rate": 1.9607719205496878e-07, + "loss": 0.2133, + "step": 1853 + }, + { + "epoch": 0.49334752527940395, + "grad_norm": 0.459052711725235, + "learning_rate": 1.9607250684220027e-07, + "loss": 0.2355, + "step": 1854 + }, + { + "epoch": 0.4936136242682278, + "grad_norm": 0.29723599553108215, + "learning_rate": 1.9606781888924865e-07, + "loss": 0.2231, + "step": 1855 + }, + { + "epoch": 0.49387972325705165, + "grad_norm": 0.2471708059310913, + "learning_rate": 1.9606312819624765e-07, + "loss": 0.2098, + "step": 1856 + }, + { + "epoch": 0.49414582224587544, + "grad_norm": 0.30987632274627686, + "learning_rate": 1.9605843476333098e-07, + "loss": 0.2102, + "step": 1857 + }, + { + "epoch": 0.4944119212346993, + "grad_norm": 0.265229195356369, + "learning_rate": 1.960537385906326e-07, + "loss": 0.221, + "step": 1858 + }, + { + "epoch": 0.49467802022352314, + "grad_norm": 0.28644663095474243, + "learning_rate": 1.9604903967828635e-07, + "loss": 0.221, + "step": 1859 + }, + { + "epoch": 0.494944119212347, + "grad_norm": 0.25924479961395264, + "learning_rate": 1.9604433802642633e-07, + "loss": 0.2115, + "step": 1860 + }, + { + "epoch": 0.49521021820117084, + "grad_norm": 0.2870248556137085, + "learning_rate": 1.960396336351866e-07, + "loss": 0.2238, + "step": 1861 + }, + { + "epoch": 0.4954763171899947, + "grad_norm": 0.2657662034034729, + "learning_rate": 1.960349265047014e-07, + "loss": 0.2118, + "step": 1862 + }, + { + "epoch": 0.49574241617881853, + "grad_norm": 0.29410091042518616, + "learning_rate": 1.9603021663510494e-07, + "loss": 0.2261, + "step": 1863 + }, + { + "epoch": 0.4960085151676424, + "grad_norm": 0.3518030047416687, + "learning_rate": 1.9602550402653152e-07, + "loss": 0.213, + "step": 1864 + }, + { + "epoch": 0.49627461415646623, + "grad_norm": 0.2724989652633667, + "learning_rate": 1.960207886791156e-07, + "loss": 0.2251, + "step": 1865 + }, + { + "epoch": 0.49654071314529, + "grad_norm": 0.246816948056221, + "learning_rate": 1.960160705929917e-07, + "loss": 0.2117, + "step": 1866 + }, + { + "epoch": 0.4968068121341139, + "grad_norm": 0.26074323058128357, + "learning_rate": 1.960113497682943e-07, + "loss": 0.205, + "step": 1867 + }, + { + "epoch": 0.4970729111229377, + "grad_norm": 0.25791507959365845, + "learning_rate": 1.9600662620515814e-07, + "loss": 0.2143, + "step": 1868 + }, + { + "epoch": 0.49733901011176157, + "grad_norm": 0.2748030722141266, + "learning_rate": 1.960018999037179e-07, + "loss": 0.2239, + "step": 1869 + }, + { + "epoch": 0.4976051091005854, + "grad_norm": 0.26145076751708984, + "learning_rate": 1.9599717086410838e-07, + "loss": 0.2183, + "step": 1870 + }, + { + "epoch": 0.49787120808940927, + "grad_norm": 0.29027625918388367, + "learning_rate": 1.9599243908646446e-07, + "loss": 0.2093, + "step": 1871 + }, + { + "epoch": 0.4981373070782331, + "grad_norm": 0.34020426869392395, + "learning_rate": 1.959877045709211e-07, + "loss": 0.2351, + "step": 1872 + }, + { + "epoch": 0.49840340606705696, + "grad_norm": 0.27210116386413574, + "learning_rate": 1.959829673176134e-07, + "loss": 0.1963, + "step": 1873 + }, + { + "epoch": 0.4986695050558808, + "grad_norm": 0.30328357219696045, + "learning_rate": 1.959782273266764e-07, + "loss": 0.2455, + "step": 1874 + }, + { + "epoch": 0.4989356040447046, + "grad_norm": 0.33116114139556885, + "learning_rate": 1.959734845982453e-07, + "loss": 0.2312, + "step": 1875 + }, + { + "epoch": 0.49920170303352845, + "grad_norm": 0.3336091637611389, + "learning_rate": 1.9596873913245542e-07, + "loss": 0.2131, + "step": 1876 + }, + { + "epoch": 0.4994678020223523, + "grad_norm": 0.3426041901111603, + "learning_rate": 1.9596399092944206e-07, + "loss": 0.2311, + "step": 1877 + }, + { + "epoch": 0.49973390101117615, + "grad_norm": 0.34443986415863037, + "learning_rate": 1.9595923998934068e-07, + "loss": 0.2089, + "step": 1878 + }, + { + "epoch": 0.5, + "grad_norm": 0.2720508277416229, + "learning_rate": 1.959544863122868e-07, + "loss": 0.2205, + "step": 1879 + }, + { + "epoch": 0.5002660989888238, + "grad_norm": 0.2688937187194824, + "learning_rate": 1.9594972989841595e-07, + "loss": 0.2252, + "step": 1880 + }, + { + "epoch": 0.5005321979776477, + "grad_norm": 0.29706844687461853, + "learning_rate": 1.9594497074786383e-07, + "loss": 0.2363, + "step": 1881 + }, + { + "epoch": 0.5007982969664715, + "grad_norm": 0.2524850070476532, + "learning_rate": 1.959402088607662e-07, + "loss": 0.1909, + "step": 1882 + }, + { + "epoch": 0.5010643959552954, + "grad_norm": 0.2583775222301483, + "learning_rate": 1.9593544423725884e-07, + "loss": 0.2025, + "step": 1883 + }, + { + "epoch": 0.5013304949441192, + "grad_norm": 0.29100990295410156, + "learning_rate": 1.9593067687747764e-07, + "loss": 0.2047, + "step": 1884 + }, + { + "epoch": 0.5015965939329431, + "grad_norm": 0.3074449598789215, + "learning_rate": 1.9592590678155863e-07, + "loss": 0.2154, + "step": 1885 + }, + { + "epoch": 0.5018626929217669, + "grad_norm": 0.3404049575328827, + "learning_rate": 1.9592113394963778e-07, + "loss": 0.233, + "step": 1886 + }, + { + "epoch": 0.5021287919105908, + "grad_norm": 0.3393394649028778, + "learning_rate": 1.9591635838185131e-07, + "loss": 0.2261, + "step": 1887 + }, + { + "epoch": 0.5023948908994146, + "grad_norm": 0.2801637649536133, + "learning_rate": 1.9591158007833538e-07, + "loss": 0.2317, + "step": 1888 + }, + { + "epoch": 0.5026609898882384, + "grad_norm": 0.3489146828651428, + "learning_rate": 1.9590679903922628e-07, + "loss": 0.2359, + "step": 1889 + }, + { + "epoch": 0.5029270888770623, + "grad_norm": 0.3976122736930847, + "learning_rate": 1.9590201526466035e-07, + "loss": 0.2345, + "step": 1890 + }, + { + "epoch": 0.5031931878658861, + "grad_norm": 0.28642866015434265, + "learning_rate": 1.958972287547741e-07, + "loss": 0.216, + "step": 1891 + }, + { + "epoch": 0.50345928685471, + "grad_norm": 0.3701094686985016, + "learning_rate": 1.95892439509704e-07, + "loss": 0.226, + "step": 1892 + }, + { + "epoch": 0.5037253858435338, + "grad_norm": 0.29865553975105286, + "learning_rate": 1.9588764752958665e-07, + "loss": 0.2149, + "step": 1893 + }, + { + "epoch": 0.5039914848323577, + "grad_norm": 0.25789162516593933, + "learning_rate": 1.9588285281455876e-07, + "loss": 0.2106, + "step": 1894 + }, + { + "epoch": 0.5042575838211815, + "grad_norm": 0.2904580235481262, + "learning_rate": 1.9587805536475707e-07, + "loss": 0.2136, + "step": 1895 + }, + { + "epoch": 0.5045236828100054, + "grad_norm": 0.36989110708236694, + "learning_rate": 1.9587325518031838e-07, + "loss": 0.2338, + "step": 1896 + }, + { + "epoch": 0.5047897817988292, + "grad_norm": 0.2724355161190033, + "learning_rate": 1.9586845226137967e-07, + "loss": 0.2254, + "step": 1897 + }, + { + "epoch": 0.505055880787653, + "grad_norm": 0.41854482889175415, + "learning_rate": 1.9586364660807786e-07, + "loss": 0.2293, + "step": 1898 + }, + { + "epoch": 0.5053219797764769, + "grad_norm": 0.2751142382621765, + "learning_rate": 1.9585883822055004e-07, + "loss": 0.218, + "step": 1899 + }, + { + "epoch": 0.5055880787653007, + "grad_norm": 0.2974337041378021, + "learning_rate": 1.9585402709893336e-07, + "loss": 0.2329, + "step": 1900 + }, + { + "epoch": 0.5058541777541246, + "grad_norm": 0.3320919871330261, + "learning_rate": 1.9584921324336509e-07, + "loss": 0.2219, + "step": 1901 + }, + { + "epoch": 0.5061202767429483, + "grad_norm": 0.3436225950717926, + "learning_rate": 1.958443966539824e-07, + "loss": 0.2008, + "step": 1902 + }, + { + "epoch": 0.5063863757317723, + "grad_norm": 0.34650304913520813, + "learning_rate": 1.958395773309228e-07, + "loss": 0.2312, + "step": 1903 + }, + { + "epoch": 0.506652474720596, + "grad_norm": 0.32786309719085693, + "learning_rate": 1.958347552743237e-07, + "loss": 0.2169, + "step": 1904 + }, + { + "epoch": 0.50691857370942, + "grad_norm": 0.367701917886734, + "learning_rate": 1.9582993048432262e-07, + "loss": 0.2275, + "step": 1905 + }, + { + "epoch": 0.5071846726982437, + "grad_norm": 0.2826124131679535, + "learning_rate": 1.958251029610572e-07, + "loss": 0.2155, + "step": 1906 + }, + { + "epoch": 0.5074507716870675, + "grad_norm": 0.2791610062122345, + "learning_rate": 1.9582027270466513e-07, + "loss": 0.2192, + "step": 1907 + }, + { + "epoch": 0.5077168706758914, + "grad_norm": 0.2606637477874756, + "learning_rate": 1.9581543971528413e-07, + "loss": 0.2177, + "step": 1908 + }, + { + "epoch": 0.5079829696647152, + "grad_norm": 0.5614771246910095, + "learning_rate": 1.958106039930521e-07, + "loss": 0.2082, + "step": 1909 + }, + { + "epoch": 0.5082490686535391, + "grad_norm": 0.3173034191131592, + "learning_rate": 1.9580576553810695e-07, + "loss": 0.2309, + "step": 1910 + }, + { + "epoch": 0.5085151676423629, + "grad_norm": 0.39768466353416443, + "learning_rate": 1.958009243505867e-07, + "loss": 0.2393, + "step": 1911 + }, + { + "epoch": 0.5087812666311868, + "grad_norm": 0.369731605052948, + "learning_rate": 1.957960804306294e-07, + "loss": 0.2181, + "step": 1912 + }, + { + "epoch": 0.5090473656200106, + "grad_norm": 0.2551455497741699, + "learning_rate": 1.9579123377837318e-07, + "loss": 0.2082, + "step": 1913 + }, + { + "epoch": 0.5093134646088345, + "grad_norm": 0.27540281414985657, + "learning_rate": 1.9578638439395637e-07, + "loss": 0.2032, + "step": 1914 + }, + { + "epoch": 0.5095795635976583, + "grad_norm": 0.2958875298500061, + "learning_rate": 1.9578153227751719e-07, + "loss": 0.2119, + "step": 1915 + }, + { + "epoch": 0.5098456625864821, + "grad_norm": 0.2917386293411255, + "learning_rate": 1.957766774291941e-07, + "loss": 0.2151, + "step": 1916 + }, + { + "epoch": 0.510111761575306, + "grad_norm": 0.2674562335014343, + "learning_rate": 1.957718198491255e-07, + "loss": 0.2229, + "step": 1917 + }, + { + "epoch": 0.5103778605641298, + "grad_norm": 0.25212323665618896, + "learning_rate": 1.9576695953745005e-07, + "loss": 0.218, + "step": 1918 + }, + { + "epoch": 0.5106439595529537, + "grad_norm": 0.4591040313243866, + "learning_rate": 1.9576209649430623e-07, + "loss": 0.2202, + "step": 1919 + }, + { + "epoch": 0.5109100585417775, + "grad_norm": 0.3001951575279236, + "learning_rate": 1.9575723071983288e-07, + "loss": 0.2202, + "step": 1920 + }, + { + "epoch": 0.5111761575306014, + "grad_norm": 0.3587566018104553, + "learning_rate": 1.957523622141687e-07, + "loss": 0.2397, + "step": 1921 + }, + { + "epoch": 0.5114422565194252, + "grad_norm": 0.2853900194168091, + "learning_rate": 1.9574749097745256e-07, + "loss": 0.2352, + "step": 1922 + }, + { + "epoch": 0.5117083555082491, + "grad_norm": 0.28078266978263855, + "learning_rate": 1.957426170098234e-07, + "loss": 0.2079, + "step": 1923 + }, + { + "epoch": 0.5119744544970729, + "grad_norm": 0.37109360098838806, + "learning_rate": 1.9573774031142028e-07, + "loss": 0.2114, + "step": 1924 + }, + { + "epoch": 0.5122405534858967, + "grad_norm": 0.3388523459434509, + "learning_rate": 1.9573286088238224e-07, + "loss": 0.2274, + "step": 1925 + }, + { + "epoch": 0.5125066524747206, + "grad_norm": 0.383124977350235, + "learning_rate": 1.9572797872284848e-07, + "loss": 0.2179, + "step": 1926 + }, + { + "epoch": 0.5127727514635444, + "grad_norm": 0.3233197331428528, + "learning_rate": 1.9572309383295823e-07, + "loss": 0.2397, + "step": 1927 + }, + { + "epoch": 0.5130388504523683, + "grad_norm": 0.252285897731781, + "learning_rate": 1.957182062128508e-07, + "loss": 0.2187, + "step": 1928 + }, + { + "epoch": 0.5133049494411921, + "grad_norm": 0.29408571124076843, + "learning_rate": 1.9571331586266563e-07, + "loss": 0.2313, + "step": 1929 + }, + { + "epoch": 0.513571048430016, + "grad_norm": 0.2610262930393219, + "learning_rate": 1.9570842278254223e-07, + "loss": 0.2088, + "step": 1930 + }, + { + "epoch": 0.5138371474188398, + "grad_norm": 0.26903098821640015, + "learning_rate": 1.957035269726201e-07, + "loss": 0.1941, + "step": 1931 + }, + { + "epoch": 0.5141032464076637, + "grad_norm": 0.2692919075489044, + "learning_rate": 1.956986284330389e-07, + "loss": 0.2132, + "step": 1932 + }, + { + "epoch": 0.5143693453964875, + "grad_norm": 0.2904910445213318, + "learning_rate": 1.9569372716393835e-07, + "loss": 0.2221, + "step": 1933 + }, + { + "epoch": 0.5146354443853113, + "grad_norm": 0.35782983899116516, + "learning_rate": 1.9568882316545822e-07, + "loss": 0.2157, + "step": 1934 + }, + { + "epoch": 0.5149015433741352, + "grad_norm": 0.4189741611480713, + "learning_rate": 1.9568391643773845e-07, + "loss": 0.2349, + "step": 1935 + }, + { + "epoch": 0.515167642362959, + "grad_norm": 0.24697697162628174, + "learning_rate": 1.9567900698091893e-07, + "loss": 0.2094, + "step": 1936 + }, + { + "epoch": 0.5154337413517829, + "grad_norm": 0.2610195577144623, + "learning_rate": 1.9567409479513966e-07, + "loss": 0.2094, + "step": 1937 + }, + { + "epoch": 0.5156998403406067, + "grad_norm": 0.26100537180900574, + "learning_rate": 1.9566917988054085e-07, + "loss": 0.2151, + "step": 1938 + }, + { + "epoch": 0.5159659393294306, + "grad_norm": 0.30312928557395935, + "learning_rate": 1.9566426223726258e-07, + "loss": 0.2078, + "step": 1939 + }, + { + "epoch": 0.5162320383182544, + "grad_norm": 0.25643861293792725, + "learning_rate": 1.9565934186544518e-07, + "loss": 0.207, + "step": 1940 + }, + { + "epoch": 0.5164981373070783, + "grad_norm": 0.39860597252845764, + "learning_rate": 1.9565441876522893e-07, + "loss": 0.2461, + "step": 1941 + }, + { + "epoch": 0.5167642362959021, + "grad_norm": 0.2886042892932892, + "learning_rate": 1.956494929367543e-07, + "loss": 0.1987, + "step": 1942 + }, + { + "epoch": 0.517030335284726, + "grad_norm": 0.2647421360015869, + "learning_rate": 1.9564456438016174e-07, + "loss": 0.2272, + "step": 1943 + }, + { + "epoch": 0.5172964342735498, + "grad_norm": 0.27739447355270386, + "learning_rate": 1.9563963309559187e-07, + "loss": 0.2175, + "step": 1944 + }, + { + "epoch": 0.5175625332623736, + "grad_norm": 0.3342161178588867, + "learning_rate": 1.956346990831853e-07, + "loss": 0.209, + "step": 1945 + }, + { + "epoch": 0.5178286322511975, + "grad_norm": 0.3671925961971283, + "learning_rate": 1.9562976234308278e-07, + "loss": 0.2118, + "step": 1946 + }, + { + "epoch": 0.5180947312400213, + "grad_norm": 0.25019940733909607, + "learning_rate": 1.956248228754251e-07, + "loss": 0.201, + "step": 1947 + }, + { + "epoch": 0.5183608302288452, + "grad_norm": 0.35253071784973145, + "learning_rate": 1.9561988068035316e-07, + "loss": 0.2223, + "step": 1948 + }, + { + "epoch": 0.518626929217669, + "grad_norm": 0.3441518545150757, + "learning_rate": 1.956149357580079e-07, + "loss": 0.2289, + "step": 1949 + }, + { + "epoch": 0.5188930282064929, + "grad_norm": 0.46232423186302185, + "learning_rate": 1.956099881085304e-07, + "loss": 0.2208, + "step": 1950 + }, + { + "epoch": 0.5191591271953166, + "grad_norm": 0.4567544460296631, + "learning_rate": 1.956050377320617e-07, + "loss": 0.241, + "step": 1951 + }, + { + "epoch": 0.5194252261841406, + "grad_norm": 0.3995029330253601, + "learning_rate": 1.956000846287431e-07, + "loss": 0.238, + "step": 1952 + }, + { + "epoch": 0.5196913251729643, + "grad_norm": 0.4256260097026825, + "learning_rate": 1.9559512879871582e-07, + "loss": 0.233, + "step": 1953 + }, + { + "epoch": 0.5199574241617881, + "grad_norm": 0.36816534399986267, + "learning_rate": 1.9559017024212117e-07, + "loss": 0.2317, + "step": 1954 + }, + { + "epoch": 0.520223523150612, + "grad_norm": 0.2829972207546234, + "learning_rate": 1.9558520895910065e-07, + "loss": 0.2017, + "step": 1955 + }, + { + "epoch": 0.5204896221394358, + "grad_norm": 0.33707407116889954, + "learning_rate": 1.9558024494979572e-07, + "loss": 0.217, + "step": 1956 + }, + { + "epoch": 0.5207557211282597, + "grad_norm": 0.4830465614795685, + "learning_rate": 1.9557527821434797e-07, + "loss": 0.2799, + "step": 1957 + }, + { + "epoch": 0.5210218201170835, + "grad_norm": 0.2983303666114807, + "learning_rate": 1.9557030875289905e-07, + "loss": 0.2317, + "step": 1958 + }, + { + "epoch": 0.5212879191059074, + "grad_norm": 0.25783807039260864, + "learning_rate": 1.9556533656559076e-07, + "loss": 0.1932, + "step": 1959 + }, + { + "epoch": 0.5215540180947312, + "grad_norm": 0.3029518127441406, + "learning_rate": 1.9556036165256482e-07, + "loss": 0.2176, + "step": 1960 + }, + { + "epoch": 0.5218201170835551, + "grad_norm": 0.29548144340515137, + "learning_rate": 1.955553840139632e-07, + "loss": 0.239, + "step": 1961 + }, + { + "epoch": 0.5220862160723789, + "grad_norm": 0.2508391737937927, + "learning_rate": 1.9555040364992788e-07, + "loss": 0.1922, + "step": 1962 + }, + { + "epoch": 0.5223523150612027, + "grad_norm": 0.27590760588645935, + "learning_rate": 1.9554542056060085e-07, + "loss": 0.2093, + "step": 1963 + }, + { + "epoch": 0.5226184140500266, + "grad_norm": 0.2949637472629547, + "learning_rate": 1.9554043474612427e-07, + "loss": 0.2093, + "step": 1964 + }, + { + "epoch": 0.5228845130388504, + "grad_norm": 0.28367555141448975, + "learning_rate": 1.9553544620664034e-07, + "loss": 0.2234, + "step": 1965 + }, + { + "epoch": 0.5231506120276743, + "grad_norm": 0.28460565209388733, + "learning_rate": 1.9553045494229137e-07, + "loss": 0.2368, + "step": 1966 + }, + { + "epoch": 0.5234167110164981, + "grad_norm": 0.32011517882347107, + "learning_rate": 1.9552546095321967e-07, + "loss": 0.2262, + "step": 1967 + }, + { + "epoch": 0.523682810005322, + "grad_norm": 0.26976460218429565, + "learning_rate": 1.9552046423956771e-07, + "loss": 0.2091, + "step": 1968 + }, + { + "epoch": 0.5239489089941458, + "grad_norm": 0.2905050814151764, + "learning_rate": 1.9551546480147804e-07, + "loss": 0.2326, + "step": 1969 + }, + { + "epoch": 0.5242150079829697, + "grad_norm": 0.2921866178512573, + "learning_rate": 1.9551046263909318e-07, + "loss": 0.2338, + "step": 1970 + }, + { + "epoch": 0.5244811069717935, + "grad_norm": 0.3418000340461731, + "learning_rate": 1.9550545775255584e-07, + "loss": 0.2284, + "step": 1971 + }, + { + "epoch": 0.5247472059606173, + "grad_norm": 0.2504505515098572, + "learning_rate": 1.9550045014200877e-07, + "loss": 0.2156, + "step": 1972 + }, + { + "epoch": 0.5250133049494412, + "grad_norm": 0.31751683354377747, + "learning_rate": 1.954954398075948e-07, + "loss": 0.2412, + "step": 1973 + }, + { + "epoch": 0.525279403938265, + "grad_norm": 0.35107511281967163, + "learning_rate": 1.9549042674945684e-07, + "loss": 0.2415, + "step": 1974 + }, + { + "epoch": 0.5255455029270889, + "grad_norm": 0.2898761034011841, + "learning_rate": 1.9548541096773785e-07, + "loss": 0.2362, + "step": 1975 + }, + { + "epoch": 0.5258116019159127, + "grad_norm": 0.28425392508506775, + "learning_rate": 1.954803924625809e-07, + "loss": 0.2403, + "step": 1976 + }, + { + "epoch": 0.5260777009047366, + "grad_norm": 0.2942121922969818, + "learning_rate": 1.9547537123412916e-07, + "loss": 0.2339, + "step": 1977 + }, + { + "epoch": 0.5263437998935604, + "grad_norm": 0.37643247842788696, + "learning_rate": 1.9547034728252578e-07, + "loss": 0.2221, + "step": 1978 + }, + { + "epoch": 0.5266098988823843, + "grad_norm": 0.24306875467300415, + "learning_rate": 1.9546532060791412e-07, + "loss": 0.2054, + "step": 1979 + }, + { + "epoch": 0.5268759978712081, + "grad_norm": 0.2938222587108612, + "learning_rate": 1.954602912104375e-07, + "loss": 0.2063, + "step": 1980 + }, + { + "epoch": 0.5271420968600319, + "grad_norm": 0.2694459557533264, + "learning_rate": 1.954552590902394e-07, + "loss": 0.2169, + "step": 1981 + }, + { + "epoch": 0.5274081958488558, + "grad_norm": 0.24559202790260315, + "learning_rate": 1.9545022424746333e-07, + "loss": 0.193, + "step": 1982 + }, + { + "epoch": 0.5276742948376796, + "grad_norm": 0.316913366317749, + "learning_rate": 1.9544518668225293e-07, + "loss": 0.2384, + "step": 1983 + }, + { + "epoch": 0.5279403938265035, + "grad_norm": 0.35321202874183655, + "learning_rate": 1.9544014639475183e-07, + "loss": 0.2156, + "step": 1984 + }, + { + "epoch": 0.5282064928153273, + "grad_norm": 0.4262084364891052, + "learning_rate": 1.954351033851038e-07, + "loss": 0.252, + "step": 1985 + }, + { + "epoch": 0.5284725918041512, + "grad_norm": 0.36126890778541565, + "learning_rate": 1.9543005765345272e-07, + "loss": 0.2471, + "step": 1986 + }, + { + "epoch": 0.528738690792975, + "grad_norm": 0.24576637148857117, + "learning_rate": 1.9542500919994242e-07, + "loss": 0.1887, + "step": 1987 + }, + { + "epoch": 0.5290047897817989, + "grad_norm": 0.3015620708465576, + "learning_rate": 1.9541995802471697e-07, + "loss": 0.2011, + "step": 1988 + }, + { + "epoch": 0.5292708887706227, + "grad_norm": 0.26142561435699463, + "learning_rate": 1.9541490412792043e-07, + "loss": 0.1979, + "step": 1989 + }, + { + "epoch": 0.5295369877594465, + "grad_norm": 0.26798883080482483, + "learning_rate": 1.9540984750969694e-07, + "loss": 0.2051, + "step": 1990 + }, + { + "epoch": 0.5298030867482704, + "grad_norm": 0.46308913826942444, + "learning_rate": 1.954047881701907e-07, + "loss": 0.2654, + "step": 1991 + }, + { + "epoch": 0.5300691857370942, + "grad_norm": 0.2825559675693512, + "learning_rate": 1.95399726109546e-07, + "loss": 0.2292, + "step": 1992 + }, + { + "epoch": 0.5303352847259181, + "grad_norm": 0.35762903094291687, + "learning_rate": 1.9539466132790728e-07, + "loss": 0.2259, + "step": 1993 + }, + { + "epoch": 0.5306013837147419, + "grad_norm": 0.35896673798561096, + "learning_rate": 1.9538959382541898e-07, + "loss": 0.2462, + "step": 1994 + }, + { + "epoch": 0.5308674827035658, + "grad_norm": 0.2671605050563812, + "learning_rate": 1.9538452360222562e-07, + "loss": 0.2032, + "step": 1995 + }, + { + "epoch": 0.5311335816923896, + "grad_norm": 0.26914170384407043, + "learning_rate": 1.9537945065847178e-07, + "loss": 0.2113, + "step": 1996 + }, + { + "epoch": 0.5313996806812135, + "grad_norm": 0.24149976670742035, + "learning_rate": 1.9537437499430222e-07, + "loss": 0.2025, + "step": 1997 + }, + { + "epoch": 0.5316657796700373, + "grad_norm": 0.3376019597053528, + "learning_rate": 1.9536929660986168e-07, + "loss": 0.2163, + "step": 1998 + }, + { + "epoch": 0.531931878658861, + "grad_norm": 0.2910080850124359, + "learning_rate": 1.9536421550529498e-07, + "loss": 0.2238, + "step": 1999 + }, + { + "epoch": 0.532197977647685, + "grad_norm": 0.3609348237514496, + "learning_rate": 1.9535913168074706e-07, + "loss": 0.2256, + "step": 2000 + }, + { + "epoch": 0.5324640766365087, + "grad_norm": 0.2692658007144928, + "learning_rate": 1.9535404513636296e-07, + "loss": 0.2202, + "step": 2001 + }, + { + "epoch": 0.5327301756253326, + "grad_norm": 0.2664129436016083, + "learning_rate": 1.953489558722877e-07, + "loss": 0.1994, + "step": 2002 + }, + { + "epoch": 0.5329962746141564, + "grad_norm": 0.3198402523994446, + "learning_rate": 1.9534386388866645e-07, + "loss": 0.2291, + "step": 2003 + }, + { + "epoch": 0.5332623736029803, + "grad_norm": 0.3504772484302521, + "learning_rate": 1.9533876918564446e-07, + "loss": 0.2331, + "step": 2004 + }, + { + "epoch": 0.5335284725918041, + "grad_norm": 0.3593769967556, + "learning_rate": 1.9533367176336705e-07, + "loss": 0.2141, + "step": 2005 + }, + { + "epoch": 0.533794571580628, + "grad_norm": 0.35960835218429565, + "learning_rate": 1.953285716219796e-07, + "loss": 0.2448, + "step": 2006 + }, + { + "epoch": 0.5340606705694518, + "grad_norm": 0.545908510684967, + "learning_rate": 1.9532346876162755e-07, + "loss": 0.215, + "step": 2007 + }, + { + "epoch": 0.5343267695582756, + "grad_norm": 0.35643014311790466, + "learning_rate": 1.9531836318245648e-07, + "loss": 0.2106, + "step": 2008 + }, + { + "epoch": 0.5345928685470995, + "grad_norm": 0.3253966271877289, + "learning_rate": 1.9531325488461198e-07, + "loss": 0.2187, + "step": 2009 + }, + { + "epoch": 0.5348589675359233, + "grad_norm": 0.28653591871261597, + "learning_rate": 1.9530814386823975e-07, + "loss": 0.2337, + "step": 2010 + }, + { + "epoch": 0.5351250665247472, + "grad_norm": 0.33107811212539673, + "learning_rate": 1.953030301334856e-07, + "loss": 0.2156, + "step": 2011 + }, + { + "epoch": 0.535391165513571, + "grad_norm": 0.29446423053741455, + "learning_rate": 1.952979136804954e-07, + "loss": 0.2535, + "step": 2012 + }, + { + "epoch": 0.5356572645023949, + "grad_norm": 0.2651841938495636, + "learning_rate": 1.9529279450941498e-07, + "loss": 0.2175, + "step": 2013 + }, + { + "epoch": 0.5359233634912187, + "grad_norm": 0.2929390072822571, + "learning_rate": 1.9528767262039048e-07, + "loss": 0.2335, + "step": 2014 + }, + { + "epoch": 0.5361894624800426, + "grad_norm": 0.30940255522727966, + "learning_rate": 1.952825480135679e-07, + "loss": 0.2354, + "step": 2015 + }, + { + "epoch": 0.5364555614688664, + "grad_norm": 0.4030914902687073, + "learning_rate": 1.9527742068909338e-07, + "loss": 0.2369, + "step": 2016 + }, + { + "epoch": 0.5367216604576902, + "grad_norm": 0.2636895775794983, + "learning_rate": 1.952722906471133e-07, + "loss": 0.2145, + "step": 2017 + }, + { + "epoch": 0.5369877594465141, + "grad_norm": 0.26523324847221375, + "learning_rate": 1.9526715788777381e-07, + "loss": 0.2124, + "step": 2018 + }, + { + "epoch": 0.5372538584353379, + "grad_norm": 0.3318352699279785, + "learning_rate": 1.952620224112214e-07, + "loss": 0.2125, + "step": 2019 + }, + { + "epoch": 0.5375199574241618, + "grad_norm": 0.27587321400642395, + "learning_rate": 1.9525688421760257e-07, + "loss": 0.2209, + "step": 2020 + }, + { + "epoch": 0.5377860564129856, + "grad_norm": 0.26378950476646423, + "learning_rate": 1.9525174330706376e-07, + "loss": 0.2012, + "step": 2021 + }, + { + "epoch": 0.5380521554018095, + "grad_norm": 0.3625640571117401, + "learning_rate": 1.9524659967975175e-07, + "loss": 0.2518, + "step": 2022 + }, + { + "epoch": 0.5383182543906333, + "grad_norm": 0.3545631468296051, + "learning_rate": 1.9524145333581314e-07, + "loss": 0.2359, + "step": 2023 + }, + { + "epoch": 0.5385843533794572, + "grad_norm": 0.2613840401172638, + "learning_rate": 1.952363042753947e-07, + "loss": 0.211, + "step": 2024 + }, + { + "epoch": 0.538850452368281, + "grad_norm": 0.24646534025669098, + "learning_rate": 1.9523115249864338e-07, + "loss": 0.2036, + "step": 2025 + }, + { + "epoch": 0.5391165513571048, + "grad_norm": 0.24149776995182037, + "learning_rate": 1.9522599800570606e-07, + "loss": 0.1989, + "step": 2026 + }, + { + "epoch": 0.5393826503459287, + "grad_norm": 0.25511109828948975, + "learning_rate": 1.9522084079672975e-07, + "loss": 0.2011, + "step": 2027 + }, + { + "epoch": 0.5396487493347525, + "grad_norm": 0.28102004528045654, + "learning_rate": 1.952156808718616e-07, + "loss": 0.2128, + "step": 2028 + }, + { + "epoch": 0.5399148483235764, + "grad_norm": 0.26393118500709534, + "learning_rate": 1.9521051823124872e-07, + "loss": 0.2249, + "step": 2029 + }, + { + "epoch": 0.5401809473124002, + "grad_norm": 0.33141258358955383, + "learning_rate": 1.9520535287503836e-07, + "loss": 0.2073, + "step": 2030 + }, + { + "epoch": 0.5404470463012241, + "grad_norm": 0.24789966642856598, + "learning_rate": 1.952001848033779e-07, + "loss": 0.1994, + "step": 2031 + }, + { + "epoch": 0.5407131452900479, + "grad_norm": 0.4270676374435425, + "learning_rate": 1.9519501401641472e-07, + "loss": 0.2339, + "step": 2032 + }, + { + "epoch": 0.5409792442788718, + "grad_norm": 0.24346718192100525, + "learning_rate": 1.9518984051429627e-07, + "loss": 0.2218, + "step": 2033 + }, + { + "epoch": 0.5412453432676956, + "grad_norm": 0.2592104375362396, + "learning_rate": 1.9518466429717016e-07, + "loss": 0.2238, + "step": 2034 + }, + { + "epoch": 0.5415114422565194, + "grad_norm": 0.3175201416015625, + "learning_rate": 1.9517948536518398e-07, + "loss": 0.2281, + "step": 2035 + }, + { + "epoch": 0.5417775412453433, + "grad_norm": 0.26259803771972656, + "learning_rate": 1.9517430371848547e-07, + "loss": 0.2098, + "step": 2036 + }, + { + "epoch": 0.5420436402341671, + "grad_norm": 0.27043476700782776, + "learning_rate": 1.951691193572224e-07, + "loss": 0.2096, + "step": 2037 + }, + { + "epoch": 0.542309739222991, + "grad_norm": 0.30177387595176697, + "learning_rate": 1.951639322815427e-07, + "loss": 0.2569, + "step": 2038 + }, + { + "epoch": 0.5425758382118148, + "grad_norm": 0.4258653521537781, + "learning_rate": 1.951587424915942e-07, + "loss": 0.2353, + "step": 2039 + }, + { + "epoch": 0.5428419372006387, + "grad_norm": 0.30194535851478577, + "learning_rate": 1.9515354998752502e-07, + "loss": 0.2068, + "step": 2040 + }, + { + "epoch": 0.5431080361894625, + "grad_norm": 0.35675662755966187, + "learning_rate": 1.9514835476948325e-07, + "loss": 0.2178, + "step": 2041 + }, + { + "epoch": 0.5433741351782864, + "grad_norm": 0.5102407336235046, + "learning_rate": 1.9514315683761704e-07, + "loss": 0.2514, + "step": 2042 + }, + { + "epoch": 0.5436402341671102, + "grad_norm": 0.2882722020149231, + "learning_rate": 1.9513795619207466e-07, + "loss": 0.2454, + "step": 2043 + }, + { + "epoch": 0.543906333155934, + "grad_norm": 0.2709808945655823, + "learning_rate": 1.9513275283300445e-07, + "loss": 0.214, + "step": 2044 + }, + { + "epoch": 0.5441724321447579, + "grad_norm": 0.38273391127586365, + "learning_rate": 1.951275467605548e-07, + "loss": 0.2216, + "step": 2045 + }, + { + "epoch": 0.5444385311335816, + "grad_norm": 0.2588343024253845, + "learning_rate": 1.951223379748742e-07, + "loss": 0.2079, + "step": 2046 + }, + { + "epoch": 0.5447046301224056, + "grad_norm": 0.44999244809150696, + "learning_rate": 1.9511712647611123e-07, + "loss": 0.2499, + "step": 2047 + }, + { + "epoch": 0.5449707291112293, + "grad_norm": 0.30882179737091064, + "learning_rate": 1.9511191226441452e-07, + "loss": 0.2557, + "step": 2048 + }, + { + "epoch": 0.5452368281000533, + "grad_norm": 0.35754653811454773, + "learning_rate": 1.951066953399328e-07, + "loss": 0.2036, + "step": 2049 + }, + { + "epoch": 0.545502927088877, + "grad_norm": 0.2907683849334717, + "learning_rate": 1.9510147570281486e-07, + "loss": 0.2241, + "step": 2050 + }, + { + "epoch": 0.545769026077701, + "grad_norm": 0.2797240614891052, + "learning_rate": 1.950962533532096e-07, + "loss": 0.2126, + "step": 2051 + }, + { + "epoch": 0.5460351250665247, + "grad_norm": 0.2740214169025421, + "learning_rate": 1.9509102829126592e-07, + "loss": 0.1944, + "step": 2052 + }, + { + "epoch": 0.5463012240553486, + "grad_norm": 0.3986384868621826, + "learning_rate": 1.9508580051713288e-07, + "loss": 0.2363, + "step": 2053 + }, + { + "epoch": 0.5465673230441724, + "grad_norm": 0.26359081268310547, + "learning_rate": 1.9508057003095958e-07, + "loss": 0.1962, + "step": 2054 + }, + { + "epoch": 0.5468334220329962, + "grad_norm": 0.27620095014572144, + "learning_rate": 1.9507533683289522e-07, + "loss": 0.2254, + "step": 2055 + }, + { + "epoch": 0.5470995210218201, + "grad_norm": 0.25918877124786377, + "learning_rate": 1.9507010092308907e-07, + "loss": 0.2059, + "step": 2056 + }, + { + "epoch": 0.5473656200106439, + "grad_norm": 0.282795786857605, + "learning_rate": 1.9506486230169043e-07, + "loss": 0.2191, + "step": 2057 + }, + { + "epoch": 0.5476317189994678, + "grad_norm": 0.2690778970718384, + "learning_rate": 1.9505962096884875e-07, + "loss": 0.1926, + "step": 2058 + }, + { + "epoch": 0.5478978179882916, + "grad_norm": 0.2680918276309967, + "learning_rate": 1.9505437692471353e-07, + "loss": 0.2236, + "step": 2059 + }, + { + "epoch": 0.5481639169771155, + "grad_norm": 0.25568443536758423, + "learning_rate": 1.9504913016943426e-07, + "loss": 0.2042, + "step": 2060 + }, + { + "epoch": 0.5484300159659393, + "grad_norm": 0.360633909702301, + "learning_rate": 1.950438807031607e-07, + "loss": 0.2443, + "step": 2061 + }, + { + "epoch": 0.5486961149547632, + "grad_norm": 0.2641468942165375, + "learning_rate": 1.9503862852604248e-07, + "loss": 0.2216, + "step": 2062 + }, + { + "epoch": 0.548962213943587, + "grad_norm": 0.27168917655944824, + "learning_rate": 1.950333736382295e-07, + "loss": 0.217, + "step": 2063 + }, + { + "epoch": 0.5492283129324108, + "grad_norm": 0.2384924292564392, + "learning_rate": 1.9502811603987155e-07, + "loss": 0.2036, + "step": 2064 + }, + { + "epoch": 0.5494944119212347, + "grad_norm": 0.2820121943950653, + "learning_rate": 1.9502285573111862e-07, + "loss": 0.2216, + "step": 2065 + }, + { + "epoch": 0.5497605109100585, + "grad_norm": 0.36828094720840454, + "learning_rate": 1.9501759271212078e-07, + "loss": 0.2225, + "step": 2066 + }, + { + "epoch": 0.5500266098988824, + "grad_norm": 0.29375171661376953, + "learning_rate": 1.9501232698302807e-07, + "loss": 0.2236, + "step": 2067 + }, + { + "epoch": 0.5502927088877062, + "grad_norm": 0.2667753994464874, + "learning_rate": 1.9500705854399076e-07, + "loss": 0.235, + "step": 2068 + }, + { + "epoch": 0.5505588078765301, + "grad_norm": 0.27038007974624634, + "learning_rate": 1.9500178739515904e-07, + "loss": 0.2165, + "step": 2069 + }, + { + "epoch": 0.5508249068653539, + "grad_norm": 0.2980892062187195, + "learning_rate": 1.9499651353668335e-07, + "loss": 0.2388, + "step": 2070 + }, + { + "epoch": 0.5510910058541778, + "grad_norm": 0.41857364773750305, + "learning_rate": 1.9499123696871398e-07, + "loss": 0.2468, + "step": 2071 + }, + { + "epoch": 0.5513571048430016, + "grad_norm": 0.26162245869636536, + "learning_rate": 1.9498595769140155e-07, + "loss": 0.2106, + "step": 2072 + }, + { + "epoch": 0.5516232038318254, + "grad_norm": 0.3840639293193817, + "learning_rate": 1.9498067570489653e-07, + "loss": 0.2418, + "step": 2073 + }, + { + "epoch": 0.5518893028206493, + "grad_norm": 0.3457549214363098, + "learning_rate": 1.9497539100934968e-07, + "loss": 0.2241, + "step": 2074 + }, + { + "epoch": 0.5521554018094731, + "grad_norm": 0.26913386583328247, + "learning_rate": 1.9497010360491166e-07, + "loss": 0.2221, + "step": 2075 + }, + { + "epoch": 0.552421500798297, + "grad_norm": 0.25775453448295593, + "learning_rate": 1.9496481349173331e-07, + "loss": 0.2091, + "step": 2076 + }, + { + "epoch": 0.5526875997871208, + "grad_norm": 0.2643227279186249, + "learning_rate": 1.949595206699655e-07, + "loss": 0.2051, + "step": 2077 + }, + { + "epoch": 0.5529536987759447, + "grad_norm": 0.26169851422309875, + "learning_rate": 1.9495422513975917e-07, + "loss": 0.2164, + "step": 2078 + }, + { + "epoch": 0.5532197977647685, + "grad_norm": 0.4203450083732605, + "learning_rate": 1.949489269012654e-07, + "loss": 0.2274, + "step": 2079 + }, + { + "epoch": 0.5534858967535924, + "grad_norm": 0.2641468644142151, + "learning_rate": 1.9494362595463532e-07, + "loss": 0.2184, + "step": 2080 + }, + { + "epoch": 0.5537519957424162, + "grad_norm": 0.24171435832977295, + "learning_rate": 1.9493832230002005e-07, + "loss": 0.2046, + "step": 2081 + }, + { + "epoch": 0.55401809473124, + "grad_norm": 0.3509100377559662, + "learning_rate": 1.9493301593757091e-07, + "loss": 0.2257, + "step": 2082 + }, + { + "epoch": 0.5542841937200639, + "grad_norm": 0.3694177567958832, + "learning_rate": 1.9492770686743927e-07, + "loss": 0.2263, + "step": 2083 + }, + { + "epoch": 0.5545502927088877, + "grad_norm": 0.26410019397735596, + "learning_rate": 1.9492239508977647e-07, + "loss": 0.2057, + "step": 2084 + }, + { + "epoch": 0.5548163916977116, + "grad_norm": 0.3737354278564453, + "learning_rate": 1.9491708060473414e-07, + "loss": 0.2294, + "step": 2085 + }, + { + "epoch": 0.5550824906865354, + "grad_norm": 0.3502540588378906, + "learning_rate": 1.9491176341246373e-07, + "loss": 0.2379, + "step": 2086 + }, + { + "epoch": 0.5553485896753593, + "grad_norm": 0.2753280997276306, + "learning_rate": 1.94906443513117e-07, + "loss": 0.2169, + "step": 2087 + }, + { + "epoch": 0.5556146886641831, + "grad_norm": 0.3938615918159485, + "learning_rate": 1.9490112090684564e-07, + "loss": 0.2187, + "step": 2088 + }, + { + "epoch": 0.555880787653007, + "grad_norm": 0.27750691771507263, + "learning_rate": 1.9489579559380146e-07, + "loss": 0.2247, + "step": 2089 + }, + { + "epoch": 0.5561468866418308, + "grad_norm": 0.5261476635932922, + "learning_rate": 1.9489046757413632e-07, + "loss": 0.2183, + "step": 2090 + }, + { + "epoch": 0.5564129856306546, + "grad_norm": 0.3130474388599396, + "learning_rate": 1.9488513684800222e-07, + "loss": 0.2198, + "step": 2091 + }, + { + "epoch": 0.5566790846194785, + "grad_norm": 0.3592240512371063, + "learning_rate": 1.9487980341555125e-07, + "loss": 0.2282, + "step": 2092 + }, + { + "epoch": 0.5569451836083023, + "grad_norm": 0.27742016315460205, + "learning_rate": 1.9487446727693544e-07, + "loss": 0.2149, + "step": 2093 + }, + { + "epoch": 0.5572112825971262, + "grad_norm": 0.26690784096717834, + "learning_rate": 1.9486912843230704e-07, + "loss": 0.2205, + "step": 2094 + }, + { + "epoch": 0.55747738158595, + "grad_norm": 0.26583853363990784, + "learning_rate": 1.9486378688181832e-07, + "loss": 0.2114, + "step": 2095 + }, + { + "epoch": 0.5577434805747739, + "grad_norm": 0.2708437740802765, + "learning_rate": 1.9485844262562163e-07, + "loss": 0.2131, + "step": 2096 + }, + { + "epoch": 0.5580095795635976, + "grad_norm": 0.28815439343452454, + "learning_rate": 1.9485309566386938e-07, + "loss": 0.2184, + "step": 2097 + }, + { + "epoch": 0.5582756785524216, + "grad_norm": 0.3659645915031433, + "learning_rate": 1.9484774599671412e-07, + "loss": 0.2264, + "step": 2098 + }, + { + "epoch": 0.5585417775412453, + "grad_norm": 0.2912059426307678, + "learning_rate": 1.9484239362430837e-07, + "loss": 0.2252, + "step": 2099 + }, + { + "epoch": 0.5588078765300691, + "grad_norm": 0.3058759868144989, + "learning_rate": 1.948370385468048e-07, + "loss": 0.2245, + "step": 2100 + }, + { + "epoch": 0.559073975518893, + "grad_norm": 0.2475484162569046, + "learning_rate": 1.9483168076435624e-07, + "loss": 0.1945, + "step": 2101 + }, + { + "epoch": 0.5593400745077168, + "grad_norm": 0.3647507131099701, + "learning_rate": 1.948263202771154e-07, + "loss": 0.2273, + "step": 2102 + }, + { + "epoch": 0.5596061734965407, + "grad_norm": 0.27711182832717896, + "learning_rate": 1.9482095708523523e-07, + "loss": 0.2005, + "step": 2103 + }, + { + "epoch": 0.5598722724853645, + "grad_norm": 0.34057390689849854, + "learning_rate": 1.9481559118886867e-07, + "loss": 0.2284, + "step": 2104 + }, + { + "epoch": 0.5601383714741884, + "grad_norm": 0.28695136308670044, + "learning_rate": 1.9481022258816874e-07, + "loss": 0.197, + "step": 2105 + }, + { + "epoch": 0.5604044704630122, + "grad_norm": 0.27686017751693726, + "learning_rate": 1.9480485128328865e-07, + "loss": 0.1944, + "step": 2106 + }, + { + "epoch": 0.5606705694518361, + "grad_norm": 0.2746678292751312, + "learning_rate": 1.9479947727438153e-07, + "loss": 0.2242, + "step": 2107 + }, + { + "epoch": 0.5609366684406599, + "grad_norm": 0.2595820426940918, + "learning_rate": 1.947941005616007e-07, + "loss": 0.22, + "step": 2108 + }, + { + "epoch": 0.5612027674294837, + "grad_norm": 0.25123143196105957, + "learning_rate": 1.9478872114509942e-07, + "loss": 0.1994, + "step": 2109 + }, + { + "epoch": 0.5614688664183076, + "grad_norm": 0.3663157522678375, + "learning_rate": 1.9478333902503127e-07, + "loss": 0.2105, + "step": 2110 + }, + { + "epoch": 0.5617349654071314, + "grad_norm": 0.26534977555274963, + "learning_rate": 1.9477795420154963e-07, + "loss": 0.2085, + "step": 2111 + }, + { + "epoch": 0.5620010643959553, + "grad_norm": 0.4310172200202942, + "learning_rate": 1.9477256667480818e-07, + "loss": 0.2398, + "step": 2112 + }, + { + "epoch": 0.5622671633847791, + "grad_norm": 0.31038379669189453, + "learning_rate": 1.947671764449605e-07, + "loss": 0.2105, + "step": 2113 + }, + { + "epoch": 0.562533262373603, + "grad_norm": 0.2577454447746277, + "learning_rate": 1.947617835121604e-07, + "loss": 0.201, + "step": 2114 + }, + { + "epoch": 0.5627993613624268, + "grad_norm": 0.6044313311576843, + "learning_rate": 1.9475638787656163e-07, + "loss": 0.2412, + "step": 2115 + }, + { + "epoch": 0.5630654603512507, + "grad_norm": 0.25944983959198, + "learning_rate": 1.9475098953831817e-07, + "loss": 0.2125, + "step": 2116 + }, + { + "epoch": 0.5633315593400745, + "grad_norm": 0.33483052253723145, + "learning_rate": 1.9474558849758392e-07, + "loss": 0.2207, + "step": 2117 + }, + { + "epoch": 0.5635976583288983, + "grad_norm": 0.287599116563797, + "learning_rate": 1.9474018475451298e-07, + "loss": 0.2107, + "step": 2118 + }, + { + "epoch": 0.5638637573177222, + "grad_norm": 0.27099695801734924, + "learning_rate": 1.9473477830925942e-07, + "loss": 0.2037, + "step": 2119 + }, + { + "epoch": 0.564129856306546, + "grad_norm": 0.26561498641967773, + "learning_rate": 1.9472936916197749e-07, + "loss": 0.2071, + "step": 2120 + }, + { + "epoch": 0.5643959552953699, + "grad_norm": 0.30456551909446716, + "learning_rate": 1.9472395731282144e-07, + "loss": 0.2146, + "step": 2121 + }, + { + "epoch": 0.5646620542841937, + "grad_norm": 0.24964193999767303, + "learning_rate": 1.9471854276194565e-07, + "loss": 0.1805, + "step": 2122 + }, + { + "epoch": 0.5649281532730176, + "grad_norm": 0.38300347328186035, + "learning_rate": 1.9471312550950454e-07, + "loss": 0.2493, + "step": 2123 + }, + { + "epoch": 0.5651942522618414, + "grad_norm": 0.35134950280189514, + "learning_rate": 1.9470770555565262e-07, + "loss": 0.2199, + "step": 2124 + }, + { + "epoch": 0.5654603512506653, + "grad_norm": 0.3428250551223755, + "learning_rate": 1.9470228290054447e-07, + "loss": 0.2165, + "step": 2125 + }, + { + "epoch": 0.5657264502394891, + "grad_norm": 0.8164843916893005, + "learning_rate": 1.9469685754433482e-07, + "loss": 0.221, + "step": 2126 + }, + { + "epoch": 0.5659925492283129, + "grad_norm": 0.369472473859787, + "learning_rate": 1.946914294871783e-07, + "loss": 0.22, + "step": 2127 + }, + { + "epoch": 0.5662586482171368, + "grad_norm": 0.3012942373752594, + "learning_rate": 1.946859987292298e-07, + "loss": 0.2069, + "step": 2128 + }, + { + "epoch": 0.5665247472059606, + "grad_norm": 0.22944234311580658, + "learning_rate": 1.9468056527064424e-07, + "loss": 0.1744, + "step": 2129 + }, + { + "epoch": 0.5667908461947845, + "grad_norm": 0.25333982706069946, + "learning_rate": 1.946751291115765e-07, + "loss": 0.2031, + "step": 2130 + }, + { + "epoch": 0.5670569451836083, + "grad_norm": 0.2657730281352997, + "learning_rate": 1.9466969025218174e-07, + "loss": 0.2028, + "step": 2131 + }, + { + "epoch": 0.5673230441724322, + "grad_norm": 0.28702008724212646, + "learning_rate": 1.9466424869261503e-07, + "loss": 0.2215, + "step": 2132 + }, + { + "epoch": 0.567589143161256, + "grad_norm": 0.26331669092178345, + "learning_rate": 1.9465880443303157e-07, + "loss": 0.197, + "step": 2133 + }, + { + "epoch": 0.5678552421500799, + "grad_norm": 0.3306090235710144, + "learning_rate": 1.9465335747358664e-07, + "loss": 0.2143, + "step": 2134 + }, + { + "epoch": 0.5681213411389037, + "grad_norm": 0.2503424286842346, + "learning_rate": 1.9464790781443564e-07, + "loss": 0.2043, + "step": 2135 + }, + { + "epoch": 0.5683874401277275, + "grad_norm": 0.2715587019920349, + "learning_rate": 1.9464245545573394e-07, + "loss": 0.2246, + "step": 2136 + }, + { + "epoch": 0.5686535391165514, + "grad_norm": 0.2864100933074951, + "learning_rate": 1.946370003976371e-07, + "loss": 0.2376, + "step": 2137 + }, + { + "epoch": 0.5689196381053752, + "grad_norm": 0.33814117312431335, + "learning_rate": 1.946315426403007e-07, + "loss": 0.222, + "step": 2138 + }, + { + "epoch": 0.5691857370941991, + "grad_norm": 0.3741440773010254, + "learning_rate": 1.9462608218388042e-07, + "loss": 0.2295, + "step": 2139 + }, + { + "epoch": 0.5694518360830229, + "grad_norm": 0.27103355526924133, + "learning_rate": 1.9462061902853196e-07, + "loss": 0.2113, + "step": 2140 + }, + { + "epoch": 0.5697179350718468, + "grad_norm": 0.26674699783325195, + "learning_rate": 1.9461515317441118e-07, + "loss": 0.215, + "step": 2141 + }, + { + "epoch": 0.5699840340606706, + "grad_norm": 0.2760232985019684, + "learning_rate": 1.9460968462167394e-07, + "loss": 0.2217, + "step": 2142 + }, + { + "epoch": 0.5702501330494945, + "grad_norm": 0.2627291679382324, + "learning_rate": 1.9460421337047627e-07, + "loss": 0.1966, + "step": 2143 + }, + { + "epoch": 0.5705162320383182, + "grad_norm": 0.4099670648574829, + "learning_rate": 1.9459873942097417e-07, + "loss": 0.2235, + "step": 2144 + }, + { + "epoch": 0.570782331027142, + "grad_norm": 0.2893020510673523, + "learning_rate": 1.9459326277332382e-07, + "loss": 0.2309, + "step": 2145 + }, + { + "epoch": 0.571048430015966, + "grad_norm": 0.26890861988067627, + "learning_rate": 1.9458778342768136e-07, + "loss": 0.213, + "step": 2146 + }, + { + "epoch": 0.5713145290047897, + "grad_norm": 0.36991631984710693, + "learning_rate": 1.9458230138420313e-07, + "loss": 0.2573, + "step": 2147 + }, + { + "epoch": 0.5715806279936136, + "grad_norm": 0.26782137155532837, + "learning_rate": 1.9457681664304545e-07, + "loss": 0.2132, + "step": 2148 + }, + { + "epoch": 0.5718467269824374, + "grad_norm": 0.3197719156742096, + "learning_rate": 1.9457132920436478e-07, + "loss": 0.203, + "step": 2149 + }, + { + "epoch": 0.5721128259712613, + "grad_norm": 0.26529183983802795, + "learning_rate": 1.945658390683176e-07, + "loss": 0.2101, + "step": 2150 + }, + { + "epoch": 0.5723789249600851, + "grad_norm": 0.2929004728794098, + "learning_rate": 1.9456034623506054e-07, + "loss": 0.226, + "step": 2151 + }, + { + "epoch": 0.572645023948909, + "grad_norm": 0.38393786549568176, + "learning_rate": 1.9455485070475025e-07, + "loss": 0.2237, + "step": 2152 + }, + { + "epoch": 0.5729111229377328, + "grad_norm": 0.30167725682258606, + "learning_rate": 1.9454935247754347e-07, + "loss": 0.2236, + "step": 2153 + }, + { + "epoch": 0.5731772219265566, + "grad_norm": 0.26454398036003113, + "learning_rate": 1.94543851553597e-07, + "loss": 0.2099, + "step": 2154 + }, + { + "epoch": 0.5734433209153805, + "grad_norm": 0.3275390565395355, + "learning_rate": 1.945383479330678e-07, + "loss": 0.2262, + "step": 2155 + }, + { + "epoch": 0.5737094199042043, + "grad_norm": 0.3338598310947418, + "learning_rate": 1.945328416161128e-07, + "loss": 0.2213, + "step": 2156 + }, + { + "epoch": 0.5739755188930282, + "grad_norm": 0.2731224596500397, + "learning_rate": 1.9452733260288906e-07, + "loss": 0.2142, + "step": 2157 + }, + { + "epoch": 0.574241617881852, + "grad_norm": 0.280146986246109, + "learning_rate": 1.945218208935537e-07, + "loss": 0.2279, + "step": 2158 + }, + { + "epoch": 0.5745077168706759, + "grad_norm": 0.27869004011154175, + "learning_rate": 1.9451630648826393e-07, + "loss": 0.2206, + "step": 2159 + }, + { + "epoch": 0.5747738158594997, + "grad_norm": 0.22585073113441467, + "learning_rate": 1.94510789387177e-07, + "loss": 0.1873, + "step": 2160 + }, + { + "epoch": 0.5750399148483236, + "grad_norm": 0.33409029245376587, + "learning_rate": 1.9450526959045034e-07, + "loss": 0.2243, + "step": 2161 + }, + { + "epoch": 0.5753060138371474, + "grad_norm": 0.28528687357902527, + "learning_rate": 1.9449974709824133e-07, + "loss": 0.2067, + "step": 2162 + }, + { + "epoch": 0.5755721128259713, + "grad_norm": 0.33694157004356384, + "learning_rate": 1.944942219107075e-07, + "loss": 0.2306, + "step": 2163 + }, + { + "epoch": 0.5758382118147951, + "grad_norm": 0.36782583594322205, + "learning_rate": 1.9448869402800642e-07, + "loss": 0.2283, + "step": 2164 + }, + { + "epoch": 0.5761043108036189, + "grad_norm": 0.2936227321624756, + "learning_rate": 1.9448316345029583e-07, + "loss": 0.2127, + "step": 2165 + }, + { + "epoch": 0.5763704097924428, + "grad_norm": 0.26297247409820557, + "learning_rate": 1.9447763017773338e-07, + "loss": 0.2113, + "step": 2166 + }, + { + "epoch": 0.5766365087812666, + "grad_norm": 0.24568887054920197, + "learning_rate": 1.9447209421047693e-07, + "loss": 0.2132, + "step": 2167 + }, + { + "epoch": 0.5769026077700905, + "grad_norm": 0.3062211573123932, + "learning_rate": 1.944665555486844e-07, + "loss": 0.2152, + "step": 2168 + }, + { + "epoch": 0.5771687067589143, + "grad_norm": 0.5946362614631653, + "learning_rate": 1.9446101419251374e-07, + "loss": 0.2117, + "step": 2169 + }, + { + "epoch": 0.5774348057477382, + "grad_norm": 0.2897554337978363, + "learning_rate": 1.9445547014212298e-07, + "loss": 0.2184, + "step": 2170 + }, + { + "epoch": 0.577700904736562, + "grad_norm": 0.3293672502040863, + "learning_rate": 1.9444992339767028e-07, + "loss": 0.2338, + "step": 2171 + }, + { + "epoch": 0.5779670037253859, + "grad_norm": 0.25732994079589844, + "learning_rate": 1.9444437395931383e-07, + "loss": 0.2053, + "step": 2172 + }, + { + "epoch": 0.5782331027142097, + "grad_norm": 0.4312707483768463, + "learning_rate": 1.944388218272119e-07, + "loss": 0.2392, + "step": 2173 + }, + { + "epoch": 0.5784992017030335, + "grad_norm": 0.2512159049510956, + "learning_rate": 1.9443326700152288e-07, + "loss": 0.2096, + "step": 2174 + }, + { + "epoch": 0.5787653006918574, + "grad_norm": 0.3000927269458771, + "learning_rate": 1.944277094824052e-07, + "loss": 0.2392, + "step": 2175 + }, + { + "epoch": 0.5790313996806812, + "grad_norm": 0.3261195421218872, + "learning_rate": 1.9442214927001735e-07, + "loss": 0.205, + "step": 2176 + }, + { + "epoch": 0.5792974986695051, + "grad_norm": 0.23444247245788574, + "learning_rate": 1.9441658636451792e-07, + "loss": 0.1938, + "step": 2177 + }, + { + "epoch": 0.5795635976583289, + "grad_norm": 0.4418433606624603, + "learning_rate": 1.944110207660656e-07, + "loss": 0.2366, + "step": 2178 + }, + { + "epoch": 0.5798296966471528, + "grad_norm": 0.25144869089126587, + "learning_rate": 1.9440545247481909e-07, + "loss": 0.2184, + "step": 2179 + }, + { + "epoch": 0.5800957956359766, + "grad_norm": 0.33628299832344055, + "learning_rate": 1.9439988149093724e-07, + "loss": 0.2236, + "step": 2180 + }, + { + "epoch": 0.5803618946248005, + "grad_norm": 0.2750104069709778, + "learning_rate": 1.9439430781457892e-07, + "loss": 0.24, + "step": 2181 + }, + { + "epoch": 0.5806279936136243, + "grad_norm": 0.26613378524780273, + "learning_rate": 1.9438873144590317e-07, + "loss": 0.2152, + "step": 2182 + }, + { + "epoch": 0.5808940926024481, + "grad_norm": 0.32516199350357056, + "learning_rate": 1.9438315238506898e-07, + "loss": 0.2287, + "step": 2183 + }, + { + "epoch": 0.581160191591272, + "grad_norm": 0.2715206742286682, + "learning_rate": 1.9437757063223547e-07, + "loss": 0.2203, + "step": 2184 + }, + { + "epoch": 0.5814262905800958, + "grad_norm": 0.2527748942375183, + "learning_rate": 1.9437198618756186e-07, + "loss": 0.1917, + "step": 2185 + }, + { + "epoch": 0.5816923895689197, + "grad_norm": 0.48284047842025757, + "learning_rate": 1.9436639905120743e-07, + "loss": 0.2708, + "step": 2186 + }, + { + "epoch": 0.5819584885577435, + "grad_norm": 0.28976237773895264, + "learning_rate": 1.943608092233316e-07, + "loss": 0.2051, + "step": 2187 + }, + { + "epoch": 0.5822245875465674, + "grad_norm": 0.3585750460624695, + "learning_rate": 1.9435521670409366e-07, + "loss": 0.2383, + "step": 2188 + }, + { + "epoch": 0.5824906865353912, + "grad_norm": 0.2649739980697632, + "learning_rate": 1.9434962149365323e-07, + "loss": 0.2084, + "step": 2189 + }, + { + "epoch": 0.5827567855242151, + "grad_norm": 0.4340634346008301, + "learning_rate": 1.9434402359216985e-07, + "loss": 0.2508, + "step": 2190 + }, + { + "epoch": 0.5830228845130389, + "grad_norm": 0.32115185260772705, + "learning_rate": 1.9433842299980323e-07, + "loss": 0.2075, + "step": 2191 + }, + { + "epoch": 0.5832889835018626, + "grad_norm": 0.25930503010749817, + "learning_rate": 1.9433281971671307e-07, + "loss": 0.224, + "step": 2192 + }, + { + "epoch": 0.5835550824906865, + "grad_norm": 0.26742324233055115, + "learning_rate": 1.943272137430592e-07, + "loss": 0.216, + "step": 2193 + }, + { + "epoch": 0.5838211814795103, + "grad_norm": 0.28376173973083496, + "learning_rate": 1.943216050790015e-07, + "loss": 0.1996, + "step": 2194 + }, + { + "epoch": 0.5840872804683342, + "grad_norm": 0.43054282665252686, + "learning_rate": 1.9431599372469995e-07, + "loss": 0.2474, + "step": 2195 + }, + { + "epoch": 0.584353379457158, + "grad_norm": 0.2771788239479065, + "learning_rate": 1.9431037968031462e-07, + "loss": 0.2316, + "step": 2196 + }, + { + "epoch": 0.5846194784459819, + "grad_norm": 0.4292594790458679, + "learning_rate": 1.943047629460056e-07, + "loss": 0.2306, + "step": 2197 + }, + { + "epoch": 0.5848855774348057, + "grad_norm": 0.40099185705184937, + "learning_rate": 1.9429914352193313e-07, + "loss": 0.2319, + "step": 2198 + }, + { + "epoch": 0.5851516764236296, + "grad_norm": 0.33134758472442627, + "learning_rate": 1.9429352140825743e-07, + "loss": 0.2197, + "step": 2199 + }, + { + "epoch": 0.5854177754124534, + "grad_norm": 0.3390153646469116, + "learning_rate": 1.9428789660513889e-07, + "loss": 0.217, + "step": 2200 + }, + { + "epoch": 0.5856838744012772, + "grad_norm": 0.25654712319374084, + "learning_rate": 1.9428226911273796e-07, + "loss": 0.2232, + "step": 2201 + }, + { + "epoch": 0.5859499733901011, + "grad_norm": 0.44005393981933594, + "learning_rate": 1.942766389312151e-07, + "loss": 0.2344, + "step": 2202 + }, + { + "epoch": 0.5862160723789249, + "grad_norm": 0.2593744993209839, + "learning_rate": 1.9427100606073091e-07, + "loss": 0.2267, + "step": 2203 + }, + { + "epoch": 0.5864821713677488, + "grad_norm": 0.3080717921257019, + "learning_rate": 1.942653705014461e-07, + "loss": 0.2276, + "step": 2204 + }, + { + "epoch": 0.5867482703565726, + "grad_norm": 0.2662399411201477, + "learning_rate": 1.9425973225352133e-07, + "loss": 0.2112, + "step": 2205 + }, + { + "epoch": 0.5870143693453965, + "grad_norm": 0.2661568522453308, + "learning_rate": 1.9425409131711746e-07, + "loss": 0.2056, + "step": 2206 + }, + { + "epoch": 0.5872804683342203, + "grad_norm": 0.2829601466655731, + "learning_rate": 1.942484476923954e-07, + "loss": 0.2073, + "step": 2207 + }, + { + "epoch": 0.5875465673230442, + "grad_norm": 0.27623501420021057, + "learning_rate": 1.9424280137951604e-07, + "loss": 0.2262, + "step": 2208 + }, + { + "epoch": 0.587812666311868, + "grad_norm": 0.322562575340271, + "learning_rate": 1.942371523786405e-07, + "loss": 0.206, + "step": 2209 + }, + { + "epoch": 0.5880787653006918, + "grad_norm": 0.27601128816604614, + "learning_rate": 1.9423150068992988e-07, + "loss": 0.2242, + "step": 2210 + }, + { + "epoch": 0.5883448642895157, + "grad_norm": 0.28806957602500916, + "learning_rate": 1.9422584631354537e-07, + "loss": 0.2159, + "step": 2211 + }, + { + "epoch": 0.5886109632783395, + "grad_norm": 0.3411746919155121, + "learning_rate": 1.9422018924964824e-07, + "loss": 0.2358, + "step": 2212 + }, + { + "epoch": 0.5888770622671634, + "grad_norm": 0.3578915596008301, + "learning_rate": 1.9421452949839986e-07, + "loss": 0.2266, + "step": 2213 + }, + { + "epoch": 0.5891431612559872, + "grad_norm": 0.30111345648765564, + "learning_rate": 1.9420886705996163e-07, + "loss": 0.2344, + "step": 2214 + }, + { + "epoch": 0.5894092602448111, + "grad_norm": 0.25765514373779297, + "learning_rate": 1.9420320193449508e-07, + "loss": 0.2102, + "step": 2215 + }, + { + "epoch": 0.5896753592336349, + "grad_norm": 0.28153881430625916, + "learning_rate": 1.9419753412216177e-07, + "loss": 0.2201, + "step": 2216 + }, + { + "epoch": 0.5899414582224588, + "grad_norm": 0.30883142352104187, + "learning_rate": 1.941918636231234e-07, + "loss": 0.1984, + "step": 2217 + }, + { + "epoch": 0.5902075572112826, + "grad_norm": 0.41162097454071045, + "learning_rate": 1.9418619043754166e-07, + "loss": 0.2435, + "step": 2218 + }, + { + "epoch": 0.5904736562001064, + "grad_norm": 0.2721443176269531, + "learning_rate": 1.9418051456557835e-07, + "loss": 0.213, + "step": 2219 + }, + { + "epoch": 0.5907397551889303, + "grad_norm": 0.273275226354599, + "learning_rate": 1.9417483600739538e-07, + "loss": 0.2109, + "step": 2220 + }, + { + "epoch": 0.5910058541777541, + "grad_norm": 0.30041608214378357, + "learning_rate": 1.9416915476315471e-07, + "loss": 0.2126, + "step": 2221 + }, + { + "epoch": 0.591271953166578, + "grad_norm": 0.27187103033065796, + "learning_rate": 1.9416347083301842e-07, + "loss": 0.2045, + "step": 2222 + }, + { + "epoch": 0.5915380521554018, + "grad_norm": 0.3715590238571167, + "learning_rate": 1.941577842171486e-07, + "loss": 0.2066, + "step": 2223 + }, + { + "epoch": 0.5918041511442257, + "grad_norm": 0.2646235525608063, + "learning_rate": 1.941520949157074e-07, + "loss": 0.214, + "step": 2224 + }, + { + "epoch": 0.5920702501330495, + "grad_norm": 0.37139052152633667, + "learning_rate": 1.9414640292885712e-07, + "loss": 0.2085, + "step": 2225 + }, + { + "epoch": 0.5923363491218734, + "grad_norm": 0.26759564876556396, + "learning_rate": 1.9414070825676014e-07, + "loss": 0.2019, + "step": 2226 + }, + { + "epoch": 0.5926024481106972, + "grad_norm": 0.3500763177871704, + "learning_rate": 1.9413501089957887e-07, + "loss": 0.2366, + "step": 2227 + }, + { + "epoch": 0.592868547099521, + "grad_norm": 0.26172080636024475, + "learning_rate": 1.9412931085747574e-07, + "loss": 0.1926, + "step": 2228 + }, + { + "epoch": 0.5931346460883449, + "grad_norm": 0.32974010705947876, + "learning_rate": 1.9412360813061347e-07, + "loss": 0.212, + "step": 2229 + }, + { + "epoch": 0.5934007450771687, + "grad_norm": 0.27986183762550354, + "learning_rate": 1.9411790271915454e-07, + "loss": 0.2111, + "step": 2230 + }, + { + "epoch": 0.5936668440659926, + "grad_norm": 0.28489866852760315, + "learning_rate": 1.9411219462326182e-07, + "loss": 0.2283, + "step": 2231 + }, + { + "epoch": 0.5939329430548164, + "grad_norm": 0.2992693781852722, + "learning_rate": 1.9410648384309804e-07, + "loss": 0.2175, + "step": 2232 + }, + { + "epoch": 0.5941990420436403, + "grad_norm": 0.27136459946632385, + "learning_rate": 1.9410077037882613e-07, + "loss": 0.2042, + "step": 2233 + }, + { + "epoch": 0.5944651410324641, + "grad_norm": 0.258884996175766, + "learning_rate": 1.94095054230609e-07, + "loss": 0.199, + "step": 2234 + }, + { + "epoch": 0.594731240021288, + "grad_norm": 0.37153008580207825, + "learning_rate": 1.9408933539860974e-07, + "loss": 0.2241, + "step": 2235 + }, + { + "epoch": 0.5949973390101118, + "grad_norm": 0.29739314317703247, + "learning_rate": 1.9408361388299143e-07, + "loss": 0.2207, + "step": 2236 + }, + { + "epoch": 0.5952634379989356, + "grad_norm": 0.3020549714565277, + "learning_rate": 1.9407788968391724e-07, + "loss": 0.2059, + "step": 2237 + }, + { + "epoch": 0.5955295369877595, + "grad_norm": 0.266570121049881, + "learning_rate": 1.940721628015505e-07, + "loss": 0.1924, + "step": 2238 + }, + { + "epoch": 0.5957956359765832, + "grad_norm": 0.27476605772972107, + "learning_rate": 1.940664332360545e-07, + "loss": 0.1993, + "step": 2239 + }, + { + "epoch": 0.5960617349654072, + "grad_norm": 0.3492327332496643, + "learning_rate": 1.9406070098759267e-07, + "loss": 0.2369, + "step": 2240 + }, + { + "epoch": 0.596327833954231, + "grad_norm": 0.4768237769603729, + "learning_rate": 1.940549660563285e-07, + "loss": 0.2152, + "step": 2241 + }, + { + "epoch": 0.5965939329430548, + "grad_norm": 0.38321453332901, + "learning_rate": 1.9404922844242558e-07, + "loss": 0.229, + "step": 2242 + }, + { + "epoch": 0.5968600319318786, + "grad_norm": 0.265163391828537, + "learning_rate": 1.9404348814604755e-07, + "loss": 0.1999, + "step": 2243 + }, + { + "epoch": 0.5971261309207025, + "grad_norm": 0.3023378252983093, + "learning_rate": 1.940377451673581e-07, + "loss": 0.2259, + "step": 2244 + }, + { + "epoch": 0.5973922299095263, + "grad_norm": 0.26402732729911804, + "learning_rate": 1.9403199950652107e-07, + "loss": 0.2201, + "step": 2245 + }, + { + "epoch": 0.5976583288983501, + "grad_norm": 0.25745460391044617, + "learning_rate": 1.9402625116370034e-07, + "loss": 0.2088, + "step": 2246 + }, + { + "epoch": 0.597924427887174, + "grad_norm": 0.3148798942565918, + "learning_rate": 1.940205001390599e-07, + "loss": 0.222, + "step": 2247 + }, + { + "epoch": 0.5981905268759978, + "grad_norm": 0.3805276155471802, + "learning_rate": 1.9401474643276368e-07, + "loss": 0.2268, + "step": 2248 + }, + { + "epoch": 0.5984566258648217, + "grad_norm": 0.25302353501319885, + "learning_rate": 1.9400899004497585e-07, + "loss": 0.2051, + "step": 2249 + }, + { + "epoch": 0.5987227248536455, + "grad_norm": 0.284544438123703, + "learning_rate": 1.9400323097586063e-07, + "loss": 0.2162, + "step": 2250 + }, + { + "epoch": 0.5989888238424694, + "grad_norm": 0.43384888768196106, + "learning_rate": 1.9399746922558223e-07, + "loss": 0.2226, + "step": 2251 + }, + { + "epoch": 0.5992549228312932, + "grad_norm": 0.2625119984149933, + "learning_rate": 1.9399170479430498e-07, + "loss": 0.2165, + "step": 2252 + }, + { + "epoch": 0.5995210218201171, + "grad_norm": 0.3043935298919678, + "learning_rate": 1.9398593768219328e-07, + "loss": 0.2399, + "step": 2253 + }, + { + "epoch": 0.5997871208089409, + "grad_norm": 0.26537463068962097, + "learning_rate": 1.939801678894117e-07, + "loss": 0.2096, + "step": 2254 + }, + { + "epoch": 0.6000532197977647, + "grad_norm": 0.3479596972465515, + "learning_rate": 1.9397439541612473e-07, + "loss": 0.2003, + "step": 2255 + }, + { + "epoch": 0.6003193187865886, + "grad_norm": 0.38517633080482483, + "learning_rate": 1.9396862026249705e-07, + "loss": 0.2178, + "step": 2256 + }, + { + "epoch": 0.6005854177754124, + "grad_norm": 0.28467804193496704, + "learning_rate": 1.9396284242869334e-07, + "loss": 0.192, + "step": 2257 + }, + { + "epoch": 0.6008515167642363, + "grad_norm": 0.3383433222770691, + "learning_rate": 1.9395706191487844e-07, + "loss": 0.224, + "step": 2258 + }, + { + "epoch": 0.6011176157530601, + "grad_norm": 0.4295337200164795, + "learning_rate": 1.939512787212172e-07, + "loss": 0.2402, + "step": 2259 + }, + { + "epoch": 0.601383714741884, + "grad_norm": 0.27161818742752075, + "learning_rate": 1.9394549284787457e-07, + "loss": 0.1962, + "step": 2260 + }, + { + "epoch": 0.6016498137307078, + "grad_norm": 0.3717339336872101, + "learning_rate": 1.9393970429501558e-07, + "loss": 0.2227, + "step": 2261 + }, + { + "epoch": 0.6019159127195317, + "grad_norm": 0.2730735242366791, + "learning_rate": 1.9393391306280535e-07, + "loss": 0.2178, + "step": 2262 + }, + { + "epoch": 0.6021820117083555, + "grad_norm": 0.3602024018764496, + "learning_rate": 1.9392811915140901e-07, + "loss": 0.202, + "step": 2263 + }, + { + "epoch": 0.6024481106971793, + "grad_norm": 0.27585870027542114, + "learning_rate": 1.9392232256099186e-07, + "loss": 0.2021, + "step": 2264 + }, + { + "epoch": 0.6027142096860032, + "grad_norm": 0.3005305528640747, + "learning_rate": 1.9391652329171922e-07, + "loss": 0.2202, + "step": 2265 + }, + { + "epoch": 0.602980308674827, + "grad_norm": 0.26378726959228516, + "learning_rate": 1.9391072134375643e-07, + "loss": 0.2058, + "step": 2266 + }, + { + "epoch": 0.6032464076636509, + "grad_norm": 0.30414649844169617, + "learning_rate": 1.939049167172691e-07, + "loss": 0.2117, + "step": 2267 + }, + { + "epoch": 0.6035125066524747, + "grad_norm": 0.3546912968158722, + "learning_rate": 1.9389910941242268e-07, + "loss": 0.2081, + "step": 2268 + }, + { + "epoch": 0.6037786056412986, + "grad_norm": 0.2913650870323181, + "learning_rate": 1.9389329942938287e-07, + "loss": 0.2317, + "step": 2269 + }, + { + "epoch": 0.6040447046301224, + "grad_norm": 0.42941606044769287, + "learning_rate": 1.9388748676831535e-07, + "loss": 0.2135, + "step": 2270 + }, + { + "epoch": 0.6043108036189463, + "grad_norm": 0.34226545691490173, + "learning_rate": 1.9388167142938592e-07, + "loss": 0.2295, + "step": 2271 + }, + { + "epoch": 0.6045769026077701, + "grad_norm": 0.2613268792629242, + "learning_rate": 1.9387585341276043e-07, + "loss": 0.1997, + "step": 2272 + }, + { + "epoch": 0.604843001596594, + "grad_norm": 0.270082950592041, + "learning_rate": 1.9387003271860484e-07, + "loss": 0.213, + "step": 2273 + }, + { + "epoch": 0.6051091005854178, + "grad_norm": 0.3308953046798706, + "learning_rate": 1.9386420934708518e-07, + "loss": 0.2157, + "step": 2274 + }, + { + "epoch": 0.6053751995742416, + "grad_norm": 0.2682272791862488, + "learning_rate": 1.9385838329836752e-07, + "loss": 0.2186, + "step": 2275 + }, + { + "epoch": 0.6056412985630655, + "grad_norm": 0.2647644877433777, + "learning_rate": 1.9385255457261805e-07, + "loss": 0.2178, + "step": 2276 + }, + { + "epoch": 0.6059073975518893, + "grad_norm": 0.3494519293308258, + "learning_rate": 1.9384672317000298e-07, + "loss": 0.2064, + "step": 2277 + }, + { + "epoch": 0.6061734965407132, + "grad_norm": 0.35312166810035706, + "learning_rate": 1.9384088909068868e-07, + "loss": 0.2138, + "step": 2278 + }, + { + "epoch": 0.606439595529537, + "grad_norm": 0.2788267731666565, + "learning_rate": 1.9383505233484154e-07, + "loss": 0.2038, + "step": 2279 + }, + { + "epoch": 0.6067056945183609, + "grad_norm": 0.3599065840244293, + "learning_rate": 1.93829212902628e-07, + "loss": 0.206, + "step": 2280 + }, + { + "epoch": 0.6069717935071847, + "grad_norm": 0.27506589889526367, + "learning_rate": 1.9382337079421464e-07, + "loss": 0.1967, + "step": 2281 + }, + { + "epoch": 0.6072378924960086, + "grad_norm": 0.31494614481925964, + "learning_rate": 1.9381752600976809e-07, + "loss": 0.218, + "step": 2282 + }, + { + "epoch": 0.6075039914848324, + "grad_norm": 0.28491929173469543, + "learning_rate": 1.9381167854945506e-07, + "loss": 0.2119, + "step": 2283 + }, + { + "epoch": 0.6077700904736562, + "grad_norm": 0.4715430736541748, + "learning_rate": 1.9380582841344233e-07, + "loss": 0.2315, + "step": 2284 + }, + { + "epoch": 0.6080361894624801, + "grad_norm": 0.2523706555366516, + "learning_rate": 1.9379997560189673e-07, + "loss": 0.2035, + "step": 2285 + }, + { + "epoch": 0.6083022884513039, + "grad_norm": 0.2620411813259125, + "learning_rate": 1.9379412011498523e-07, + "loss": 0.2203, + "step": 2286 + }, + { + "epoch": 0.6085683874401278, + "grad_norm": 0.2812654972076416, + "learning_rate": 1.9378826195287483e-07, + "loss": 0.2164, + "step": 2287 + }, + { + "epoch": 0.6088344864289515, + "grad_norm": 0.2775430679321289, + "learning_rate": 1.937824011157326e-07, + "loss": 0.219, + "step": 2288 + }, + { + "epoch": 0.6091005854177755, + "grad_norm": 0.2753515839576721, + "learning_rate": 1.9377653760372571e-07, + "loss": 0.191, + "step": 2289 + }, + { + "epoch": 0.6093666844065992, + "grad_norm": 0.49734625220298767, + "learning_rate": 1.9377067141702144e-07, + "loss": 0.2433, + "step": 2290 + }, + { + "epoch": 0.6096327833954232, + "grad_norm": 0.3954668343067169, + "learning_rate": 1.9376480255578703e-07, + "loss": 0.2162, + "step": 2291 + }, + { + "epoch": 0.6098988823842469, + "grad_norm": 0.2833634614944458, + "learning_rate": 1.9375893102018993e-07, + "loss": 0.2095, + "step": 2292 + }, + { + "epoch": 0.6101649813730707, + "grad_norm": 0.2958487272262573, + "learning_rate": 1.9375305681039757e-07, + "loss": 0.2007, + "step": 2293 + }, + { + "epoch": 0.6104310803618946, + "grad_norm": 0.2714805603027344, + "learning_rate": 1.9374717992657753e-07, + "loss": 0.2193, + "step": 2294 + }, + { + "epoch": 0.6106971793507184, + "grad_norm": 0.3949548900127411, + "learning_rate": 1.9374130036889742e-07, + "loss": 0.2139, + "step": 2295 + }, + { + "epoch": 0.6109632783395423, + "grad_norm": 0.2761614918708801, + "learning_rate": 1.9373541813752495e-07, + "loss": 0.2312, + "step": 2296 + }, + { + "epoch": 0.6112293773283661, + "grad_norm": 0.34912094473838806, + "learning_rate": 1.937295332326278e-07, + "loss": 0.2112, + "step": 2297 + }, + { + "epoch": 0.61149547631719, + "grad_norm": 0.288820743560791, + "learning_rate": 1.9372364565437398e-07, + "loss": 0.2219, + "step": 2298 + }, + { + "epoch": 0.6117615753060138, + "grad_norm": 0.2554434537887573, + "learning_rate": 1.937177554029313e-07, + "loss": 0.2131, + "step": 2299 + }, + { + "epoch": 0.6120276742948377, + "grad_norm": 0.2676529288291931, + "learning_rate": 1.9371186247846777e-07, + "loss": 0.2252, + "step": 2300 + }, + { + "epoch": 0.6122937732836615, + "grad_norm": 0.29944339394569397, + "learning_rate": 1.9370596688115152e-07, + "loss": 0.2167, + "step": 2301 + }, + { + "epoch": 0.6125598722724853, + "grad_norm": 0.33215829730033875, + "learning_rate": 1.9370006861115066e-07, + "loss": 0.2339, + "step": 2302 + }, + { + "epoch": 0.6128259712613092, + "grad_norm": 0.2572324275970459, + "learning_rate": 1.9369416766863345e-07, + "loss": 0.2154, + "step": 2303 + }, + { + "epoch": 0.613092070250133, + "grad_norm": 0.2553844153881073, + "learning_rate": 1.9368826405376818e-07, + "loss": 0.2173, + "step": 2304 + }, + { + "epoch": 0.6133581692389569, + "grad_norm": 0.4024081826210022, + "learning_rate": 1.936823577667232e-07, + "loss": 0.2231, + "step": 2305 + }, + { + "epoch": 0.6136242682277807, + "grad_norm": 0.2753196358680725, + "learning_rate": 1.9367644880766707e-07, + "loss": 0.2241, + "step": 2306 + }, + { + "epoch": 0.6138903672166046, + "grad_norm": 0.28837668895721436, + "learning_rate": 1.9367053717676823e-07, + "loss": 0.2177, + "step": 2307 + }, + { + "epoch": 0.6141564662054284, + "grad_norm": 0.41398730874061584, + "learning_rate": 1.936646228741953e-07, + "loss": 0.2391, + "step": 2308 + }, + { + "epoch": 0.6144225651942523, + "grad_norm": 0.2948974072933197, + "learning_rate": 1.93658705900117e-07, + "loss": 0.2181, + "step": 2309 + }, + { + "epoch": 0.6146886641830761, + "grad_norm": 0.4310910999774933, + "learning_rate": 1.936527862547021e-07, + "loss": 0.216, + "step": 2310 + }, + { + "epoch": 0.6149547631718999, + "grad_norm": 0.27883195877075195, + "learning_rate": 1.9364686393811942e-07, + "loss": 0.2375, + "step": 2311 + }, + { + "epoch": 0.6152208621607238, + "grad_norm": 3.647634983062744, + "learning_rate": 1.9364093895053785e-07, + "loss": 0.2241, + "step": 2312 + }, + { + "epoch": 0.6154869611495476, + "grad_norm": 0.3102304935455322, + "learning_rate": 1.9363501129212646e-07, + "loss": 0.2318, + "step": 2313 + }, + { + "epoch": 0.6157530601383715, + "grad_norm": 0.2528611123561859, + "learning_rate": 1.9362908096305427e-07, + "loss": 0.2006, + "step": 2314 + }, + { + "epoch": 0.6160191591271953, + "grad_norm": 0.2937268018722534, + "learning_rate": 1.9362314796349039e-07, + "loss": 0.2201, + "step": 2315 + }, + { + "epoch": 0.6162852581160192, + "grad_norm": 0.4561005234718323, + "learning_rate": 1.9361721229360407e-07, + "loss": 0.2434, + "step": 2316 + }, + { + "epoch": 0.616551357104843, + "grad_norm": 0.3274036943912506, + "learning_rate": 1.9361127395356466e-07, + "loss": 0.2057, + "step": 2317 + }, + { + "epoch": 0.6168174560936669, + "grad_norm": 0.37526845932006836, + "learning_rate": 1.9360533294354148e-07, + "loss": 0.2357, + "step": 2318 + }, + { + "epoch": 0.6170835550824907, + "grad_norm": 0.37217849493026733, + "learning_rate": 1.9359938926370398e-07, + "loss": 0.2368, + "step": 2319 + }, + { + "epoch": 0.6173496540713145, + "grad_norm": 0.30995428562164307, + "learning_rate": 1.935934429142217e-07, + "loss": 0.2116, + "step": 2320 + }, + { + "epoch": 0.6176157530601384, + "grad_norm": 0.3529452383518219, + "learning_rate": 1.9358749389526423e-07, + "loss": 0.234, + "step": 2321 + }, + { + "epoch": 0.6178818520489622, + "grad_norm": 0.27886733412742615, + "learning_rate": 1.9358154220700127e-07, + "loss": 0.218, + "step": 2322 + }, + { + "epoch": 0.6181479510377861, + "grad_norm": 0.37334784865379333, + "learning_rate": 1.9357558784960255e-07, + "loss": 0.2091, + "step": 2323 + }, + { + "epoch": 0.6184140500266099, + "grad_norm": 0.29938921332359314, + "learning_rate": 1.9356963082323793e-07, + "loss": 0.2185, + "step": 2324 + }, + { + "epoch": 0.6186801490154338, + "grad_norm": 0.25114142894744873, + "learning_rate": 1.9356367112807725e-07, + "loss": 0.2091, + "step": 2325 + }, + { + "epoch": 0.6189462480042576, + "grad_norm": 0.34388303756713867, + "learning_rate": 1.9355770876429055e-07, + "loss": 0.2238, + "step": 2326 + }, + { + "epoch": 0.6192123469930815, + "grad_norm": 0.33369576930999756, + "learning_rate": 1.935517437320479e-07, + "loss": 0.2157, + "step": 2327 + }, + { + "epoch": 0.6194784459819053, + "grad_norm": 0.4379696547985077, + "learning_rate": 1.9354577603151943e-07, + "loss": 0.2183, + "step": 2328 + }, + { + "epoch": 0.6197445449707291, + "grad_norm": 0.30906152725219727, + "learning_rate": 1.9353980566287528e-07, + "loss": 0.2413, + "step": 2329 + }, + { + "epoch": 0.620010643959553, + "grad_norm": 0.2795381247997284, + "learning_rate": 1.9353383262628583e-07, + "loss": 0.2132, + "step": 2330 + }, + { + "epoch": 0.6202767429483768, + "grad_norm": 0.2659834921360016, + "learning_rate": 1.935278569219214e-07, + "loss": 0.1867, + "step": 2331 + }, + { + "epoch": 0.6205428419372007, + "grad_norm": 0.2607542872428894, + "learning_rate": 1.9352187854995243e-07, + "loss": 0.1881, + "step": 2332 + }, + { + "epoch": 0.6208089409260245, + "grad_norm": 0.28338074684143066, + "learning_rate": 1.9351589751054945e-07, + "loss": 0.2107, + "step": 2333 + }, + { + "epoch": 0.6210750399148484, + "grad_norm": 0.32064276933670044, + "learning_rate": 1.93509913803883e-07, + "loss": 0.2046, + "step": 2334 + }, + { + "epoch": 0.6213411389036722, + "grad_norm": 0.2765553593635559, + "learning_rate": 1.9350392743012382e-07, + "loss": 0.2047, + "step": 2335 + }, + { + "epoch": 0.6216072378924961, + "grad_norm": 0.2757579982280731, + "learning_rate": 1.934979383894426e-07, + "loss": 0.2222, + "step": 2336 + }, + { + "epoch": 0.6218733368813198, + "grad_norm": 0.27635979652404785, + "learning_rate": 1.934919466820102e-07, + "loss": 0.2179, + "step": 2337 + }, + { + "epoch": 0.6221394358701436, + "grad_norm": 0.25951385498046875, + "learning_rate": 1.9348595230799746e-07, + "loss": 0.2191, + "step": 2338 + }, + { + "epoch": 0.6224055348589675, + "grad_norm": 0.2600404918193817, + "learning_rate": 1.9347995526757544e-07, + "loss": 0.2134, + "step": 2339 + }, + { + "epoch": 0.6226716338477913, + "grad_norm": 0.3172447085380554, + "learning_rate": 1.9347395556091512e-07, + "loss": 0.2145, + "step": 2340 + }, + { + "epoch": 0.6229377328366152, + "grad_norm": 0.268220454454422, + "learning_rate": 1.9346795318818762e-07, + "loss": 0.2216, + "step": 2341 + }, + { + "epoch": 0.623203831825439, + "grad_norm": 0.253984659910202, + "learning_rate": 1.9346194814956417e-07, + "loss": 0.1877, + "step": 2342 + }, + { + "epoch": 0.6234699308142629, + "grad_norm": 0.27204322814941406, + "learning_rate": 1.9345594044521603e-07, + "loss": 0.2196, + "step": 2343 + }, + { + "epoch": 0.6237360298030867, + "grad_norm": 0.4759868085384369, + "learning_rate": 1.9344993007531456e-07, + "loss": 0.2539, + "step": 2344 + }, + { + "epoch": 0.6240021287919106, + "grad_norm": 0.29028913378715515, + "learning_rate": 1.9344391704003119e-07, + "loss": 0.1986, + "step": 2345 + }, + { + "epoch": 0.6242682277807344, + "grad_norm": 0.40609851479530334, + "learning_rate": 1.9343790133953742e-07, + "loss": 0.2185, + "step": 2346 + }, + { + "epoch": 0.6245343267695582, + "grad_norm": 0.2709527015686035, + "learning_rate": 1.934318829740048e-07, + "loss": 0.1868, + "step": 2347 + }, + { + "epoch": 0.6248004257583821, + "grad_norm": 0.2633790373802185, + "learning_rate": 1.9342586194360505e-07, + "loss": 0.1979, + "step": 2348 + }, + { + "epoch": 0.6250665247472059, + "grad_norm": 0.3811013400554657, + "learning_rate": 1.9341983824850985e-07, + "loss": 0.2138, + "step": 2349 + }, + { + "epoch": 0.6253326237360298, + "grad_norm": 0.31931614875793457, + "learning_rate": 1.9341381188889103e-07, + "loss": 0.2258, + "step": 2350 + }, + { + "epoch": 0.6255987227248536, + "grad_norm": 0.7438996434211731, + "learning_rate": 1.9340778286492047e-07, + "loss": 0.2219, + "step": 2351 + }, + { + "epoch": 0.6258648217136775, + "grad_norm": 0.277605801820755, + "learning_rate": 1.9340175117677014e-07, + "loss": 0.2152, + "step": 2352 + }, + { + "epoch": 0.6261309207025013, + "grad_norm": 0.3210899829864502, + "learning_rate": 1.9339571682461203e-07, + "loss": 0.2339, + "step": 2353 + }, + { + "epoch": 0.6263970196913252, + "grad_norm": 0.2650911211967468, + "learning_rate": 1.9338967980861832e-07, + "loss": 0.2218, + "step": 2354 + }, + { + "epoch": 0.626663118680149, + "grad_norm": 0.25635167956352234, + "learning_rate": 1.9338364012896116e-07, + "loss": 0.2293, + "step": 2355 + }, + { + "epoch": 0.6269292176689728, + "grad_norm": 0.2741609513759613, + "learning_rate": 1.9337759778581281e-07, + "loss": 0.2443, + "step": 2356 + }, + { + "epoch": 0.6271953166577967, + "grad_norm": 0.33030107617378235, + "learning_rate": 1.9337155277934563e-07, + "loss": 0.2235, + "step": 2357 + }, + { + "epoch": 0.6274614156466205, + "grad_norm": 0.3569287657737732, + "learning_rate": 1.9336550510973202e-07, + "loss": 0.2163, + "step": 2358 + }, + { + "epoch": 0.6277275146354444, + "grad_norm": 0.33584868907928467, + "learning_rate": 1.9335945477714448e-07, + "loss": 0.2292, + "step": 2359 + }, + { + "epoch": 0.6279936136242682, + "grad_norm": 0.2707878351211548, + "learning_rate": 1.9335340178175556e-07, + "loss": 0.2215, + "step": 2360 + }, + { + "epoch": 0.6282597126130921, + "grad_norm": 0.2709237337112427, + "learning_rate": 1.9334734612373794e-07, + "loss": 0.2131, + "step": 2361 + }, + { + "epoch": 0.6285258116019159, + "grad_norm": 0.49567288160324097, + "learning_rate": 1.9334128780326429e-07, + "loss": 0.264, + "step": 2362 + }, + { + "epoch": 0.6287919105907398, + "grad_norm": 0.26447227597236633, + "learning_rate": 1.9333522682050745e-07, + "loss": 0.2071, + "step": 2363 + }, + { + "epoch": 0.6290580095795636, + "grad_norm": 0.36185500025749207, + "learning_rate": 1.9332916317564028e-07, + "loss": 0.2146, + "step": 2364 + }, + { + "epoch": 0.6293241085683874, + "grad_norm": 0.33932778239250183, + "learning_rate": 1.933230968688357e-07, + "loss": 0.2114, + "step": 2365 + }, + { + "epoch": 0.6295902075572113, + "grad_norm": 0.34900638461112976, + "learning_rate": 1.9331702790026677e-07, + "loss": 0.2331, + "step": 2366 + }, + { + "epoch": 0.6298563065460351, + "grad_norm": 0.2805887460708618, + "learning_rate": 1.9331095627010655e-07, + "loss": 0.212, + "step": 2367 + }, + { + "epoch": 0.630122405534859, + "grad_norm": 0.27171993255615234, + "learning_rate": 1.933048819785283e-07, + "loss": 0.2153, + "step": 2368 + }, + { + "epoch": 0.6303885045236828, + "grad_norm": 0.26164576411247253, + "learning_rate": 1.9329880502570517e-07, + "loss": 0.2176, + "step": 2369 + }, + { + "epoch": 0.6306546035125067, + "grad_norm": 0.38904860615730286, + "learning_rate": 1.9329272541181055e-07, + "loss": 0.2119, + "step": 2370 + }, + { + "epoch": 0.6309207025013305, + "grad_norm": 0.3816504180431366, + "learning_rate": 1.932866431370178e-07, + "loss": 0.2356, + "step": 2371 + }, + { + "epoch": 0.6311868014901544, + "grad_norm": 0.32194581627845764, + "learning_rate": 1.932805582015004e-07, + "loss": 0.2424, + "step": 2372 + }, + { + "epoch": 0.6314529004789782, + "grad_norm": 0.3376893997192383, + "learning_rate": 1.93274470605432e-07, + "loss": 0.2272, + "step": 2373 + }, + { + "epoch": 0.631718999467802, + "grad_norm": 0.3245176076889038, + "learning_rate": 1.9326838034898608e-07, + "loss": 0.2123, + "step": 2374 + }, + { + "epoch": 0.6319850984566259, + "grad_norm": 0.31030479073524475, + "learning_rate": 1.9326228743233645e-07, + "loss": 0.2148, + "step": 2375 + }, + { + "epoch": 0.6322511974454497, + "grad_norm": 0.3243281841278076, + "learning_rate": 1.9325619185565688e-07, + "loss": 0.2313, + "step": 2376 + }, + { + "epoch": 0.6325172964342736, + "grad_norm": 0.42591285705566406, + "learning_rate": 1.9325009361912116e-07, + "loss": 0.227, + "step": 2377 + }, + { + "epoch": 0.6327833954230974, + "grad_norm": 0.2754235863685608, + "learning_rate": 1.9324399272290332e-07, + "loss": 0.1938, + "step": 2378 + }, + { + "epoch": 0.6330494944119213, + "grad_norm": 0.33361804485321045, + "learning_rate": 1.9323788916717733e-07, + "loss": 0.2144, + "step": 2379 + }, + { + "epoch": 0.6333155934007451, + "grad_norm": 0.2570149004459381, + "learning_rate": 1.9323178295211725e-07, + "loss": 0.2047, + "step": 2380 + }, + { + "epoch": 0.633581692389569, + "grad_norm": 0.27086207270622253, + "learning_rate": 1.932256740778973e-07, + "loss": 0.2005, + "step": 2381 + }, + { + "epoch": 0.6338477913783928, + "grad_norm": 0.27443206310272217, + "learning_rate": 1.9321956254469166e-07, + "loss": 0.2238, + "step": 2382 + }, + { + "epoch": 0.6341138903672167, + "grad_norm": 0.2681194841861725, + "learning_rate": 1.9321344835267467e-07, + "loss": 0.2176, + "step": 2383 + }, + { + "epoch": 0.6343799893560405, + "grad_norm": 0.4095330834388733, + "learning_rate": 1.9320733150202068e-07, + "loss": 0.2033, + "step": 2384 + }, + { + "epoch": 0.6346460883448642, + "grad_norm": 0.2756219208240509, + "learning_rate": 1.9320121199290423e-07, + "loss": 0.1984, + "step": 2385 + }, + { + "epoch": 0.6349121873336881, + "grad_norm": 0.28143176436424255, + "learning_rate": 1.931950898254998e-07, + "loss": 0.2105, + "step": 2386 + }, + { + "epoch": 0.6351782863225119, + "grad_norm": 0.26365017890930176, + "learning_rate": 1.9318896499998203e-07, + "loss": 0.219, + "step": 2387 + }, + { + "epoch": 0.6354443853113358, + "grad_norm": 0.28928396105766296, + "learning_rate": 1.931828375165256e-07, + "loss": 0.2192, + "step": 2388 + }, + { + "epoch": 0.6357104843001596, + "grad_norm": 0.3133062720298767, + "learning_rate": 1.9317670737530532e-07, + "loss": 0.21, + "step": 2389 + }, + { + "epoch": 0.6359765832889835, + "grad_norm": 0.2558647394180298, + "learning_rate": 1.9317057457649596e-07, + "loss": 0.2069, + "step": 2390 + }, + { + "epoch": 0.6362426822778073, + "grad_norm": 0.28125, + "learning_rate": 1.931644391202725e-07, + "loss": 0.2255, + "step": 2391 + }, + { + "epoch": 0.6365087812666312, + "grad_norm": 0.28056442737579346, + "learning_rate": 1.9315830100680995e-07, + "loss": 0.2119, + "step": 2392 + }, + { + "epoch": 0.636774880255455, + "grad_norm": 0.3476063907146454, + "learning_rate": 1.931521602362833e-07, + "loss": 0.2018, + "step": 2393 + }, + { + "epoch": 0.6370409792442788, + "grad_norm": 0.32080399990081787, + "learning_rate": 1.9314601680886777e-07, + "loss": 0.2107, + "step": 2394 + }, + { + "epoch": 0.6373070782331027, + "grad_norm": 0.26207712292671204, + "learning_rate": 1.9313987072473857e-07, + "loss": 0.1998, + "step": 2395 + }, + { + "epoch": 0.6375731772219265, + "grad_norm": 0.2874518036842346, + "learning_rate": 1.9313372198407097e-07, + "loss": 0.2207, + "step": 2396 + }, + { + "epoch": 0.6378392762107504, + "grad_norm": 0.2654260993003845, + "learning_rate": 1.9312757058704035e-07, + "loss": 0.2244, + "step": 2397 + }, + { + "epoch": 0.6381053751995742, + "grad_norm": 0.289747029542923, + "learning_rate": 1.9312141653382219e-07, + "loss": 0.224, + "step": 2398 + }, + { + "epoch": 0.6383714741883981, + "grad_norm": 0.2844523787498474, + "learning_rate": 1.9311525982459198e-07, + "loss": 0.2131, + "step": 2399 + }, + { + "epoch": 0.6386375731772219, + "grad_norm": 0.3179759681224823, + "learning_rate": 1.9310910045952538e-07, + "loss": 0.2089, + "step": 2400 + }, + { + "epoch": 0.6389036721660458, + "grad_norm": 0.46506497263908386, + "learning_rate": 1.9310293843879797e-07, + "loss": 0.2008, + "step": 2401 + }, + { + "epoch": 0.6391697711548696, + "grad_norm": 0.4636223316192627, + "learning_rate": 1.930967737625856e-07, + "loss": 0.1953, + "step": 2402 + }, + { + "epoch": 0.6394358701436934, + "grad_norm": 0.3075079321861267, + "learning_rate": 1.9309060643106406e-07, + "loss": 0.2208, + "step": 2403 + }, + { + "epoch": 0.6397019691325173, + "grad_norm": 0.29361942410469055, + "learning_rate": 1.9308443644440926e-07, + "loss": 0.1943, + "step": 2404 + }, + { + "epoch": 0.6399680681213411, + "grad_norm": 0.27854081988334656, + "learning_rate": 1.9307826380279716e-07, + "loss": 0.2135, + "step": 2405 + }, + { + "epoch": 0.640234167110165, + "grad_norm": 0.27336618304252625, + "learning_rate": 1.9307208850640383e-07, + "loss": 0.2269, + "step": 2406 + }, + { + "epoch": 0.6405002660989888, + "grad_norm": 0.27616047859191895, + "learning_rate": 1.9306591055540541e-07, + "loss": 0.216, + "step": 2407 + }, + { + "epoch": 0.6407663650878127, + "grad_norm": 0.7997803688049316, + "learning_rate": 1.930597299499781e-07, + "loss": 0.2104, + "step": 2408 + }, + { + "epoch": 0.6410324640766365, + "grad_norm": 0.2756381034851074, + "learning_rate": 1.930535466902982e-07, + "loss": 0.2234, + "step": 2409 + }, + { + "epoch": 0.6412985630654604, + "grad_norm": 0.26757943630218506, + "learning_rate": 1.9304736077654205e-07, + "loss": 0.2114, + "step": 2410 + }, + { + "epoch": 0.6415646620542842, + "grad_norm": 0.34802788496017456, + "learning_rate": 1.9304117220888608e-07, + "loss": 0.2185, + "step": 2411 + }, + { + "epoch": 0.641830761043108, + "grad_norm": 0.5653870701789856, + "learning_rate": 1.930349809875068e-07, + "loss": 0.2319, + "step": 2412 + }, + { + "epoch": 0.6420968600319319, + "grad_norm": 0.38331344723701477, + "learning_rate": 1.9302878711258084e-07, + "loss": 0.209, + "step": 2413 + }, + { + "epoch": 0.6423629590207557, + "grad_norm": 0.2680680453777313, + "learning_rate": 1.930225905842848e-07, + "loss": 0.2055, + "step": 2414 + }, + { + "epoch": 0.6426290580095796, + "grad_norm": 0.46921247243881226, + "learning_rate": 1.9301639140279546e-07, + "loss": 0.2431, + "step": 2415 + }, + { + "epoch": 0.6428951569984034, + "grad_norm": 0.2752893567085266, + "learning_rate": 1.9301018956828963e-07, + "loss": 0.2468, + "step": 2416 + }, + { + "epoch": 0.6431612559872273, + "grad_norm": 0.3030901253223419, + "learning_rate": 1.9300398508094416e-07, + "loss": 0.2238, + "step": 2417 + }, + { + "epoch": 0.6434273549760511, + "grad_norm": 0.30657488107681274, + "learning_rate": 1.9299777794093606e-07, + "loss": 0.2249, + "step": 2418 + }, + { + "epoch": 0.643693453964875, + "grad_norm": 0.2789154350757599, + "learning_rate": 1.9299156814844232e-07, + "loss": 0.239, + "step": 2419 + }, + { + "epoch": 0.6439595529536988, + "grad_norm": 0.28824034333229065, + "learning_rate": 1.9298535570364014e-07, + "loss": 0.2045, + "step": 2420 + }, + { + "epoch": 0.6442256519425226, + "grad_norm": 0.325103759765625, + "learning_rate": 1.9297914060670662e-07, + "loss": 0.2347, + "step": 2421 + }, + { + "epoch": 0.6444917509313465, + "grad_norm": 0.2928839325904846, + "learning_rate": 1.9297292285781906e-07, + "loss": 0.218, + "step": 2422 + }, + { + "epoch": 0.6447578499201703, + "grad_norm": 0.26880672574043274, + "learning_rate": 1.9296670245715484e-07, + "loss": 0.206, + "step": 2423 + }, + { + "epoch": 0.6450239489089942, + "grad_norm": 0.2849896550178528, + "learning_rate": 1.9296047940489135e-07, + "loss": 0.2197, + "step": 2424 + }, + { + "epoch": 0.645290047897818, + "grad_norm": 0.2521916627883911, + "learning_rate": 1.9295425370120605e-07, + "loss": 0.2206, + "step": 2425 + }, + { + "epoch": 0.6455561468866419, + "grad_norm": 0.2651546001434326, + "learning_rate": 1.929480253462766e-07, + "loss": 0.2103, + "step": 2426 + }, + { + "epoch": 0.6458222458754657, + "grad_norm": 0.2906102240085602, + "learning_rate": 1.9294179434028056e-07, + "loss": 0.2266, + "step": 2427 + }, + { + "epoch": 0.6460883448642896, + "grad_norm": 0.3804858326911926, + "learning_rate": 1.9293556068339565e-07, + "loss": 0.2299, + "step": 2428 + }, + { + "epoch": 0.6463544438531134, + "grad_norm": 0.2628536522388458, + "learning_rate": 1.9292932437579972e-07, + "loss": 0.1925, + "step": 2429 + }, + { + "epoch": 0.6466205428419372, + "grad_norm": 0.26335999369621277, + "learning_rate": 1.9292308541767063e-07, + "loss": 0.2046, + "step": 2430 + }, + { + "epoch": 0.6468866418307611, + "grad_norm": 0.33111146092414856, + "learning_rate": 1.929168438091863e-07, + "loss": 0.224, + "step": 2431 + }, + { + "epoch": 0.6471527408195848, + "grad_norm": 0.35078591108322144, + "learning_rate": 1.9291059955052472e-07, + "loss": 0.2441, + "step": 2432 + }, + { + "epoch": 0.6474188398084088, + "grad_norm": 0.2774410545825958, + "learning_rate": 1.9290435264186412e-07, + "loss": 0.2247, + "step": 2433 + }, + { + "epoch": 0.6476849387972325, + "grad_norm": 0.2712075412273407, + "learning_rate": 1.9289810308338255e-07, + "loss": 0.2118, + "step": 2434 + }, + { + "epoch": 0.6479510377860564, + "grad_norm": 0.2757715582847595, + "learning_rate": 1.928918508752583e-07, + "loss": 0.2222, + "step": 2435 + }, + { + "epoch": 0.6482171367748802, + "grad_norm": 0.26329243183135986, + "learning_rate": 1.928855960176697e-07, + "loss": 0.1959, + "step": 2436 + }, + { + "epoch": 0.6484832357637041, + "grad_norm": 0.46495112776756287, + "learning_rate": 1.9287933851079517e-07, + "loss": 0.1913, + "step": 2437 + }, + { + "epoch": 0.6487493347525279, + "grad_norm": 0.2587107717990875, + "learning_rate": 1.9287307835481316e-07, + "loss": 0.2168, + "step": 2438 + }, + { + "epoch": 0.6490154337413517, + "grad_norm": 0.28226250410079956, + "learning_rate": 1.928668155499022e-07, + "loss": 0.2251, + "step": 2439 + }, + { + "epoch": 0.6492815327301756, + "grad_norm": 0.25671884417533875, + "learning_rate": 1.9286055009624097e-07, + "loss": 0.1965, + "step": 2440 + }, + { + "epoch": 0.6495476317189994, + "grad_norm": 0.36932918429374695, + "learning_rate": 1.9285428199400815e-07, + "loss": 0.2434, + "step": 2441 + }, + { + "epoch": 0.6498137307078233, + "grad_norm": 0.2646516263484955, + "learning_rate": 1.9284801124338252e-07, + "loss": 0.2064, + "step": 2442 + }, + { + "epoch": 0.6500798296966471, + "grad_norm": 0.43409767746925354, + "learning_rate": 1.928417378445429e-07, + "loss": 0.2522, + "step": 2443 + }, + { + "epoch": 0.650345928685471, + "grad_norm": 0.27683010697364807, + "learning_rate": 1.928354617976683e-07, + "loss": 0.1979, + "step": 2444 + }, + { + "epoch": 0.6506120276742948, + "grad_norm": 0.2714884579181671, + "learning_rate": 1.9282918310293765e-07, + "loss": 0.2065, + "step": 2445 + }, + { + "epoch": 0.6508781266631187, + "grad_norm": 0.29313287138938904, + "learning_rate": 1.9282290176053007e-07, + "loss": 0.183, + "step": 2446 + }, + { + "epoch": 0.6511442256519425, + "grad_norm": 0.3279009461402893, + "learning_rate": 1.928166177706247e-07, + "loss": 0.238, + "step": 2447 + }, + { + "epoch": 0.6514103246407663, + "grad_norm": 0.2967199683189392, + "learning_rate": 1.928103311334008e-07, + "loss": 0.2053, + "step": 2448 + }, + { + "epoch": 0.6516764236295902, + "grad_norm": 0.31423065066337585, + "learning_rate": 1.9280404184903764e-07, + "loss": 0.2138, + "step": 2449 + }, + { + "epoch": 0.651942522618414, + "grad_norm": 0.2763921916484833, + "learning_rate": 1.9279774991771464e-07, + "loss": 0.2096, + "step": 2450 + }, + { + "epoch": 0.6522086216072379, + "grad_norm": 0.40438273549079895, + "learning_rate": 1.927914553396112e-07, + "loss": 0.227, + "step": 2451 + }, + { + "epoch": 0.6524747205960617, + "grad_norm": 0.3855910897254944, + "learning_rate": 1.927851581149069e-07, + "loss": 0.2151, + "step": 2452 + }, + { + "epoch": 0.6527408195848856, + "grad_norm": 0.5056418776512146, + "learning_rate": 1.9277885824378135e-07, + "loss": 0.2569, + "step": 2453 + }, + { + "epoch": 0.6530069185737094, + "grad_norm": 0.29180848598480225, + "learning_rate": 1.9277255572641424e-07, + "loss": 0.2011, + "step": 2454 + }, + { + "epoch": 0.6532730175625333, + "grad_norm": 0.2587116062641144, + "learning_rate": 1.9276625056298532e-07, + "loss": 0.1959, + "step": 2455 + }, + { + "epoch": 0.6535391165513571, + "grad_norm": 0.27443909645080566, + "learning_rate": 1.927599427536744e-07, + "loss": 0.2158, + "step": 2456 + }, + { + "epoch": 0.6538052155401809, + "grad_norm": 0.3115445673465729, + "learning_rate": 1.9275363229866143e-07, + "loss": 0.2242, + "step": 2457 + }, + { + "epoch": 0.6540713145290048, + "grad_norm": 0.27717649936676025, + "learning_rate": 1.927473191981264e-07, + "loss": 0.1936, + "step": 2458 + }, + { + "epoch": 0.6543374135178286, + "grad_norm": 0.26557233929634094, + "learning_rate": 1.9274100345224932e-07, + "loss": 0.1826, + "step": 2459 + }, + { + "epoch": 0.6546035125066525, + "grad_norm": 0.2651815414428711, + "learning_rate": 1.9273468506121037e-07, + "loss": 0.1875, + "step": 2460 + }, + { + "epoch": 0.6548696114954763, + "grad_norm": 0.24628038704395294, + "learning_rate": 1.9272836402518978e-07, + "loss": 0.1943, + "step": 2461 + }, + { + "epoch": 0.6551357104843002, + "grad_norm": 0.41008704900741577, + "learning_rate": 1.927220403443678e-07, + "loss": 0.2185, + "step": 2462 + }, + { + "epoch": 0.655401809473124, + "grad_norm": 0.27133655548095703, + "learning_rate": 1.927157140189248e-07, + "loss": 0.2282, + "step": 2463 + }, + { + "epoch": 0.6556679084619479, + "grad_norm": 0.29830655455589294, + "learning_rate": 1.9270938504904125e-07, + "loss": 0.2112, + "step": 2464 + }, + { + "epoch": 0.6559340074507717, + "grad_norm": 0.383263498544693, + "learning_rate": 1.9270305343489764e-07, + "loss": 0.226, + "step": 2465 + }, + { + "epoch": 0.6562001064395955, + "grad_norm": 0.28348857164382935, + "learning_rate": 1.9269671917667454e-07, + "loss": 0.2129, + "step": 2466 + }, + { + "epoch": 0.6564662054284194, + "grad_norm": 0.26534709334373474, + "learning_rate": 1.9269038227455265e-07, + "loss": 0.2126, + "step": 2467 + }, + { + "epoch": 0.6567323044172432, + "grad_norm": 0.27340129017829895, + "learning_rate": 1.9268404272871272e-07, + "loss": 0.2168, + "step": 2468 + }, + { + "epoch": 0.6569984034060671, + "grad_norm": 0.33214104175567627, + "learning_rate": 1.9267770053933552e-07, + "loss": 0.2264, + "step": 2469 + }, + { + "epoch": 0.6572645023948909, + "grad_norm": 0.26241815090179443, + "learning_rate": 1.92671355706602e-07, + "loss": 0.1983, + "step": 2470 + }, + { + "epoch": 0.6575306013837148, + "grad_norm": 0.24813225865364075, + "learning_rate": 1.9266500823069306e-07, + "loss": 0.1904, + "step": 2471 + }, + { + "epoch": 0.6577967003725386, + "grad_norm": 0.30889880657196045, + "learning_rate": 1.926586581117898e-07, + "loss": 0.2172, + "step": 2472 + }, + { + "epoch": 0.6580627993613625, + "grad_norm": 0.3747035562992096, + "learning_rate": 1.9265230535007336e-07, + "loss": 0.231, + "step": 2473 + }, + { + "epoch": 0.6583288983501863, + "grad_norm": 0.44584283232688904, + "learning_rate": 1.9264594994572482e-07, + "loss": 0.2319, + "step": 2474 + }, + { + "epoch": 0.6585949973390101, + "grad_norm": 0.3325025141239166, + "learning_rate": 1.9263959189892558e-07, + "loss": 0.2094, + "step": 2475 + }, + { + "epoch": 0.658861096327834, + "grad_norm": 0.26219338178634644, + "learning_rate": 1.926332312098569e-07, + "loss": 0.2116, + "step": 2476 + }, + { + "epoch": 0.6591271953166578, + "grad_norm": 0.37791696190834045, + "learning_rate": 1.9262686787870024e-07, + "loss": 0.2145, + "step": 2477 + }, + { + "epoch": 0.6593932943054817, + "grad_norm": 0.34736019372940063, + "learning_rate": 1.9262050190563707e-07, + "loss": 0.2079, + "step": 2478 + }, + { + "epoch": 0.6596593932943055, + "grad_norm": 0.3458625078201294, + "learning_rate": 1.92614133290849e-07, + "loss": 0.2036, + "step": 2479 + }, + { + "epoch": 0.6599254922831294, + "grad_norm": 0.28131619095802307, + "learning_rate": 1.9260776203451758e-07, + "loss": 0.2163, + "step": 2480 + }, + { + "epoch": 0.6601915912719531, + "grad_norm": 0.46135011315345764, + "learning_rate": 1.9260138813682464e-07, + "loss": 0.2227, + "step": 2481 + }, + { + "epoch": 0.660457690260777, + "grad_norm": 0.27208220958709717, + "learning_rate": 1.9259501159795195e-07, + "loss": 0.1891, + "step": 2482 + }, + { + "epoch": 0.6607237892496008, + "grad_norm": 0.3647623658180237, + "learning_rate": 1.9258863241808137e-07, + "loss": 0.2228, + "step": 2483 + }, + { + "epoch": 0.6609898882384246, + "grad_norm": 0.2540648281574249, + "learning_rate": 1.9258225059739482e-07, + "loss": 0.1997, + "step": 2484 + }, + { + "epoch": 0.6612559872272485, + "grad_norm": 0.28017711639404297, + "learning_rate": 1.9257586613607434e-07, + "loss": 0.2025, + "step": 2485 + }, + { + "epoch": 0.6615220862160723, + "grad_norm": 0.4298401176929474, + "learning_rate": 1.9256947903430208e-07, + "loss": 0.2277, + "step": 2486 + }, + { + "epoch": 0.6617881852048962, + "grad_norm": 0.3204290270805359, + "learning_rate": 1.9256308929226012e-07, + "loss": 0.2289, + "step": 2487 + }, + { + "epoch": 0.66205428419372, + "grad_norm": 0.39691588282585144, + "learning_rate": 1.925566969101308e-07, + "loss": 0.2009, + "step": 2488 + }, + { + "epoch": 0.6623203831825439, + "grad_norm": 0.4071376621723175, + "learning_rate": 1.9255030188809636e-07, + "loss": 0.2088, + "step": 2489 + }, + { + "epoch": 0.6625864821713677, + "grad_norm": 0.2930663824081421, + "learning_rate": 1.9254390422633925e-07, + "loss": 0.2297, + "step": 2490 + }, + { + "epoch": 0.6628525811601916, + "grad_norm": 0.34697604179382324, + "learning_rate": 1.9253750392504193e-07, + "loss": 0.2019, + "step": 2491 + }, + { + "epoch": 0.6631186801490154, + "grad_norm": 0.3358358144760132, + "learning_rate": 1.9253110098438697e-07, + "loss": 0.2182, + "step": 2492 + }, + { + "epoch": 0.6633847791378393, + "grad_norm": 0.3954378664493561, + "learning_rate": 1.9252469540455696e-07, + "loss": 0.2373, + "step": 2493 + }, + { + "epoch": 0.6636508781266631, + "grad_norm": 0.26921582221984863, + "learning_rate": 1.9251828718573462e-07, + "loss": 0.2068, + "step": 2494 + }, + { + "epoch": 0.6639169771154869, + "grad_norm": 0.32954141497612, + "learning_rate": 1.9251187632810274e-07, + "loss": 0.1982, + "step": 2495 + }, + { + "epoch": 0.6641830761043108, + "grad_norm": 0.2557165324687958, + "learning_rate": 1.9250546283184414e-07, + "loss": 0.2102, + "step": 2496 + }, + { + "epoch": 0.6644491750931346, + "grad_norm": 0.34678640961647034, + "learning_rate": 1.9249904669714178e-07, + "loss": 0.2154, + "step": 2497 + }, + { + "epoch": 0.6647152740819585, + "grad_norm": 0.378915011882782, + "learning_rate": 1.924926279241786e-07, + "loss": 0.2261, + "step": 2498 + }, + { + "epoch": 0.6649813730707823, + "grad_norm": 0.7033190131187439, + "learning_rate": 1.9248620651313775e-07, + "loss": 0.2152, + "step": 2499 + }, + { + "epoch": 0.6652474720596062, + "grad_norm": 0.28292906284332275, + "learning_rate": 1.9247978246420234e-07, + "loss": 0.2109, + "step": 2500 + }, + { + "epoch": 0.66551357104843, + "grad_norm": 0.2521919310092926, + "learning_rate": 1.924733557775556e-07, + "loss": 0.1953, + "step": 2501 + }, + { + "epoch": 0.6657796700372539, + "grad_norm": 0.33323734998703003, + "learning_rate": 1.9246692645338084e-07, + "loss": 0.219, + "step": 2502 + }, + { + "epoch": 0.6660457690260777, + "grad_norm": 0.3122061789035797, + "learning_rate": 1.9246049449186143e-07, + "loss": 0.2297, + "step": 2503 + }, + { + "epoch": 0.6663118680149015, + "grad_norm": 0.2447386085987091, + "learning_rate": 1.9245405989318085e-07, + "loss": 0.193, + "step": 2504 + }, + { + "epoch": 0.6665779670037254, + "grad_norm": 0.2755383849143982, + "learning_rate": 1.924476226575226e-07, + "loss": 0.2047, + "step": 2505 + }, + { + "epoch": 0.6668440659925492, + "grad_norm": 0.33121150732040405, + "learning_rate": 1.9244118278507026e-07, + "loss": 0.2261, + "step": 2506 + }, + { + "epoch": 0.6671101649813731, + "grad_norm": 0.2638905942440033, + "learning_rate": 1.924347402760076e-07, + "loss": 0.2159, + "step": 2507 + }, + { + "epoch": 0.6673762639701969, + "grad_norm": 0.38790711760520935, + "learning_rate": 1.9242829513051824e-07, + "loss": 0.228, + "step": 2508 + }, + { + "epoch": 0.6676423629590208, + "grad_norm": 0.2655755281448364, + "learning_rate": 1.924218473487861e-07, + "loss": 0.2089, + "step": 2509 + }, + { + "epoch": 0.6679084619478446, + "grad_norm": 0.3679089844226837, + "learning_rate": 1.924153969309951e-07, + "loss": 0.1995, + "step": 2510 + }, + { + "epoch": 0.6681745609366685, + "grad_norm": 0.26723435521125793, + "learning_rate": 1.9240894387732916e-07, + "loss": 0.2129, + "step": 2511 + }, + { + "epoch": 0.6684406599254923, + "grad_norm": 0.2704660892486572, + "learning_rate": 1.9240248818797236e-07, + "loss": 0.2259, + "step": 2512 + }, + { + "epoch": 0.6687067589143161, + "grad_norm": 0.29153090715408325, + "learning_rate": 1.9239602986310878e-07, + "loss": 0.2387, + "step": 2513 + }, + { + "epoch": 0.66897285790314, + "grad_norm": 0.3200059235095978, + "learning_rate": 1.9238956890292273e-07, + "loss": 0.224, + "step": 2514 + }, + { + "epoch": 0.6692389568919638, + "grad_norm": 0.4711012542247772, + "learning_rate": 1.9238310530759842e-07, + "loss": 0.2132, + "step": 2515 + }, + { + "epoch": 0.6695050558807877, + "grad_norm": 0.39573198556900024, + "learning_rate": 1.923766390773202e-07, + "loss": 0.2238, + "step": 2516 + }, + { + "epoch": 0.6697711548696115, + "grad_norm": 0.3106217086315155, + "learning_rate": 1.9237017021227252e-07, + "loss": 0.2145, + "step": 2517 + }, + { + "epoch": 0.6700372538584354, + "grad_norm": 0.26426035165786743, + "learning_rate": 1.923636987126399e-07, + "loss": 0.1932, + "step": 2518 + }, + { + "epoch": 0.6703033528472592, + "grad_norm": 0.2841518521308899, + "learning_rate": 1.9235722457860688e-07, + "loss": 0.2199, + "step": 2519 + }, + { + "epoch": 0.6705694518360831, + "grad_norm": 0.28301751613616943, + "learning_rate": 1.9235074781035815e-07, + "loss": 0.208, + "step": 2520 + }, + { + "epoch": 0.6708355508249069, + "grad_norm": 0.40655070543289185, + "learning_rate": 1.923442684080784e-07, + "loss": 0.232, + "step": 2521 + }, + { + "epoch": 0.6711016498137307, + "grad_norm": 0.2612617611885071, + "learning_rate": 1.9233778637195247e-07, + "loss": 0.2151, + "step": 2522 + }, + { + "epoch": 0.6713677488025546, + "grad_norm": 0.2634410858154297, + "learning_rate": 1.9233130170216526e-07, + "loss": 0.2127, + "step": 2523 + }, + { + "epoch": 0.6716338477913784, + "grad_norm": 0.3096814751625061, + "learning_rate": 1.9232481439890168e-07, + "loss": 0.231, + "step": 2524 + }, + { + "epoch": 0.6718999467802023, + "grad_norm": 0.7124761939048767, + "learning_rate": 1.9231832446234677e-07, + "loss": 0.2021, + "step": 2525 + }, + { + "epoch": 0.672166045769026, + "grad_norm": 0.2624015212059021, + "learning_rate": 1.9231183189268564e-07, + "loss": 0.1948, + "step": 2526 + }, + { + "epoch": 0.67243214475785, + "grad_norm": 0.33212342858314514, + "learning_rate": 1.9230533669010353e-07, + "loss": 0.1949, + "step": 2527 + }, + { + "epoch": 0.6726982437466738, + "grad_norm": 0.2745049297809601, + "learning_rate": 1.922988388547856e-07, + "loss": 0.2068, + "step": 2528 + }, + { + "epoch": 0.6729643427354977, + "grad_norm": 0.29295313358306885, + "learning_rate": 1.9229233838691723e-07, + "loss": 0.2269, + "step": 2529 + }, + { + "epoch": 0.6732304417243214, + "grad_norm": 0.2794772684574127, + "learning_rate": 1.9228583528668382e-07, + "loss": 0.2088, + "step": 2530 + }, + { + "epoch": 0.6734965407131452, + "grad_norm": 0.2658919095993042, + "learning_rate": 1.9227932955427086e-07, + "loss": 0.203, + "step": 2531 + }, + { + "epoch": 0.6737626397019691, + "grad_norm": 0.27536606788635254, + "learning_rate": 1.9227282118986392e-07, + "loss": 0.205, + "step": 2532 + }, + { + "epoch": 0.6740287386907929, + "grad_norm": 0.26154032349586487, + "learning_rate": 1.922663101936486e-07, + "loss": 0.1876, + "step": 2533 + }, + { + "epoch": 0.6742948376796168, + "grad_norm": 0.5094501972198486, + "learning_rate": 1.922597965658106e-07, + "loss": 0.2453, + "step": 2534 + }, + { + "epoch": 0.6745609366684406, + "grad_norm": 0.3638540506362915, + "learning_rate": 1.9225328030653572e-07, + "loss": 0.235, + "step": 2535 + }, + { + "epoch": 0.6748270356572645, + "grad_norm": 0.2648650109767914, + "learning_rate": 1.9224676141600985e-07, + "loss": 0.2197, + "step": 2536 + }, + { + "epoch": 0.6750931346460883, + "grad_norm": 0.38617056608200073, + "learning_rate": 1.9224023989441886e-07, + "loss": 0.2218, + "step": 2537 + }, + { + "epoch": 0.6753592336349122, + "grad_norm": 0.2891090214252472, + "learning_rate": 1.922337157419488e-07, + "loss": 0.2136, + "step": 2538 + }, + { + "epoch": 0.675625332623736, + "grad_norm": 0.3878990113735199, + "learning_rate": 1.9222718895878575e-07, + "loss": 0.2127, + "step": 2539 + }, + { + "epoch": 0.6758914316125598, + "grad_norm": 0.23884688317775726, + "learning_rate": 1.9222065954511583e-07, + "loss": 0.1783, + "step": 2540 + }, + { + "epoch": 0.6761575306013837, + "grad_norm": 0.4025321900844574, + "learning_rate": 1.922141275011253e-07, + "loss": 0.2335, + "step": 2541 + }, + { + "epoch": 0.6764236295902075, + "grad_norm": 0.2784683406352997, + "learning_rate": 1.922075928270005e-07, + "loss": 0.2112, + "step": 2542 + }, + { + "epoch": 0.6766897285790314, + "grad_norm": 0.28760674595832825, + "learning_rate": 1.9220105552292776e-07, + "loss": 0.2001, + "step": 2543 + }, + { + "epoch": 0.6769558275678552, + "grad_norm": 0.26680606603622437, + "learning_rate": 1.9219451558909354e-07, + "loss": 0.2039, + "step": 2544 + }, + { + "epoch": 0.6772219265566791, + "grad_norm": 0.28746840357780457, + "learning_rate": 1.921879730256844e-07, + "loss": 0.2117, + "step": 2545 + }, + { + "epoch": 0.6774880255455029, + "grad_norm": 0.272017240524292, + "learning_rate": 1.9218142783288695e-07, + "loss": 0.2016, + "step": 2546 + }, + { + "epoch": 0.6777541245343268, + "grad_norm": 0.3493981659412384, + "learning_rate": 1.9217488001088781e-07, + "loss": 0.2314, + "step": 2547 + }, + { + "epoch": 0.6780202235231506, + "grad_norm": 0.256773442029953, + "learning_rate": 1.9216832955987383e-07, + "loss": 0.1917, + "step": 2548 + }, + { + "epoch": 0.6782863225119744, + "grad_norm": 0.25982528924942017, + "learning_rate": 1.9216177648003181e-07, + "loss": 0.1938, + "step": 2549 + }, + { + "epoch": 0.6785524215007983, + "grad_norm": 0.27467456459999084, + "learning_rate": 1.9215522077154857e-07, + "loss": 0.2326, + "step": 2550 + }, + { + "epoch": 0.6788185204896221, + "grad_norm": 0.38565757870674133, + "learning_rate": 1.9214866243461123e-07, + "loss": 0.2492, + "step": 2551 + }, + { + "epoch": 0.679084619478446, + "grad_norm": 0.3530331254005432, + "learning_rate": 1.9214210146940676e-07, + "loss": 0.202, + "step": 2552 + }, + { + "epoch": 0.6793507184672698, + "grad_norm": 0.5279794335365295, + "learning_rate": 1.9213553787612232e-07, + "loss": 0.2318, + "step": 2553 + }, + { + "epoch": 0.6796168174560937, + "grad_norm": 0.2624325752258301, + "learning_rate": 1.921289716549451e-07, + "loss": 0.2018, + "step": 2554 + }, + { + "epoch": 0.6798829164449175, + "grad_norm": 0.26121219992637634, + "learning_rate": 1.921224028060624e-07, + "loss": 0.2188, + "step": 2555 + }, + { + "epoch": 0.6801490154337414, + "grad_norm": 0.2663070559501648, + "learning_rate": 1.9211583132966156e-07, + "loss": 0.2037, + "step": 2556 + }, + { + "epoch": 0.6804151144225652, + "grad_norm": 0.2829674184322357, + "learning_rate": 1.9210925722593002e-07, + "loss": 0.2205, + "step": 2557 + }, + { + "epoch": 0.680681213411389, + "grad_norm": 0.25114160776138306, + "learning_rate": 1.921026804950553e-07, + "loss": 0.1946, + "step": 2558 + }, + { + "epoch": 0.6809473124002129, + "grad_norm": 0.3952549695968628, + "learning_rate": 1.9209610113722497e-07, + "loss": 0.2079, + "step": 2559 + }, + { + "epoch": 0.6812134113890367, + "grad_norm": 0.5860555768013, + "learning_rate": 1.920895191526267e-07, + "loss": 0.2175, + "step": 2560 + }, + { + "epoch": 0.6814795103778606, + "grad_norm": 0.27123600244522095, + "learning_rate": 1.9208293454144822e-07, + "loss": 0.2072, + "step": 2561 + }, + { + "epoch": 0.6817456093666844, + "grad_norm": 0.446296364068985, + "learning_rate": 1.920763473038773e-07, + "loss": 0.2108, + "step": 2562 + }, + { + "epoch": 0.6820117083555083, + "grad_norm": 0.42573225498199463, + "learning_rate": 1.9206975744010182e-07, + "loss": 0.223, + "step": 2563 + }, + { + "epoch": 0.6822778073443321, + "grad_norm": 0.3055950403213501, + "learning_rate": 1.920631649503098e-07, + "loss": 0.2054, + "step": 2564 + }, + { + "epoch": 0.682543906333156, + "grad_norm": 0.2577064037322998, + "learning_rate": 1.9205656983468925e-07, + "loss": 0.1988, + "step": 2565 + }, + { + "epoch": 0.6828100053219798, + "grad_norm": 0.7460551261901855, + "learning_rate": 1.9204997209342824e-07, + "loss": 0.2093, + "step": 2566 + }, + { + "epoch": 0.6830761043108036, + "grad_norm": 0.2658613324165344, + "learning_rate": 1.9204337172671495e-07, + "loss": 0.2098, + "step": 2567 + }, + { + "epoch": 0.6833422032996275, + "grad_norm": 0.38510435819625854, + "learning_rate": 1.9203676873473768e-07, + "loss": 0.2177, + "step": 2568 + }, + { + "epoch": 0.6836083022884513, + "grad_norm": 0.28905612230300903, + "learning_rate": 1.9203016311768473e-07, + "loss": 0.1946, + "step": 2569 + }, + { + "epoch": 0.6838744012772752, + "grad_norm": 0.2893888056278229, + "learning_rate": 1.920235548757445e-07, + "loss": 0.2126, + "step": 2570 + }, + { + "epoch": 0.684140500266099, + "grad_norm": 0.31221362948417664, + "learning_rate": 1.9201694400910552e-07, + "loss": 0.2058, + "step": 2571 + }, + { + "epoch": 0.6844065992549229, + "grad_norm": 0.2755693793296814, + "learning_rate": 1.920103305179563e-07, + "loss": 0.2066, + "step": 2572 + }, + { + "epoch": 0.6846726982437467, + "grad_norm": 0.3760436475276947, + "learning_rate": 1.920037144024855e-07, + "loss": 0.219, + "step": 2573 + }, + { + "epoch": 0.6849387972325706, + "grad_norm": 0.315128892660141, + "learning_rate": 1.9199709566288176e-07, + "loss": 0.1986, + "step": 2574 + }, + { + "epoch": 0.6852048962213944, + "grad_norm": 0.2634030282497406, + "learning_rate": 1.919904742993339e-07, + "loss": 0.193, + "step": 2575 + }, + { + "epoch": 0.6854709952102181, + "grad_norm": 0.27988526225090027, + "learning_rate": 1.919838503120308e-07, + "loss": 0.2243, + "step": 2576 + }, + { + "epoch": 0.685737094199042, + "grad_norm": 0.3288469910621643, + "learning_rate": 1.919772237011614e-07, + "loss": 0.2277, + "step": 2577 + }, + { + "epoch": 0.6860031931878658, + "grad_norm": 0.29904839396476746, + "learning_rate": 1.9197059446691463e-07, + "loss": 0.2219, + "step": 2578 + }, + { + "epoch": 0.6862692921766897, + "grad_norm": 0.3960373103618622, + "learning_rate": 1.919639626094796e-07, + "loss": 0.2278, + "step": 2579 + }, + { + "epoch": 0.6865353911655135, + "grad_norm": 0.324092298746109, + "learning_rate": 1.919573281290455e-07, + "loss": 0.222, + "step": 2580 + }, + { + "epoch": 0.6868014901543374, + "grad_norm": 0.3477138876914978, + "learning_rate": 1.9195069102580154e-07, + "loss": 0.2085, + "step": 2581 + }, + { + "epoch": 0.6870675891431612, + "grad_norm": 0.265974223613739, + "learning_rate": 1.9194405129993703e-07, + "loss": 0.2084, + "step": 2582 + }, + { + "epoch": 0.6873336881319851, + "grad_norm": 0.3572200834751129, + "learning_rate": 1.919374089516413e-07, + "loss": 0.2035, + "step": 2583 + }, + { + "epoch": 0.6875997871208089, + "grad_norm": 0.29956483840942383, + "learning_rate": 1.919307639811039e-07, + "loss": 0.2081, + "step": 2584 + }, + { + "epoch": 0.6878658861096327, + "grad_norm": 0.49847087264060974, + "learning_rate": 1.9192411638851426e-07, + "loss": 0.2013, + "step": 2585 + }, + { + "epoch": 0.6881319850984566, + "grad_norm": 0.39859360456466675, + "learning_rate": 1.9191746617406205e-07, + "loss": 0.2282, + "step": 2586 + }, + { + "epoch": 0.6883980840872804, + "grad_norm": 0.3334825932979584, + "learning_rate": 1.9191081333793688e-07, + "loss": 0.2033, + "step": 2587 + }, + { + "epoch": 0.6886641830761043, + "grad_norm": 0.3853002190589905, + "learning_rate": 1.9190415788032855e-07, + "loss": 0.213, + "step": 2588 + }, + { + "epoch": 0.6889302820649281, + "grad_norm": 0.2997151017189026, + "learning_rate": 1.918974998014269e-07, + "loss": 0.2115, + "step": 2589 + }, + { + "epoch": 0.689196381053752, + "grad_norm": 0.3329659402370453, + "learning_rate": 1.918908391014218e-07, + "loss": 0.2213, + "step": 2590 + }, + { + "epoch": 0.6894624800425758, + "grad_norm": 0.29890257120132446, + "learning_rate": 1.918841757805032e-07, + "loss": 0.2302, + "step": 2591 + }, + { + "epoch": 0.6897285790313997, + "grad_norm": 0.31455352902412415, + "learning_rate": 1.9187750983886123e-07, + "loss": 0.2339, + "step": 2592 + }, + { + "epoch": 0.6899946780202235, + "grad_norm": 0.32634177803993225, + "learning_rate": 1.9187084127668598e-07, + "loss": 0.2439, + "step": 2593 + }, + { + "epoch": 0.6902607770090473, + "grad_norm": 0.3055887818336487, + "learning_rate": 1.9186417009416763e-07, + "loss": 0.2064, + "step": 2594 + }, + { + "epoch": 0.6905268759978712, + "grad_norm": 0.4117889702320099, + "learning_rate": 1.9185749629149648e-07, + "loss": 0.2035, + "step": 2595 + }, + { + "epoch": 0.690792974986695, + "grad_norm": 0.386883407831192, + "learning_rate": 1.9185081986886286e-07, + "loss": 0.1975, + "step": 2596 + }, + { + "epoch": 0.6910590739755189, + "grad_norm": 0.269552081823349, + "learning_rate": 1.9184414082645723e-07, + "loss": 0.1954, + "step": 2597 + }, + { + "epoch": 0.6913251729643427, + "grad_norm": 0.2779572606086731, + "learning_rate": 1.9183745916447008e-07, + "loss": 0.2069, + "step": 2598 + }, + { + "epoch": 0.6915912719531666, + "grad_norm": 0.47188833355903625, + "learning_rate": 1.9183077488309193e-07, + "loss": 0.2281, + "step": 2599 + }, + { + "epoch": 0.6918573709419904, + "grad_norm": 0.37933149933815, + "learning_rate": 1.918240879825135e-07, + "loss": 0.2061, + "step": 2600 + }, + { + "epoch": 0.6921234699308143, + "grad_norm": 0.24236994981765747, + "learning_rate": 1.9181739846292547e-07, + "loss": 0.1927, + "step": 2601 + }, + { + "epoch": 0.6923895689196381, + "grad_norm": 0.2581564784049988, + "learning_rate": 1.9181070632451864e-07, + "loss": 0.2044, + "step": 2602 + }, + { + "epoch": 0.692655667908462, + "grad_norm": 0.2646183967590332, + "learning_rate": 1.9180401156748394e-07, + "loss": 0.2079, + "step": 2603 + }, + { + "epoch": 0.6929217668972858, + "grad_norm": 0.2825230062007904, + "learning_rate": 1.917973141920122e-07, + "loss": 0.2104, + "step": 2604 + }, + { + "epoch": 0.6931878658861096, + "grad_norm": 0.43720072507858276, + "learning_rate": 1.917906141982946e-07, + "loss": 0.2445, + "step": 2605 + }, + { + "epoch": 0.6934539648749335, + "grad_norm": 0.2579250931739807, + "learning_rate": 1.9178391158652213e-07, + "loss": 0.2041, + "step": 2606 + }, + { + "epoch": 0.6937200638637573, + "grad_norm": 0.4190925657749176, + "learning_rate": 1.9177720635688597e-07, + "loss": 0.2286, + "step": 2607 + }, + { + "epoch": 0.6939861628525812, + "grad_norm": 0.3779555857181549, + "learning_rate": 1.917704985095774e-07, + "loss": 0.2115, + "step": 2608 + }, + { + "epoch": 0.694252261841405, + "grad_norm": 0.2722090482711792, + "learning_rate": 1.9176378804478773e-07, + "loss": 0.1956, + "step": 2609 + }, + { + "epoch": 0.6945183608302289, + "grad_norm": 0.26447463035583496, + "learning_rate": 1.9175707496270834e-07, + "loss": 0.21, + "step": 2610 + }, + { + "epoch": 0.6947844598190527, + "grad_norm": 0.2610284686088562, + "learning_rate": 1.917503592635307e-07, + "loss": 0.2112, + "step": 2611 + }, + { + "epoch": 0.6950505588078766, + "grad_norm": 0.3195679783821106, + "learning_rate": 1.917436409474464e-07, + "loss": 0.2144, + "step": 2612 + }, + { + "epoch": 0.6953166577967004, + "grad_norm": 0.3859598934650421, + "learning_rate": 1.9173692001464702e-07, + "loss": 0.2222, + "step": 2613 + }, + { + "epoch": 0.6955827567855242, + "grad_norm": 0.2788684070110321, + "learning_rate": 1.9173019646532428e-07, + "loss": 0.2161, + "step": 2614 + }, + { + "epoch": 0.6958488557743481, + "grad_norm": 0.2611372172832489, + "learning_rate": 1.9172347029966987e-07, + "loss": 0.2162, + "step": 2615 + }, + { + "epoch": 0.6961149547631719, + "grad_norm": 0.27302899956703186, + "learning_rate": 1.9171674151787577e-07, + "loss": 0.2036, + "step": 2616 + }, + { + "epoch": 0.6963810537519958, + "grad_norm": 0.28080326318740845, + "learning_rate": 1.9171001012013378e-07, + "loss": 0.2032, + "step": 2617 + }, + { + "epoch": 0.6966471527408196, + "grad_norm": 0.3387605547904968, + "learning_rate": 1.9170327610663594e-07, + "loss": 0.2107, + "step": 2618 + }, + { + "epoch": 0.6969132517296435, + "grad_norm": 0.3472575843334198, + "learning_rate": 1.9169653947757432e-07, + "loss": 0.2044, + "step": 2619 + }, + { + "epoch": 0.6971793507184673, + "grad_norm": 0.3928994834423065, + "learning_rate": 1.9168980023314106e-07, + "loss": 0.2415, + "step": 2620 + }, + { + "epoch": 0.6974454497072912, + "grad_norm": 0.4655194878578186, + "learning_rate": 1.9168305837352836e-07, + "loss": 0.2273, + "step": 2621 + }, + { + "epoch": 0.697711548696115, + "grad_norm": 0.26947373151779175, + "learning_rate": 1.9167631389892856e-07, + "loss": 0.2145, + "step": 2622 + }, + { + "epoch": 0.6979776476849388, + "grad_norm": 0.2955455780029297, + "learning_rate": 1.9166956680953396e-07, + "loss": 0.2101, + "step": 2623 + }, + { + "epoch": 0.6982437466737627, + "grad_norm": 0.28689268231391907, + "learning_rate": 1.9166281710553703e-07, + "loss": 0.2225, + "step": 2624 + }, + { + "epoch": 0.6985098456625864, + "grad_norm": 0.2557941675186157, + "learning_rate": 1.9165606478713033e-07, + "loss": 0.2052, + "step": 2625 + }, + { + "epoch": 0.6987759446514104, + "grad_norm": 0.25712376832962036, + "learning_rate": 1.9164930985450638e-07, + "loss": 0.1954, + "step": 2626 + }, + { + "epoch": 0.6990420436402341, + "grad_norm": 0.30183279514312744, + "learning_rate": 1.916425523078579e-07, + "loss": 0.2203, + "step": 2627 + }, + { + "epoch": 0.699308142629058, + "grad_norm": 0.35616418719291687, + "learning_rate": 1.9163579214737756e-07, + "loss": 0.2168, + "step": 2628 + }, + { + "epoch": 0.6995742416178818, + "grad_norm": 0.2842315435409546, + "learning_rate": 1.9162902937325822e-07, + "loss": 0.2092, + "step": 2629 + }, + { + "epoch": 0.6998403406067057, + "grad_norm": 0.30833959579467773, + "learning_rate": 1.916222639856928e-07, + "loss": 0.211, + "step": 2630 + }, + { + "epoch": 0.7001064395955295, + "grad_norm": 0.27835652232170105, + "learning_rate": 1.916154959848742e-07, + "loss": 0.2081, + "step": 2631 + }, + { + "epoch": 0.7003725385843533, + "grad_norm": 0.3003099262714386, + "learning_rate": 1.916087253709955e-07, + "loss": 0.2157, + "step": 2632 + }, + { + "epoch": 0.7006386375731772, + "grad_norm": 0.2717071771621704, + "learning_rate": 1.9160195214424975e-07, + "loss": 0.1987, + "step": 2633 + }, + { + "epoch": 0.700904736562001, + "grad_norm": 0.312686949968338, + "learning_rate": 1.915951763048302e-07, + "loss": 0.2015, + "step": 2634 + }, + { + "epoch": 0.7011708355508249, + "grad_norm": 0.31147632002830505, + "learning_rate": 1.9158839785293012e-07, + "loss": 0.209, + "step": 2635 + }, + { + "epoch": 0.7014369345396487, + "grad_norm": 0.27095329761505127, + "learning_rate": 1.9158161678874277e-07, + "loss": 0.2131, + "step": 2636 + }, + { + "epoch": 0.7017030335284726, + "grad_norm": 0.2841237187385559, + "learning_rate": 1.9157483311246165e-07, + "loss": 0.2016, + "step": 2637 + }, + { + "epoch": 0.7019691325172964, + "grad_norm": 0.2913610637187958, + "learning_rate": 1.9156804682428018e-07, + "loss": 0.1902, + "step": 2638 + }, + { + "epoch": 0.7022352315061203, + "grad_norm": 0.271907776594162, + "learning_rate": 1.9156125792439197e-07, + "loss": 0.1993, + "step": 2639 + }, + { + "epoch": 0.7025013304949441, + "grad_norm": 0.24796348810195923, + "learning_rate": 1.9155446641299057e-07, + "loss": 0.2098, + "step": 2640 + }, + { + "epoch": 0.7027674294837679, + "grad_norm": 0.29204070568084717, + "learning_rate": 1.915476722902698e-07, + "loss": 0.1948, + "step": 2641 + }, + { + "epoch": 0.7030335284725918, + "grad_norm": 0.3838568925857544, + "learning_rate": 1.9154087555642333e-07, + "loss": 0.2482, + "step": 2642 + }, + { + "epoch": 0.7032996274614156, + "grad_norm": 0.3466247618198395, + "learning_rate": 1.915340762116451e-07, + "loss": 0.2163, + "step": 2643 + }, + { + "epoch": 0.7035657264502395, + "grad_norm": 0.3281455934047699, + "learning_rate": 1.91527274256129e-07, + "loss": 0.2238, + "step": 2644 + }, + { + "epoch": 0.7038318254390633, + "grad_norm": 0.36157599091529846, + "learning_rate": 1.9152046969006905e-07, + "loss": 0.2111, + "step": 2645 + }, + { + "epoch": 0.7040979244278872, + "grad_norm": 0.3423406183719635, + "learning_rate": 1.915136625136593e-07, + "loss": 0.2191, + "step": 2646 + }, + { + "epoch": 0.704364023416711, + "grad_norm": 0.3907645046710968, + "learning_rate": 1.9150685272709394e-07, + "loss": 0.2011, + "step": 2647 + }, + { + "epoch": 0.7046301224055349, + "grad_norm": 0.27307718992233276, + "learning_rate": 1.9150004033056723e-07, + "loss": 0.2003, + "step": 2648 + }, + { + "epoch": 0.7048962213943587, + "grad_norm": 0.40291687846183777, + "learning_rate": 1.914932253242734e-07, + "loss": 0.2003, + "step": 2649 + }, + { + "epoch": 0.7051623203831825, + "grad_norm": 0.32029032707214355, + "learning_rate": 1.9148640770840684e-07, + "loss": 0.1904, + "step": 2650 + }, + { + "epoch": 0.7054284193720064, + "grad_norm": 0.36207860708236694, + "learning_rate": 1.9147958748316207e-07, + "loss": 0.2363, + "step": 2651 + }, + { + "epoch": 0.7056945183608302, + "grad_norm": 0.27255234122276306, + "learning_rate": 1.9147276464873354e-07, + "loss": 0.2253, + "step": 2652 + }, + { + "epoch": 0.7059606173496541, + "grad_norm": 0.3378559350967407, + "learning_rate": 1.914659392053159e-07, + "loss": 0.1853, + "step": 2653 + }, + { + "epoch": 0.7062267163384779, + "grad_norm": 0.2865406572818756, + "learning_rate": 1.9145911115310377e-07, + "loss": 0.2118, + "step": 2654 + }, + { + "epoch": 0.7064928153273018, + "grad_norm": 0.2894911766052246, + "learning_rate": 1.9145228049229197e-07, + "loss": 0.2193, + "step": 2655 + }, + { + "epoch": 0.7067589143161256, + "grad_norm": 0.27506107091903687, + "learning_rate": 1.914454472230753e-07, + "loss": 0.2299, + "step": 2656 + }, + { + "epoch": 0.7070250133049495, + "grad_norm": 0.2980518341064453, + "learning_rate": 1.9143861134564862e-07, + "loss": 0.2078, + "step": 2657 + }, + { + "epoch": 0.7072911122937733, + "grad_norm": 0.26713937520980835, + "learning_rate": 1.9143177286020694e-07, + "loss": 0.1957, + "step": 2658 + }, + { + "epoch": 0.7075572112825971, + "grad_norm": 0.4111461639404297, + "learning_rate": 1.9142493176694532e-07, + "loss": 0.2261, + "step": 2659 + }, + { + "epoch": 0.707823310271421, + "grad_norm": 0.3781028687953949, + "learning_rate": 1.9141808806605883e-07, + "loss": 0.2192, + "step": 2660 + }, + { + "epoch": 0.7080894092602448, + "grad_norm": 0.2590653896331787, + "learning_rate": 1.914112417577427e-07, + "loss": 0.1834, + "step": 2661 + }, + { + "epoch": 0.7083555082490687, + "grad_norm": 0.35424190759658813, + "learning_rate": 1.9140439284219224e-07, + "loss": 0.2151, + "step": 2662 + }, + { + "epoch": 0.7086216072378925, + "grad_norm": 0.2812662422657013, + "learning_rate": 1.9139754131960272e-07, + "loss": 0.2106, + "step": 2663 + }, + { + "epoch": 0.7088877062267164, + "grad_norm": 0.2908899486064911, + "learning_rate": 1.913906871901696e-07, + "loss": 0.2144, + "step": 2664 + }, + { + "epoch": 0.7091538052155402, + "grad_norm": 0.3526274263858795, + "learning_rate": 1.9138383045408833e-07, + "loss": 0.2181, + "step": 2665 + }, + { + "epoch": 0.7094199042043641, + "grad_norm": 0.37963739037513733, + "learning_rate": 1.9137697111155456e-07, + "loss": 0.196, + "step": 2666 + }, + { + "epoch": 0.7096860031931879, + "grad_norm": 0.30706483125686646, + "learning_rate": 1.913701091627639e-07, + "loss": 0.2159, + "step": 2667 + }, + { + "epoch": 0.7099521021820117, + "grad_norm": 0.38872766494750977, + "learning_rate": 1.91363244607912e-07, + "loss": 0.2206, + "step": 2668 + }, + { + "epoch": 0.7102182011708356, + "grad_norm": 0.38203707337379456, + "learning_rate": 1.9135637744719472e-07, + "loss": 0.2378, + "step": 2669 + }, + { + "epoch": 0.7104843001596594, + "grad_norm": 0.2804964780807495, + "learning_rate": 1.9134950768080796e-07, + "loss": 0.2238, + "step": 2670 + }, + { + "epoch": 0.7107503991484833, + "grad_norm": 0.30372679233551025, + "learning_rate": 1.9134263530894756e-07, + "loss": 0.2173, + "step": 2671 + }, + { + "epoch": 0.711016498137307, + "grad_norm": 0.4472172260284424, + "learning_rate": 1.9133576033180958e-07, + "loss": 0.205, + "step": 2672 + }, + { + "epoch": 0.711282597126131, + "grad_norm": 0.2802627384662628, + "learning_rate": 1.9132888274959015e-07, + "loss": 0.212, + "step": 2673 + }, + { + "epoch": 0.7115486961149547, + "grad_norm": 0.3091230094432831, + "learning_rate": 1.9132200256248536e-07, + "loss": 0.2029, + "step": 2674 + }, + { + "epoch": 0.7118147951037787, + "grad_norm": 0.2755478322505951, + "learning_rate": 1.913151197706915e-07, + "loss": 0.2134, + "step": 2675 + }, + { + "epoch": 0.7120808940926024, + "grad_norm": 0.2906661033630371, + "learning_rate": 1.9130823437440485e-07, + "loss": 0.2091, + "step": 2676 + }, + { + "epoch": 0.7123469930814262, + "grad_norm": 0.2602698504924774, + "learning_rate": 1.9130134637382183e-07, + "loss": 0.2056, + "step": 2677 + }, + { + "epoch": 0.7126130920702501, + "grad_norm": 0.39234986901283264, + "learning_rate": 1.9129445576913885e-07, + "loss": 0.2398, + "step": 2678 + }, + { + "epoch": 0.7128791910590739, + "grad_norm": 0.36098602414131165, + "learning_rate": 1.912875625605525e-07, + "loss": 0.2199, + "step": 2679 + }, + { + "epoch": 0.7131452900478978, + "grad_norm": 0.2819021940231323, + "learning_rate": 1.9128066674825933e-07, + "loss": 0.2096, + "step": 2680 + }, + { + "epoch": 0.7134113890367216, + "grad_norm": 0.341839462518692, + "learning_rate": 1.9127376833245607e-07, + "loss": 0.2103, + "step": 2681 + }, + { + "epoch": 0.7136774880255455, + "grad_norm": 0.28725045919418335, + "learning_rate": 1.9126686731333947e-07, + "loss": 0.2334, + "step": 2682 + }, + { + "epoch": 0.7139435870143693, + "grad_norm": 0.43109872937202454, + "learning_rate": 1.9125996369110636e-07, + "loss": 0.2, + "step": 2683 + }, + { + "epoch": 0.7142096860031932, + "grad_norm": 0.2597619593143463, + "learning_rate": 1.912530574659536e-07, + "loss": 0.2107, + "step": 2684 + }, + { + "epoch": 0.714475784992017, + "grad_norm": 0.2780826985836029, + "learning_rate": 1.9124614863807823e-07, + "loss": 0.2123, + "step": 2685 + }, + { + "epoch": 0.7147418839808408, + "grad_norm": 0.3088717758655548, + "learning_rate": 1.912392372076773e-07, + "loss": 0.2106, + "step": 2686 + }, + { + "epoch": 0.7150079829696647, + "grad_norm": 0.33196157217025757, + "learning_rate": 1.912323231749479e-07, + "loss": 0.21, + "step": 2687 + }, + { + "epoch": 0.7152740819584885, + "grad_norm": 0.349979430437088, + "learning_rate": 1.912254065400872e-07, + "loss": 0.2168, + "step": 2688 + }, + { + "epoch": 0.7155401809473124, + "grad_norm": 0.2611532509326935, + "learning_rate": 1.9121848730329257e-07, + "loss": 0.2097, + "step": 2689 + }, + { + "epoch": 0.7158062799361362, + "grad_norm": 0.2637260854244232, + "learning_rate": 1.9121156546476133e-07, + "loss": 0.2024, + "step": 2690 + }, + { + "epoch": 0.7160723789249601, + "grad_norm": 0.25481486320495605, + "learning_rate": 1.912046410246909e-07, + "loss": 0.1988, + "step": 2691 + }, + { + "epoch": 0.7163384779137839, + "grad_norm": 0.2902291715145111, + "learning_rate": 1.9119771398327873e-07, + "loss": 0.2121, + "step": 2692 + }, + { + "epoch": 0.7166045769026078, + "grad_norm": 0.34665271639823914, + "learning_rate": 1.9119078434072246e-07, + "loss": 0.2124, + "step": 2693 + }, + { + "epoch": 0.7168706758914316, + "grad_norm": 0.65646892786026, + "learning_rate": 1.911838520972197e-07, + "loss": 0.2049, + "step": 2694 + }, + { + "epoch": 0.7171367748802554, + "grad_norm": 0.2899418771266937, + "learning_rate": 1.911769172529682e-07, + "loss": 0.2094, + "step": 2695 + }, + { + "epoch": 0.7174028738690793, + "grad_norm": 0.2999928891658783, + "learning_rate": 1.9116997980816572e-07, + "loss": 0.2129, + "step": 2696 + }, + { + "epoch": 0.7176689728579031, + "grad_norm": 0.24431446194648743, + "learning_rate": 1.9116303976301015e-07, + "loss": 0.187, + "step": 2697 + }, + { + "epoch": 0.717935071846727, + "grad_norm": 0.26917195320129395, + "learning_rate": 1.9115609711769944e-07, + "loss": 0.215, + "step": 2698 + }, + { + "epoch": 0.7182011708355508, + "grad_norm": 0.3513243794441223, + "learning_rate": 1.911491518724316e-07, + "loss": 0.2228, + "step": 2699 + }, + { + "epoch": 0.7184672698243747, + "grad_norm": 0.2711149752140045, + "learning_rate": 1.9114220402740475e-07, + "loss": 0.2051, + "step": 2700 + }, + { + "epoch": 0.7187333688131985, + "grad_norm": 0.29025593400001526, + "learning_rate": 1.9113525358281702e-07, + "loss": 0.2094, + "step": 2701 + }, + { + "epoch": 0.7189994678020224, + "grad_norm": 0.30129262804985046, + "learning_rate": 1.9112830053886663e-07, + "loss": 0.2113, + "step": 2702 + }, + { + "epoch": 0.7192655667908462, + "grad_norm": 0.2820858657360077, + "learning_rate": 1.9112134489575196e-07, + "loss": 0.1977, + "step": 2703 + }, + { + "epoch": 0.71953166577967, + "grad_norm": 0.2798754870891571, + "learning_rate": 1.9111438665367136e-07, + "loss": 0.2256, + "step": 2704 + }, + { + "epoch": 0.7197977647684939, + "grad_norm": 0.3197033703327179, + "learning_rate": 1.911074258128233e-07, + "loss": 0.215, + "step": 2705 + }, + { + "epoch": 0.7200638637573177, + "grad_norm": 0.24329370260238647, + "learning_rate": 1.9110046237340631e-07, + "loss": 0.184, + "step": 2706 + }, + { + "epoch": 0.7203299627461416, + "grad_norm": 0.3103681802749634, + "learning_rate": 1.91093496335619e-07, + "loss": 0.2181, + "step": 2707 + }, + { + "epoch": 0.7205960617349654, + "grad_norm": 0.2898060083389282, + "learning_rate": 1.910865276996601e-07, + "loss": 0.1989, + "step": 2708 + }, + { + "epoch": 0.7208621607237893, + "grad_norm": 0.2721886932849884, + "learning_rate": 1.910795564657283e-07, + "loss": 0.2089, + "step": 2709 + }, + { + "epoch": 0.7211282597126131, + "grad_norm": 0.3847410976886749, + "learning_rate": 1.9107258263402248e-07, + "loss": 0.2328, + "step": 2710 + }, + { + "epoch": 0.721394358701437, + "grad_norm": 0.3797452449798584, + "learning_rate": 1.9106560620474152e-07, + "loss": 0.2154, + "step": 2711 + }, + { + "epoch": 0.7216604576902608, + "grad_norm": 0.2574249804019928, + "learning_rate": 1.9105862717808443e-07, + "loss": 0.1898, + "step": 2712 + }, + { + "epoch": 0.7219265566790847, + "grad_norm": 0.38694387674331665, + "learning_rate": 1.9105164555425026e-07, + "loss": 0.216, + "step": 2713 + }, + { + "epoch": 0.7221926556679085, + "grad_norm": 0.3035295903682709, + "learning_rate": 1.9104466133343814e-07, + "loss": 0.2152, + "step": 2714 + }, + { + "epoch": 0.7224587546567323, + "grad_norm": 0.5740920901298523, + "learning_rate": 1.9103767451584722e-07, + "loss": 0.2189, + "step": 2715 + }, + { + "epoch": 0.7227248536455562, + "grad_norm": 0.35366129875183105, + "learning_rate": 1.9103068510167686e-07, + "loss": 0.2138, + "step": 2716 + }, + { + "epoch": 0.72299095263438, + "grad_norm": 0.2608794569969177, + "learning_rate": 1.9102369309112637e-07, + "loss": 0.2075, + "step": 2717 + }, + { + "epoch": 0.7232570516232039, + "grad_norm": 0.2650291323661804, + "learning_rate": 1.9101669848439518e-07, + "loss": 0.2188, + "step": 2718 + }, + { + "epoch": 0.7235231506120277, + "grad_norm": 0.2727518677711487, + "learning_rate": 1.9100970128168278e-07, + "loss": 0.2031, + "step": 2719 + }, + { + "epoch": 0.7237892496008516, + "grad_norm": 0.3337539732456207, + "learning_rate": 1.910027014831888e-07, + "loss": 0.199, + "step": 2720 + }, + { + "epoch": 0.7240553485896754, + "grad_norm": 0.2551347315311432, + "learning_rate": 1.9099569908911283e-07, + "loss": 0.1999, + "step": 2721 + }, + { + "epoch": 0.7243214475784993, + "grad_norm": 0.2623569369316101, + "learning_rate": 1.909886940996546e-07, + "loss": 0.1923, + "step": 2722 + }, + { + "epoch": 0.724587546567323, + "grad_norm": 0.3689413070678711, + "learning_rate": 1.9098168651501393e-07, + "loss": 0.2287, + "step": 2723 + }, + { + "epoch": 0.7248536455561468, + "grad_norm": 0.37521010637283325, + "learning_rate": 1.9097467633539068e-07, + "loss": 0.2074, + "step": 2724 + }, + { + "epoch": 0.7251197445449707, + "grad_norm": 0.2900080978870392, + "learning_rate": 1.909676635609848e-07, + "loss": 0.1833, + "step": 2725 + }, + { + "epoch": 0.7253858435337945, + "grad_norm": 0.40979477763175964, + "learning_rate": 1.9096064819199628e-07, + "loss": 0.2381, + "step": 2726 + }, + { + "epoch": 0.7256519425226184, + "grad_norm": 0.35387134552001953, + "learning_rate": 1.9095363022862522e-07, + "loss": 0.2186, + "step": 2727 + }, + { + "epoch": 0.7259180415114422, + "grad_norm": 0.3006797134876251, + "learning_rate": 1.909466096710718e-07, + "loss": 0.2225, + "step": 2728 + }, + { + "epoch": 0.7261841405002661, + "grad_norm": 0.42731961607933044, + "learning_rate": 1.9093958651953628e-07, + "loss": 0.2147, + "step": 2729 + }, + { + "epoch": 0.7264502394890899, + "grad_norm": 0.29490309953689575, + "learning_rate": 1.9093256077421895e-07, + "loss": 0.217, + "step": 2730 + }, + { + "epoch": 0.7267163384779138, + "grad_norm": 0.3773114085197449, + "learning_rate": 1.909255324353202e-07, + "loss": 0.2459, + "step": 2731 + }, + { + "epoch": 0.7269824374667376, + "grad_norm": 0.2759507894515991, + "learning_rate": 1.909185015030405e-07, + "loss": 0.2059, + "step": 2732 + }, + { + "epoch": 0.7272485364555614, + "grad_norm": 0.39433810114860535, + "learning_rate": 1.9091146797758037e-07, + "loss": 0.2147, + "step": 2733 + }, + { + "epoch": 0.7275146354443853, + "grad_norm": 0.2740982472896576, + "learning_rate": 1.9090443185914044e-07, + "loss": 0.2168, + "step": 2734 + }, + { + "epoch": 0.7277807344332091, + "grad_norm": 0.3737160563468933, + "learning_rate": 1.9089739314792143e-07, + "loss": 0.1984, + "step": 2735 + }, + { + "epoch": 0.728046833422033, + "grad_norm": 0.2609514594078064, + "learning_rate": 1.9089035184412398e-07, + "loss": 0.1964, + "step": 2736 + }, + { + "epoch": 0.7283129324108568, + "grad_norm": 0.2907635569572449, + "learning_rate": 1.9088330794794904e-07, + "loss": 0.2169, + "step": 2737 + }, + { + "epoch": 0.7285790313996807, + "grad_norm": 0.25846439599990845, + "learning_rate": 1.9087626145959745e-07, + "loss": 0.2094, + "step": 2738 + }, + { + "epoch": 0.7288451303885045, + "grad_norm": 0.26989540457725525, + "learning_rate": 1.9086921237927024e-07, + "loss": 0.2115, + "step": 2739 + }, + { + "epoch": 0.7291112293773284, + "grad_norm": 0.2978876233100891, + "learning_rate": 1.908621607071684e-07, + "loss": 0.2226, + "step": 2740 + }, + { + "epoch": 0.7293773283661522, + "grad_norm": 0.24239253997802734, + "learning_rate": 1.9085510644349315e-07, + "loss": 0.1864, + "step": 2741 + }, + { + "epoch": 0.729643427354976, + "grad_norm": 0.26699936389923096, + "learning_rate": 1.908480495884456e-07, + "loss": 0.2083, + "step": 2742 + }, + { + "epoch": 0.7299095263437999, + "grad_norm": 0.2552500069141388, + "learning_rate": 1.9084099014222707e-07, + "loss": 0.2079, + "step": 2743 + }, + { + "epoch": 0.7301756253326237, + "grad_norm": 0.3592827022075653, + "learning_rate": 1.9083392810503892e-07, + "loss": 0.2191, + "step": 2744 + }, + { + "epoch": 0.7304417243214476, + "grad_norm": 0.27066993713378906, + "learning_rate": 1.9082686347708252e-07, + "loss": 0.2159, + "step": 2745 + }, + { + "epoch": 0.7307078233102714, + "grad_norm": 0.3552928566932678, + "learning_rate": 1.9081979625855944e-07, + "loss": 0.2207, + "step": 2746 + }, + { + "epoch": 0.7309739222990953, + "grad_norm": 0.2837275564670563, + "learning_rate": 1.9081272644967122e-07, + "loss": 0.2085, + "step": 2747 + }, + { + "epoch": 0.7312400212879191, + "grad_norm": 0.32763922214508057, + "learning_rate": 1.9080565405061947e-07, + "loss": 0.2014, + "step": 2748 + }, + { + "epoch": 0.731506120276743, + "grad_norm": 0.2830948233604431, + "learning_rate": 1.9079857906160597e-07, + "loss": 0.2177, + "step": 2749 + }, + { + "epoch": 0.7317722192655668, + "grad_norm": 0.333634614944458, + "learning_rate": 1.9079150148283247e-07, + "loss": 0.2287, + "step": 2750 + }, + { + "epoch": 0.7320383182543906, + "grad_norm": 0.3444281816482544, + "learning_rate": 1.9078442131450086e-07, + "loss": 0.2081, + "step": 2751 + }, + { + "epoch": 0.7323044172432145, + "grad_norm": 0.29372310638427734, + "learning_rate": 1.9077733855681304e-07, + "loss": 0.2308, + "step": 2752 + }, + { + "epoch": 0.7325705162320383, + "grad_norm": 0.2878282070159912, + "learning_rate": 1.907702532099711e-07, + "loss": 0.2163, + "step": 2753 + }, + { + "epoch": 0.7328366152208622, + "grad_norm": 0.4717380106449127, + "learning_rate": 1.9076316527417704e-07, + "loss": 0.2284, + "step": 2754 + }, + { + "epoch": 0.733102714209686, + "grad_norm": 0.29415422677993774, + "learning_rate": 1.907560747496331e-07, + "loss": 0.2096, + "step": 2755 + }, + { + "epoch": 0.7333688131985099, + "grad_norm": 0.29309171438217163, + "learning_rate": 1.9074898163654147e-07, + "loss": 0.2305, + "step": 2756 + }, + { + "epoch": 0.7336349121873337, + "grad_norm": 0.24885179102420807, + "learning_rate": 1.9074188593510448e-07, + "loss": 0.1921, + "step": 2757 + }, + { + "epoch": 0.7339010111761576, + "grad_norm": 0.3359297811985016, + "learning_rate": 1.9073478764552449e-07, + "loss": 0.2224, + "step": 2758 + }, + { + "epoch": 0.7341671101649814, + "grad_norm": 0.39953261613845825, + "learning_rate": 1.9072768676800396e-07, + "loss": 0.2174, + "step": 2759 + }, + { + "epoch": 0.7344332091538052, + "grad_norm": 0.39049801230430603, + "learning_rate": 1.9072058330274548e-07, + "loss": 0.2299, + "step": 2760 + }, + { + "epoch": 0.7346993081426291, + "grad_norm": 0.28324583172798157, + "learning_rate": 1.907134772499516e-07, + "loss": 0.2294, + "step": 2761 + }, + { + "epoch": 0.7349654071314529, + "grad_norm": 0.2744864225387573, + "learning_rate": 1.90706368609825e-07, + "loss": 0.2002, + "step": 2762 + }, + { + "epoch": 0.7352315061202768, + "grad_norm": 0.3290872871875763, + "learning_rate": 1.9069925738256845e-07, + "loss": 0.2174, + "step": 2763 + }, + { + "epoch": 0.7354976051091006, + "grad_norm": 0.28774043917655945, + "learning_rate": 1.9069214356838477e-07, + "loss": 0.1924, + "step": 2764 + }, + { + "epoch": 0.7357637040979245, + "grad_norm": 0.3117314875125885, + "learning_rate": 1.9068502716747685e-07, + "loss": 0.2186, + "step": 2765 + }, + { + "epoch": 0.7360298030867483, + "grad_norm": 0.28206896781921387, + "learning_rate": 1.9067790818004767e-07, + "loss": 0.199, + "step": 2766 + }, + { + "epoch": 0.7362959020755722, + "grad_norm": 0.28330641984939575, + "learning_rate": 1.906707866063003e-07, + "loss": 0.1994, + "step": 2767 + }, + { + "epoch": 0.736562001064396, + "grad_norm": 0.29918593168258667, + "learning_rate": 1.9066366244643786e-07, + "loss": 0.1907, + "step": 2768 + }, + { + "epoch": 0.7368281000532197, + "grad_norm": 0.33663612604141235, + "learning_rate": 1.906565357006635e-07, + "loss": 0.2242, + "step": 2769 + }, + { + "epoch": 0.7370941990420437, + "grad_norm": 0.2606881558895111, + "learning_rate": 1.9064940636918052e-07, + "loss": 0.2032, + "step": 2770 + }, + { + "epoch": 0.7373602980308674, + "grad_norm": 0.2864450514316559, + "learning_rate": 1.9064227445219228e-07, + "loss": 0.2096, + "step": 2771 + }, + { + "epoch": 0.7376263970196913, + "grad_norm": 0.2945152819156647, + "learning_rate": 1.9063513994990215e-07, + "loss": 0.206, + "step": 2772 + }, + { + "epoch": 0.7378924960085151, + "grad_norm": 0.30984601378440857, + "learning_rate": 1.9062800286251365e-07, + "loss": 0.2113, + "step": 2773 + }, + { + "epoch": 0.738158594997339, + "grad_norm": 0.31085851788520813, + "learning_rate": 1.9062086319023037e-07, + "loss": 0.2111, + "step": 2774 + }, + { + "epoch": 0.7384246939861628, + "grad_norm": 0.3185957968235016, + "learning_rate": 1.906137209332559e-07, + "loss": 0.2111, + "step": 2775 + }, + { + "epoch": 0.7386907929749867, + "grad_norm": 0.32772135734558105, + "learning_rate": 1.90606576091794e-07, + "loss": 0.2098, + "step": 2776 + }, + { + "epoch": 0.7389568919638105, + "grad_norm": 0.44631415605545044, + "learning_rate": 1.905994286660484e-07, + "loss": 0.2114, + "step": 2777 + }, + { + "epoch": 0.7392229909526343, + "grad_norm": 0.27125996351242065, + "learning_rate": 1.9059227865622302e-07, + "loss": 0.211, + "step": 2778 + }, + { + "epoch": 0.7394890899414582, + "grad_norm": 0.4797491729259491, + "learning_rate": 1.9058512606252174e-07, + "loss": 0.2174, + "step": 2779 + }, + { + "epoch": 0.739755188930282, + "grad_norm": 0.29677271842956543, + "learning_rate": 1.9057797088514858e-07, + "loss": 0.1999, + "step": 2780 + }, + { + "epoch": 0.7400212879191059, + "grad_norm": 0.5011184215545654, + "learning_rate": 1.9057081312430765e-07, + "loss": 0.2319, + "step": 2781 + }, + { + "epoch": 0.7402873869079297, + "grad_norm": 0.2502138316631317, + "learning_rate": 1.9056365278020305e-07, + "loss": 0.1818, + "step": 2782 + }, + { + "epoch": 0.7405534858967536, + "grad_norm": 0.28847193717956543, + "learning_rate": 1.9055648985303905e-07, + "loss": 0.2139, + "step": 2783 + }, + { + "epoch": 0.7408195848855774, + "grad_norm": 0.36587420105934143, + "learning_rate": 1.9054932434301995e-07, + "loss": 0.2343, + "step": 2784 + }, + { + "epoch": 0.7410856838744013, + "grad_norm": 0.2872953414916992, + "learning_rate": 1.9054215625035011e-07, + "loss": 0.2151, + "step": 2785 + }, + { + "epoch": 0.7413517828632251, + "grad_norm": 0.32747897505760193, + "learning_rate": 1.9053498557523397e-07, + "loss": 0.2026, + "step": 2786 + }, + { + "epoch": 0.7416178818520489, + "grad_norm": 0.3784337639808655, + "learning_rate": 1.905278123178761e-07, + "loss": 0.2294, + "step": 2787 + }, + { + "epoch": 0.7418839808408728, + "grad_norm": 0.2658289670944214, + "learning_rate": 1.9052063647848102e-07, + "loss": 0.2152, + "step": 2788 + }, + { + "epoch": 0.7421500798296966, + "grad_norm": 0.2714938223361969, + "learning_rate": 1.9051345805725347e-07, + "loss": 0.2046, + "step": 2789 + }, + { + "epoch": 0.7424161788185205, + "grad_norm": 0.3354005813598633, + "learning_rate": 1.9050627705439815e-07, + "loss": 0.2125, + "step": 2790 + }, + { + "epoch": 0.7426822778073443, + "grad_norm": 0.2587014138698578, + "learning_rate": 1.9049909347011987e-07, + "loss": 0.183, + "step": 2791 + }, + { + "epoch": 0.7429483767961682, + "grad_norm": 0.34745946526527405, + "learning_rate": 1.9049190730462358e-07, + "loss": 0.2107, + "step": 2792 + }, + { + "epoch": 0.743214475784992, + "grad_norm": 0.39808499813079834, + "learning_rate": 1.904847185581142e-07, + "loss": 0.221, + "step": 2793 + }, + { + "epoch": 0.7434805747738159, + "grad_norm": 0.3699611723423004, + "learning_rate": 1.9047752723079676e-07, + "loss": 0.2335, + "step": 2794 + }, + { + "epoch": 0.7437466737626397, + "grad_norm": 0.41679152846336365, + "learning_rate": 1.904703333228764e-07, + "loss": 0.2246, + "step": 2795 + }, + { + "epoch": 0.7440127727514635, + "grad_norm": 0.26094579696655273, + "learning_rate": 1.9046313683455826e-07, + "loss": 0.2064, + "step": 2796 + }, + { + "epoch": 0.7442788717402874, + "grad_norm": 0.3122521638870239, + "learning_rate": 1.9045593776604766e-07, + "loss": 0.1978, + "step": 2797 + }, + { + "epoch": 0.7445449707291112, + "grad_norm": 0.3371003568172455, + "learning_rate": 1.904487361175499e-07, + "loss": 0.2027, + "step": 2798 + }, + { + "epoch": 0.7448110697179351, + "grad_norm": 0.2780289351940155, + "learning_rate": 1.9044153188927035e-07, + "loss": 0.1993, + "step": 2799 + }, + { + "epoch": 0.7450771687067589, + "grad_norm": 0.2906135320663452, + "learning_rate": 1.9043432508141451e-07, + "loss": 0.2194, + "step": 2800 + }, + { + "epoch": 0.7453432676955828, + "grad_norm": 0.2792748808860779, + "learning_rate": 1.90427115694188e-07, + "loss": 0.2166, + "step": 2801 + }, + { + "epoch": 0.7456093666844066, + "grad_norm": 0.44184592366218567, + "learning_rate": 1.9041990372779638e-07, + "loss": 0.2117, + "step": 2802 + }, + { + "epoch": 0.7458754656732305, + "grad_norm": 0.3722650110721588, + "learning_rate": 1.9041268918244535e-07, + "loss": 0.208, + "step": 2803 + }, + { + "epoch": 0.7461415646620543, + "grad_norm": 0.28084009885787964, + "learning_rate": 1.9040547205834073e-07, + "loss": 0.2136, + "step": 2804 + }, + { + "epoch": 0.7464076636508781, + "grad_norm": 0.33621764183044434, + "learning_rate": 1.9039825235568832e-07, + "loss": 0.2068, + "step": 2805 + }, + { + "epoch": 0.746673762639702, + "grad_norm": 0.350666344165802, + "learning_rate": 1.9039103007469404e-07, + "loss": 0.1987, + "step": 2806 + }, + { + "epoch": 0.7469398616285258, + "grad_norm": 0.35915252566337585, + "learning_rate": 1.9038380521556388e-07, + "loss": 0.2116, + "step": 2807 + }, + { + "epoch": 0.7472059606173497, + "grad_norm": 0.2688317894935608, + "learning_rate": 1.9037657777850394e-07, + "loss": 0.2001, + "step": 2808 + }, + { + "epoch": 0.7474720596061735, + "grad_norm": 0.26490360498428345, + "learning_rate": 1.9036934776372037e-07, + "loss": 0.1949, + "step": 2809 + }, + { + "epoch": 0.7477381585949974, + "grad_norm": 0.35289251804351807, + "learning_rate": 1.9036211517141937e-07, + "loss": 0.2308, + "step": 2810 + }, + { + "epoch": 0.7480042575838212, + "grad_norm": 0.2597399055957794, + "learning_rate": 1.903548800018072e-07, + "loss": 0.2152, + "step": 2811 + }, + { + "epoch": 0.7482703565726451, + "grad_norm": 0.37625569105148315, + "learning_rate": 1.9034764225509022e-07, + "loss": 0.1969, + "step": 2812 + }, + { + "epoch": 0.7485364555614689, + "grad_norm": 0.3631000518798828, + "learning_rate": 1.9034040193147493e-07, + "loss": 0.1974, + "step": 2813 + }, + { + "epoch": 0.7488025545502927, + "grad_norm": 0.3508005440235138, + "learning_rate": 1.9033315903116778e-07, + "loss": 0.2108, + "step": 2814 + }, + { + "epoch": 0.7490686535391166, + "grad_norm": 0.2752656042575836, + "learning_rate": 1.9032591355437537e-07, + "loss": 0.2153, + "step": 2815 + }, + { + "epoch": 0.7493347525279404, + "grad_norm": 0.2824917137622833, + "learning_rate": 1.9031866550130435e-07, + "loss": 0.2181, + "step": 2816 + }, + { + "epoch": 0.7496008515167643, + "grad_norm": 0.6366555094718933, + "learning_rate": 1.9031141487216146e-07, + "loss": 0.2359, + "step": 2817 + }, + { + "epoch": 0.749866950505588, + "grad_norm": 0.2552119195461273, + "learning_rate": 1.9030416166715347e-07, + "loss": 0.1988, + "step": 2818 + }, + { + "epoch": 0.750133049494412, + "grad_norm": 0.30978959798812866, + "learning_rate": 1.902969058864873e-07, + "loss": 0.2067, + "step": 2819 + }, + { + "epoch": 0.7503991484832357, + "grad_norm": 0.35230904817581177, + "learning_rate": 1.902896475303699e-07, + "loss": 0.2163, + "step": 2820 + }, + { + "epoch": 0.7506652474720596, + "grad_norm": 0.34576117992401123, + "learning_rate": 1.9028238659900826e-07, + "loss": 0.204, + "step": 2821 + }, + { + "epoch": 0.7509313464608834, + "grad_norm": 0.26720529794692993, + "learning_rate": 1.902751230926095e-07, + "loss": 0.1992, + "step": 2822 + }, + { + "epoch": 0.7511974454497073, + "grad_norm": 0.25525903701782227, + "learning_rate": 1.9026785701138074e-07, + "loss": 0.204, + "step": 2823 + }, + { + "epoch": 0.7514635444385311, + "grad_norm": 0.34987884759902954, + "learning_rate": 1.9026058835552932e-07, + "loss": 0.2036, + "step": 2824 + }, + { + "epoch": 0.7517296434273549, + "grad_norm": 0.39139896631240845, + "learning_rate": 1.902533171252625e-07, + "loss": 0.2242, + "step": 2825 + }, + { + "epoch": 0.7519957424161788, + "grad_norm": 0.28330251574516296, + "learning_rate": 1.9024604332078762e-07, + "loss": 0.2227, + "step": 2826 + }, + { + "epoch": 0.7522618414050026, + "grad_norm": 0.38914087414741516, + "learning_rate": 1.9023876694231225e-07, + "loss": 0.198, + "step": 2827 + }, + { + "epoch": 0.7525279403938265, + "grad_norm": 0.339004784822464, + "learning_rate": 1.9023148799004384e-07, + "loss": 0.2156, + "step": 2828 + }, + { + "epoch": 0.7527940393826503, + "grad_norm": 0.3389239013195038, + "learning_rate": 1.9022420646419005e-07, + "loss": 0.2018, + "step": 2829 + }, + { + "epoch": 0.7530601383714742, + "grad_norm": 0.28114449977874756, + "learning_rate": 1.9021692236495854e-07, + "loss": 0.2152, + "step": 2830 + }, + { + "epoch": 0.753326237360298, + "grad_norm": 0.36903518438339233, + "learning_rate": 1.9020963569255707e-07, + "loss": 0.2184, + "step": 2831 + }, + { + "epoch": 0.7535923363491219, + "grad_norm": 0.47471511363983154, + "learning_rate": 1.902023464471935e-07, + "loss": 0.2148, + "step": 2832 + }, + { + "epoch": 0.7538584353379457, + "grad_norm": 0.2719983160495758, + "learning_rate": 1.9019505462907567e-07, + "loss": 0.2085, + "step": 2833 + }, + { + "epoch": 0.7541245343267695, + "grad_norm": 0.4522375166416168, + "learning_rate": 1.9018776023841162e-07, + "loss": 0.2142, + "step": 2834 + }, + { + "epoch": 0.7543906333155934, + "grad_norm": 0.37113404273986816, + "learning_rate": 1.9018046327540939e-07, + "loss": 0.2179, + "step": 2835 + }, + { + "epoch": 0.7546567323044172, + "grad_norm": 0.2769186198711395, + "learning_rate": 1.9017316374027707e-07, + "loss": 0.2174, + "step": 2836 + }, + { + "epoch": 0.7549228312932411, + "grad_norm": 0.27004581689834595, + "learning_rate": 1.9016586163322291e-07, + "loss": 0.196, + "step": 2837 + }, + { + "epoch": 0.7551889302820649, + "grad_norm": 0.27755826711654663, + "learning_rate": 1.901585569544551e-07, + "loss": 0.2174, + "step": 2838 + }, + { + "epoch": 0.7554550292708888, + "grad_norm": 0.3670079708099365, + "learning_rate": 1.9015124970418208e-07, + "loss": 0.245, + "step": 2839 + }, + { + "epoch": 0.7557211282597126, + "grad_norm": 0.33802977204322815, + "learning_rate": 1.901439398826122e-07, + "loss": 0.2262, + "step": 2840 + }, + { + "epoch": 0.7559872272485365, + "grad_norm": 0.3450685739517212, + "learning_rate": 1.9013662748995398e-07, + "loss": 0.226, + "step": 2841 + }, + { + "epoch": 0.7562533262373603, + "grad_norm": 0.27426549792289734, + "learning_rate": 1.9012931252641598e-07, + "loss": 0.2154, + "step": 2842 + }, + { + "epoch": 0.7565194252261841, + "grad_norm": 0.26638656854629517, + "learning_rate": 1.9012199499220682e-07, + "loss": 0.2021, + "step": 2843 + }, + { + "epoch": 0.756785524215008, + "grad_norm": 0.27767908573150635, + "learning_rate": 1.9011467488753524e-07, + "loss": 0.1991, + "step": 2844 + }, + { + "epoch": 0.7570516232038318, + "grad_norm": 0.43867868185043335, + "learning_rate": 1.9010735221260998e-07, + "loss": 0.2298, + "step": 2845 + }, + { + "epoch": 0.7573177221926557, + "grad_norm": 0.27041390538215637, + "learning_rate": 1.9010002696763996e-07, + "loss": 0.2179, + "step": 2846 + }, + { + "epoch": 0.7575838211814795, + "grad_norm": 0.2569611668586731, + "learning_rate": 1.9009269915283407e-07, + "loss": 0.1991, + "step": 2847 + }, + { + "epoch": 0.7578499201703034, + "grad_norm": 0.26882821321487427, + "learning_rate": 1.9008536876840133e-07, + "loss": 0.215, + "step": 2848 + }, + { + "epoch": 0.7581160191591272, + "grad_norm": 0.25540077686309814, + "learning_rate": 1.9007803581455083e-07, + "loss": 0.2001, + "step": 2849 + }, + { + "epoch": 0.7583821181479511, + "grad_norm": 0.27677664160728455, + "learning_rate": 1.9007070029149168e-07, + "loss": 0.2183, + "step": 2850 + }, + { + "epoch": 0.7586482171367749, + "grad_norm": 0.371780127286911, + "learning_rate": 1.900633621994331e-07, + "loss": 0.2135, + "step": 2851 + }, + { + "epoch": 0.7589143161255987, + "grad_norm": 0.26933470368385315, + "learning_rate": 1.9005602153858446e-07, + "loss": 0.2125, + "step": 2852 + }, + { + "epoch": 0.7591804151144226, + "grad_norm": 0.2748945355415344, + "learning_rate": 1.9004867830915504e-07, + "loss": 0.2095, + "step": 2853 + }, + { + "epoch": 0.7594465141032464, + "grad_norm": 0.43554067611694336, + "learning_rate": 1.9004133251135433e-07, + "loss": 0.2208, + "step": 2854 + }, + { + "epoch": 0.7597126130920703, + "grad_norm": 0.2858550548553467, + "learning_rate": 1.900339841453919e-07, + "loss": 0.1958, + "step": 2855 + }, + { + "epoch": 0.7599787120808941, + "grad_norm": 0.3079787492752075, + "learning_rate": 1.9002663321147724e-07, + "loss": 0.2167, + "step": 2856 + }, + { + "epoch": 0.760244811069718, + "grad_norm": 0.26373571157455444, + "learning_rate": 1.9001927970982005e-07, + "loss": 0.2257, + "step": 2857 + }, + { + "epoch": 0.7605109100585418, + "grad_norm": 0.37626370787620544, + "learning_rate": 1.900119236406301e-07, + "loss": 0.2484, + "step": 2858 + }, + { + "epoch": 0.7607770090473657, + "grad_norm": 0.2673168480396271, + "learning_rate": 1.9000456500411716e-07, + "loss": 0.199, + "step": 2859 + }, + { + "epoch": 0.7610431080361895, + "grad_norm": 0.25865262746810913, + "learning_rate": 1.8999720380049113e-07, + "loss": 0.2095, + "step": 2860 + }, + { + "epoch": 0.7613092070250133, + "grad_norm": 0.2467900514602661, + "learning_rate": 1.8998984002996196e-07, + "loss": 0.1871, + "step": 2861 + }, + { + "epoch": 0.7615753060138372, + "grad_norm": 0.2672297954559326, + "learning_rate": 1.899824736927397e-07, + "loss": 0.1954, + "step": 2862 + }, + { + "epoch": 0.761841405002661, + "grad_norm": 0.2833724319934845, + "learning_rate": 1.8997510478903442e-07, + "loss": 0.21, + "step": 2863 + }, + { + "epoch": 0.7621075039914849, + "grad_norm": 0.47530221939086914, + "learning_rate": 1.899677333190563e-07, + "loss": 0.2259, + "step": 2864 + }, + { + "epoch": 0.7623736029803087, + "grad_norm": 0.28961220383644104, + "learning_rate": 1.8996035928301564e-07, + "loss": 0.1943, + "step": 2865 + }, + { + "epoch": 0.7626397019691326, + "grad_norm": 0.33268216252326965, + "learning_rate": 1.8995298268112272e-07, + "loss": 0.2205, + "step": 2866 + }, + { + "epoch": 0.7629058009579563, + "grad_norm": 0.26604875922203064, + "learning_rate": 1.8994560351358793e-07, + "loss": 0.2083, + "step": 2867 + }, + { + "epoch": 0.7631718999467803, + "grad_norm": 0.32949307560920715, + "learning_rate": 1.8993822178062175e-07, + "loss": 0.2219, + "step": 2868 + }, + { + "epoch": 0.763437998935604, + "grad_norm": 0.2745201289653778, + "learning_rate": 1.899308374824347e-07, + "loss": 0.2217, + "step": 2869 + }, + { + "epoch": 0.7637040979244278, + "grad_norm": 0.285841166973114, + "learning_rate": 1.8992345061923746e-07, + "loss": 0.1991, + "step": 2870 + }, + { + "epoch": 0.7639701969132517, + "grad_norm": 0.974823534488678, + "learning_rate": 1.8991606119124064e-07, + "loss": 0.1976, + "step": 2871 + }, + { + "epoch": 0.7642362959020755, + "grad_norm": 0.2946847975254059, + "learning_rate": 1.8990866919865503e-07, + "loss": 0.2015, + "step": 2872 + }, + { + "epoch": 0.7645023948908994, + "grad_norm": 0.28546345233917236, + "learning_rate": 1.899012746416915e-07, + "loss": 0.2198, + "step": 2873 + }, + { + "epoch": 0.7647684938797232, + "grad_norm": 0.27064159512519836, + "learning_rate": 1.8989387752056093e-07, + "loss": 0.212, + "step": 2874 + }, + { + "epoch": 0.7650345928685471, + "grad_norm": 0.31768396496772766, + "learning_rate": 1.898864778354743e-07, + "loss": 0.193, + "step": 2875 + }, + { + "epoch": 0.7653006918573709, + "grad_norm": 0.2944844365119934, + "learning_rate": 1.8987907558664261e-07, + "loss": 0.2054, + "step": 2876 + }, + { + "epoch": 0.7655667908461948, + "grad_norm": 0.3548020124435425, + "learning_rate": 1.898716707742771e-07, + "loss": 0.2298, + "step": 2877 + }, + { + "epoch": 0.7658328898350186, + "grad_norm": 0.260484904050827, + "learning_rate": 1.8986426339858892e-07, + "loss": 0.2039, + "step": 2878 + }, + { + "epoch": 0.7660989888238424, + "grad_norm": 0.5183793306350708, + "learning_rate": 1.898568534597893e-07, + "loss": 0.2455, + "step": 2879 + }, + { + "epoch": 0.7663650878126663, + "grad_norm": 0.37091654539108276, + "learning_rate": 1.898494409580896e-07, + "loss": 0.2035, + "step": 2880 + }, + { + "epoch": 0.7666311868014901, + "grad_norm": 0.27439579367637634, + "learning_rate": 1.8984202589370134e-07, + "loss": 0.2148, + "step": 2881 + }, + { + "epoch": 0.766897285790314, + "grad_norm": 0.2767007350921631, + "learning_rate": 1.8983460826683585e-07, + "loss": 0.2123, + "step": 2882 + }, + { + "epoch": 0.7671633847791378, + "grad_norm": 0.29212066531181335, + "learning_rate": 1.8982718807770483e-07, + "loss": 0.1963, + "step": 2883 + }, + { + "epoch": 0.7674294837679617, + "grad_norm": 0.33519938588142395, + "learning_rate": 1.8981976532651986e-07, + "loss": 0.2022, + "step": 2884 + }, + { + "epoch": 0.7676955827567855, + "grad_norm": 0.40061870217323303, + "learning_rate": 1.8981234001349265e-07, + "loss": 0.1964, + "step": 2885 + }, + { + "epoch": 0.7679616817456094, + "grad_norm": 0.28437158465385437, + "learning_rate": 1.89804912138835e-07, + "loss": 0.1951, + "step": 2886 + }, + { + "epoch": 0.7682277807344332, + "grad_norm": 0.3382467031478882, + "learning_rate": 1.8979748170275876e-07, + "loss": 0.2103, + "step": 2887 + }, + { + "epoch": 0.768493879723257, + "grad_norm": 0.2597443461418152, + "learning_rate": 1.8979004870547587e-07, + "loss": 0.1864, + "step": 2888 + }, + { + "epoch": 0.7687599787120809, + "grad_norm": 0.26977699995040894, + "learning_rate": 1.897826131471983e-07, + "loss": 0.2112, + "step": 2889 + }, + { + "epoch": 0.7690260777009047, + "grad_norm": 0.28102463483810425, + "learning_rate": 1.897751750281382e-07, + "loss": 0.1935, + "step": 2890 + }, + { + "epoch": 0.7692921766897286, + "grad_norm": 0.2912382185459137, + "learning_rate": 1.8976773434850765e-07, + "loss": 0.2045, + "step": 2891 + }, + { + "epoch": 0.7695582756785524, + "grad_norm": 0.3060520887374878, + "learning_rate": 1.8976029110851895e-07, + "loss": 0.2011, + "step": 2892 + }, + { + "epoch": 0.7698243746673763, + "grad_norm": 0.30730289220809937, + "learning_rate": 1.897528453083843e-07, + "loss": 0.2266, + "step": 2893 + }, + { + "epoch": 0.7700904736562001, + "grad_norm": 0.27147814631462097, + "learning_rate": 1.897453969483161e-07, + "loss": 0.2116, + "step": 2894 + }, + { + "epoch": 0.770356572645024, + "grad_norm": 0.2407536506652832, + "learning_rate": 1.8973794602852684e-07, + "loss": 0.1952, + "step": 2895 + }, + { + "epoch": 0.7706226716338478, + "grad_norm": 0.38808587193489075, + "learning_rate": 1.89730492549229e-07, + "loss": 0.2062, + "step": 2896 + }, + { + "epoch": 0.7708887706226716, + "grad_norm": 0.27012717723846436, + "learning_rate": 1.8972303651063514e-07, + "loss": 0.2089, + "step": 2897 + }, + { + "epoch": 0.7711548696114955, + "grad_norm": 0.28741228580474854, + "learning_rate": 1.8971557791295798e-07, + "loss": 0.2335, + "step": 2898 + }, + { + "epoch": 0.7714209686003193, + "grad_norm": 0.315560519695282, + "learning_rate": 1.8970811675641023e-07, + "loss": 0.2129, + "step": 2899 + }, + { + "epoch": 0.7716870675891432, + "grad_norm": 0.3796476721763611, + "learning_rate": 1.8970065304120468e-07, + "loss": 0.2288, + "step": 2900 + }, + { + "epoch": 0.771953166577967, + "grad_norm": 0.27286070585250854, + "learning_rate": 1.8969318676755425e-07, + "loss": 0.1968, + "step": 2901 + }, + { + "epoch": 0.7722192655667909, + "grad_norm": 0.2639641761779785, + "learning_rate": 1.8968571793567185e-07, + "loss": 0.2139, + "step": 2902 + }, + { + "epoch": 0.7724853645556147, + "grad_norm": 0.4692784249782562, + "learning_rate": 1.8967824654577052e-07, + "loss": 0.2087, + "step": 2903 + }, + { + "epoch": 0.7727514635444386, + "grad_norm": 0.27590030431747437, + "learning_rate": 1.8967077259806333e-07, + "loss": 0.204, + "step": 2904 + }, + { + "epoch": 0.7730175625332624, + "grad_norm": 0.3583259880542755, + "learning_rate": 1.8966329609276354e-07, + "loss": 0.234, + "step": 2905 + }, + { + "epoch": 0.7732836615220862, + "grad_norm": 0.35081031918525696, + "learning_rate": 1.8965581703008432e-07, + "loss": 0.2047, + "step": 2906 + }, + { + "epoch": 0.7735497605109101, + "grad_norm": 0.26872673630714417, + "learning_rate": 1.8964833541023902e-07, + "loss": 0.2022, + "step": 2907 + }, + { + "epoch": 0.7738158594997339, + "grad_norm": 0.34983018040657043, + "learning_rate": 1.8964085123344102e-07, + "loss": 0.2253, + "step": 2908 + }, + { + "epoch": 0.7740819584885578, + "grad_norm": 0.2837845981121063, + "learning_rate": 1.896333644999038e-07, + "loss": 0.2072, + "step": 2909 + }, + { + "epoch": 0.7743480574773816, + "grad_norm": 0.33510622382164, + "learning_rate": 1.8962587520984082e-07, + "loss": 0.2224, + "step": 2910 + }, + { + "epoch": 0.7746141564662055, + "grad_norm": 0.359754741191864, + "learning_rate": 1.8961838336346582e-07, + "loss": 0.2335, + "step": 2911 + }, + { + "epoch": 0.7748802554550293, + "grad_norm": 0.3405194580554962, + "learning_rate": 1.8961088896099237e-07, + "loss": 0.2374, + "step": 2912 + }, + { + "epoch": 0.7751463544438532, + "grad_norm": 0.39428290724754333, + "learning_rate": 1.896033920026343e-07, + "loss": 0.2082, + "step": 2913 + }, + { + "epoch": 0.775412453432677, + "grad_norm": 0.30894261598587036, + "learning_rate": 1.8959589248860537e-07, + "loss": 0.2048, + "step": 2914 + }, + { + "epoch": 0.7756785524215007, + "grad_norm": 0.3360808491706848, + "learning_rate": 1.8958839041911953e-07, + "loss": 0.218, + "step": 2915 + }, + { + "epoch": 0.7759446514103246, + "grad_norm": 0.36296361684799194, + "learning_rate": 1.8958088579439076e-07, + "loss": 0.2328, + "step": 2916 + }, + { + "epoch": 0.7762107503991484, + "grad_norm": 0.2963111698627472, + "learning_rate": 1.8957337861463307e-07, + "loss": 0.2271, + "step": 2917 + }, + { + "epoch": 0.7764768493879723, + "grad_norm": 0.2552281320095062, + "learning_rate": 1.8956586888006063e-07, + "loss": 0.2051, + "step": 2918 + }, + { + "epoch": 0.7767429483767961, + "grad_norm": 0.2564674913883209, + "learning_rate": 1.8955835659088757e-07, + "loss": 0.2054, + "step": 2919 + }, + { + "epoch": 0.77700904736562, + "grad_norm": 0.3492770791053772, + "learning_rate": 1.895508417473282e-07, + "loss": 0.2051, + "step": 2920 + }, + { + "epoch": 0.7772751463544438, + "grad_norm": 0.35300901532173157, + "learning_rate": 1.8954332434959684e-07, + "loss": 0.2115, + "step": 2921 + }, + { + "epoch": 0.7775412453432677, + "grad_norm": 0.39769843220710754, + "learning_rate": 1.8953580439790793e-07, + "loss": 0.2197, + "step": 2922 + }, + { + "epoch": 0.7778073443320915, + "grad_norm": 0.26665061712265015, + "learning_rate": 1.895282818924759e-07, + "loss": 0.1908, + "step": 2923 + }, + { + "epoch": 0.7780734433209153, + "grad_norm": 0.2670152485370636, + "learning_rate": 1.895207568335154e-07, + "loss": 0.2005, + "step": 2924 + }, + { + "epoch": 0.7783395423097392, + "grad_norm": 0.4490402936935425, + "learning_rate": 1.8951322922124095e-07, + "loss": 0.2286, + "step": 2925 + }, + { + "epoch": 0.778605641298563, + "grad_norm": 0.2801932394504547, + "learning_rate": 1.895056990558673e-07, + "loss": 0.2056, + "step": 2926 + }, + { + "epoch": 0.7788717402873869, + "grad_norm": 0.24592503905296326, + "learning_rate": 1.8949816633760924e-07, + "loss": 0.1895, + "step": 2927 + }, + { + "epoch": 0.7791378392762107, + "grad_norm": 0.2706276774406433, + "learning_rate": 1.894906310666816e-07, + "loss": 0.1994, + "step": 2928 + }, + { + "epoch": 0.7794039382650346, + "grad_norm": 0.42023560404777527, + "learning_rate": 1.8948309324329933e-07, + "loss": 0.2253, + "step": 2929 + }, + { + "epoch": 0.7796700372538584, + "grad_norm": 0.2548057734966278, + "learning_rate": 1.894755528676774e-07, + "loss": 0.2013, + "step": 2930 + }, + { + "epoch": 0.7799361362426823, + "grad_norm": 0.25695961713790894, + "learning_rate": 1.8946800994003086e-07, + "loss": 0.2013, + "step": 2931 + }, + { + "epoch": 0.7802022352315061, + "grad_norm": 0.44587036967277527, + "learning_rate": 1.894604644605749e-07, + "loss": 0.2103, + "step": 2932 + }, + { + "epoch": 0.78046833422033, + "grad_norm": 0.27876242995262146, + "learning_rate": 1.8945291642952465e-07, + "loss": 0.2241, + "step": 2933 + }, + { + "epoch": 0.7807344332091538, + "grad_norm": 0.27549034357070923, + "learning_rate": 1.8944536584709548e-07, + "loss": 0.2173, + "step": 2934 + }, + { + "epoch": 0.7810005321979776, + "grad_norm": 0.40671804547309875, + "learning_rate": 1.8943781271350271e-07, + "loss": 0.2114, + "step": 2935 + }, + { + "epoch": 0.7812666311868015, + "grad_norm": 0.40750569105148315, + "learning_rate": 1.8943025702896178e-07, + "loss": 0.2096, + "step": 2936 + }, + { + "epoch": 0.7815327301756253, + "grad_norm": 0.26635459065437317, + "learning_rate": 1.894226987936882e-07, + "loss": 0.193, + "step": 2937 + }, + { + "epoch": 0.7817988291644492, + "grad_norm": 0.25499096512794495, + "learning_rate": 1.8941513800789753e-07, + "loss": 0.1918, + "step": 2938 + }, + { + "epoch": 0.782064928153273, + "grad_norm": 0.28297245502471924, + "learning_rate": 1.894075746718054e-07, + "loss": 0.2046, + "step": 2939 + }, + { + "epoch": 0.7823310271420969, + "grad_norm": 0.28111931681632996, + "learning_rate": 1.8940000878562758e-07, + "loss": 0.2031, + "step": 2940 + }, + { + "epoch": 0.7825971261309207, + "grad_norm": 0.2771131098270416, + "learning_rate": 1.8939244034957983e-07, + "loss": 0.2183, + "step": 2941 + }, + { + "epoch": 0.7828632251197446, + "grad_norm": 0.3272600769996643, + "learning_rate": 1.89384869363878e-07, + "loss": 0.2063, + "step": 2942 + }, + { + "epoch": 0.7831293241085684, + "grad_norm": 0.4030549228191376, + "learning_rate": 1.893772958287381e-07, + "loss": 0.2235, + "step": 2943 + }, + { + "epoch": 0.7833954230973922, + "grad_norm": 0.3035977780818939, + "learning_rate": 1.8936971974437607e-07, + "loss": 0.2026, + "step": 2944 + }, + { + "epoch": 0.7836615220862161, + "grad_norm": 0.46626755595207214, + "learning_rate": 1.8936214111100804e-07, + "loss": 0.1868, + "step": 2945 + }, + { + "epoch": 0.7839276210750399, + "grad_norm": 0.2870906591415405, + "learning_rate": 1.8935455992885014e-07, + "loss": 0.2041, + "step": 2946 + }, + { + "epoch": 0.7841937200638638, + "grad_norm": 0.3490598797798157, + "learning_rate": 1.8934697619811862e-07, + "loss": 0.2185, + "step": 2947 + }, + { + "epoch": 0.7844598190526876, + "grad_norm": 0.3283099830150604, + "learning_rate": 1.8933938991902978e-07, + "loss": 0.18, + "step": 2948 + }, + { + "epoch": 0.7847259180415115, + "grad_norm": 0.3391801416873932, + "learning_rate": 1.893318010918e-07, + "loss": 0.2216, + "step": 2949 + }, + { + "epoch": 0.7849920170303353, + "grad_norm": 0.29859256744384766, + "learning_rate": 1.8932420971664572e-07, + "loss": 0.2235, + "step": 2950 + }, + { + "epoch": 0.7852581160191592, + "grad_norm": 0.2669377028942108, + "learning_rate": 1.8931661579378346e-07, + "loss": 0.1984, + "step": 2951 + }, + { + "epoch": 0.785524215007983, + "grad_norm": 0.27181482315063477, + "learning_rate": 1.8930901932342982e-07, + "loss": 0.2056, + "step": 2952 + }, + { + "epoch": 0.7857903139968068, + "grad_norm": 0.2755740284919739, + "learning_rate": 1.8930142030580144e-07, + "loss": 0.196, + "step": 2953 + }, + { + "epoch": 0.7860564129856307, + "grad_norm": 0.2654426693916321, + "learning_rate": 1.892938187411151e-07, + "loss": 0.2083, + "step": 2954 + }, + { + "epoch": 0.7863225119744545, + "grad_norm": 0.3462038040161133, + "learning_rate": 1.8928621462958763e-07, + "loss": 0.2178, + "step": 2955 + }, + { + "epoch": 0.7865886109632784, + "grad_norm": 0.24706128239631653, + "learning_rate": 1.8927860797143584e-07, + "loss": 0.2076, + "step": 2956 + }, + { + "epoch": 0.7868547099521022, + "grad_norm": 0.32366228103637695, + "learning_rate": 1.8927099876687676e-07, + "loss": 0.2024, + "step": 2957 + }, + { + "epoch": 0.7871208089409261, + "grad_norm": 0.25858035683631897, + "learning_rate": 1.8926338701612737e-07, + "loss": 0.1942, + "step": 2958 + }, + { + "epoch": 0.7873869079297499, + "grad_norm": 0.2857922911643982, + "learning_rate": 1.8925577271940475e-07, + "loss": 0.2271, + "step": 2959 + }, + { + "epoch": 0.7876530069185738, + "grad_norm": 0.2415630966424942, + "learning_rate": 1.8924815587692618e-07, + "loss": 0.2129, + "step": 2960 + }, + { + "epoch": 0.7879191059073976, + "grad_norm": 0.3588523864746094, + "learning_rate": 1.892405364889088e-07, + "loss": 0.2136, + "step": 2961 + }, + { + "epoch": 0.7881852048962213, + "grad_norm": 0.25316599011421204, + "learning_rate": 1.8923291455557e-07, + "loss": 0.19, + "step": 2962 + }, + { + "epoch": 0.7884513038850453, + "grad_norm": 0.5682730674743652, + "learning_rate": 1.8922529007712714e-07, + "loss": 0.2519, + "step": 2963 + }, + { + "epoch": 0.788717402873869, + "grad_norm": 0.2744855582714081, + "learning_rate": 1.892176630537977e-07, + "loss": 0.201, + "step": 2964 + }, + { + "epoch": 0.788983501862693, + "grad_norm": 0.2539944350719452, + "learning_rate": 1.8921003348579918e-07, + "loss": 0.1881, + "step": 2965 + }, + { + "epoch": 0.7892496008515167, + "grad_norm": 0.2317337840795517, + "learning_rate": 1.8920240137334923e-07, + "loss": 0.1897, + "step": 2966 + }, + { + "epoch": 0.7895156998403406, + "grad_norm": 0.47567692399024963, + "learning_rate": 1.8919476671666557e-07, + "loss": 0.2132, + "step": 2967 + }, + { + "epoch": 0.7897817988291644, + "grad_norm": 0.25847411155700684, + "learning_rate": 1.8918712951596586e-07, + "loss": 0.2034, + "step": 2968 + }, + { + "epoch": 0.7900478978179883, + "grad_norm": 0.24886348843574524, + "learning_rate": 1.8917948977146798e-07, + "loss": 0.1886, + "step": 2969 + }, + { + "epoch": 0.7903139968068121, + "grad_norm": 0.2602958679199219, + "learning_rate": 1.8917184748338987e-07, + "loss": 0.1988, + "step": 2970 + }, + { + "epoch": 0.7905800957956359, + "grad_norm": 0.23250436782836914, + "learning_rate": 1.8916420265194943e-07, + "loss": 0.2015, + "step": 2971 + }, + { + "epoch": 0.7908461947844598, + "grad_norm": 0.2882273197174072, + "learning_rate": 1.8915655527736475e-07, + "loss": 0.2241, + "step": 2972 + }, + { + "epoch": 0.7911122937732836, + "grad_norm": 0.28384244441986084, + "learning_rate": 1.8914890535985392e-07, + "loss": 0.2086, + "step": 2973 + }, + { + "epoch": 0.7913783927621075, + "grad_norm": 0.26199376583099365, + "learning_rate": 1.8914125289963518e-07, + "loss": 0.2005, + "step": 2974 + }, + { + "epoch": 0.7916444917509313, + "grad_norm": 0.3763008117675781, + "learning_rate": 1.8913359789692676e-07, + "loss": 0.2271, + "step": 2975 + }, + { + "epoch": 0.7919105907397552, + "grad_norm": 0.2634119987487793, + "learning_rate": 1.89125940351947e-07, + "loss": 0.2067, + "step": 2976 + }, + { + "epoch": 0.792176689728579, + "grad_norm": 0.267426460981369, + "learning_rate": 1.8911828026491432e-07, + "loss": 0.1954, + "step": 2977 + }, + { + "epoch": 0.7924427887174029, + "grad_norm": 0.39722415804862976, + "learning_rate": 1.8911061763604717e-07, + "loss": 0.1977, + "step": 2978 + }, + { + "epoch": 0.7927088877062267, + "grad_norm": 0.2776550352573395, + "learning_rate": 1.8910295246556412e-07, + "loss": 0.2021, + "step": 2979 + }, + { + "epoch": 0.7929749866950505, + "grad_norm": 0.3016453683376312, + "learning_rate": 1.890952847536838e-07, + "loss": 0.2114, + "step": 2980 + }, + { + "epoch": 0.7932410856838744, + "grad_norm": 0.29201140999794006, + "learning_rate": 1.890876145006249e-07, + "loss": 0.199, + "step": 2981 + }, + { + "epoch": 0.7935071846726982, + "grad_norm": 0.30203038454055786, + "learning_rate": 1.890799417066062e-07, + "loss": 0.2299, + "step": 2982 + }, + { + "epoch": 0.7937732836615221, + "grad_norm": 0.2851189374923706, + "learning_rate": 1.8907226637184656e-07, + "loss": 0.2082, + "step": 2983 + }, + { + "epoch": 0.7940393826503459, + "grad_norm": 0.3452824056148529, + "learning_rate": 1.8906458849656488e-07, + "loss": 0.2366, + "step": 2984 + }, + { + "epoch": 0.7943054816391698, + "grad_norm": 0.3888237178325653, + "learning_rate": 1.8905690808098017e-07, + "loss": 0.2238, + "step": 2985 + }, + { + "epoch": 0.7945715806279936, + "grad_norm": 0.2787272036075592, + "learning_rate": 1.8904922512531144e-07, + "loss": 0.2055, + "step": 2986 + }, + { + "epoch": 0.7948376796168175, + "grad_norm": 0.2642127275466919, + "learning_rate": 1.8904153962977784e-07, + "loss": 0.2115, + "step": 2987 + }, + { + "epoch": 0.7951037786056413, + "grad_norm": 0.3692740201950073, + "learning_rate": 1.890338515945986e-07, + "loss": 0.2255, + "step": 2988 + }, + { + "epoch": 0.7953698775944651, + "grad_norm": 0.2500305771827698, + "learning_rate": 1.89026161019993e-07, + "loss": 0.2013, + "step": 2989 + }, + { + "epoch": 0.795635976583289, + "grad_norm": 0.2762307822704315, + "learning_rate": 1.8901846790618036e-07, + "loss": 0.2185, + "step": 2990 + }, + { + "epoch": 0.7959020755721128, + "grad_norm": 0.2791963517665863, + "learning_rate": 1.8901077225338016e-07, + "loss": 0.2147, + "step": 2991 + }, + { + "epoch": 0.7961681745609367, + "grad_norm": 0.2685723900794983, + "learning_rate": 1.8900307406181182e-07, + "loss": 0.2009, + "step": 2992 + }, + { + "epoch": 0.7964342735497605, + "grad_norm": 0.2978755533695221, + "learning_rate": 1.8899537333169496e-07, + "loss": 0.2131, + "step": 2993 + }, + { + "epoch": 0.7967003725385844, + "grad_norm": 0.26573678851127625, + "learning_rate": 1.8898767006324919e-07, + "loss": 0.1987, + "step": 2994 + }, + { + "epoch": 0.7969664715274082, + "grad_norm": 0.26336121559143066, + "learning_rate": 1.8897996425669425e-07, + "loss": 0.1962, + "step": 2995 + }, + { + "epoch": 0.7972325705162321, + "grad_norm": 0.2764042317867279, + "learning_rate": 1.889722559122499e-07, + "loss": 0.2116, + "step": 2996 + }, + { + "epoch": 0.7974986695050559, + "grad_norm": 0.2846482992172241, + "learning_rate": 1.8896454503013604e-07, + "loss": 0.2135, + "step": 2997 + }, + { + "epoch": 0.7977647684938797, + "grad_norm": 0.35854172706604004, + "learning_rate": 1.8895683161057251e-07, + "loss": 0.2162, + "step": 2998 + }, + { + "epoch": 0.7980308674827036, + "grad_norm": 0.247584268450737, + "learning_rate": 1.8894911565377942e-07, + "loss": 0.2177, + "step": 2999 + }, + { + "epoch": 0.7982969664715274, + "grad_norm": 0.26266464591026306, + "learning_rate": 1.8894139715997682e-07, + "loss": 0.203, + "step": 3000 + }, + { + "epoch": 0.7985630654603513, + "grad_norm": 0.2744829058647156, + "learning_rate": 1.889336761293848e-07, + "loss": 0.1924, + "step": 3001 + }, + { + "epoch": 0.7988291644491751, + "grad_norm": 0.3039455711841583, + "learning_rate": 1.8892595256222364e-07, + "loss": 0.2026, + "step": 3002 + }, + { + "epoch": 0.799095263437999, + "grad_norm": 0.4161682426929474, + "learning_rate": 1.8891822645871358e-07, + "loss": 0.2293, + "step": 3003 + }, + { + "epoch": 0.7993613624268228, + "grad_norm": 0.27965328097343445, + "learning_rate": 1.8891049781907503e-07, + "loss": 0.1951, + "step": 3004 + }, + { + "epoch": 0.7996274614156467, + "grad_norm": 0.7995479106903076, + "learning_rate": 1.889027666435284e-07, + "loss": 0.2042, + "step": 3005 + }, + { + "epoch": 0.7998935604044705, + "grad_norm": 0.3736799657344818, + "learning_rate": 1.888950329322942e-07, + "loss": 0.2252, + "step": 3006 + }, + { + "epoch": 0.8001596593932943, + "grad_norm": 0.25736090540885925, + "learning_rate": 1.8888729668559302e-07, + "loss": 0.2204, + "step": 3007 + }, + { + "epoch": 0.8004257583821182, + "grad_norm": 0.2501136362552643, + "learning_rate": 1.8887955790364552e-07, + "loss": 0.1979, + "step": 3008 + }, + { + "epoch": 0.800691857370942, + "grad_norm": 0.2631394565105438, + "learning_rate": 1.8887181658667243e-07, + "loss": 0.2021, + "step": 3009 + }, + { + "epoch": 0.8009579563597659, + "grad_norm": 0.46171635389328003, + "learning_rate": 1.8886407273489448e-07, + "loss": 0.2173, + "step": 3010 + }, + { + "epoch": 0.8012240553485896, + "grad_norm": 0.2804260849952698, + "learning_rate": 1.8885632634853264e-07, + "loss": 0.2222, + "step": 3011 + }, + { + "epoch": 0.8014901543374136, + "grad_norm": 0.2732087969779968, + "learning_rate": 1.888485774278078e-07, + "loss": 0.2055, + "step": 3012 + }, + { + "epoch": 0.8017562533262373, + "grad_norm": 0.2608366012573242, + "learning_rate": 1.8884082597294097e-07, + "loss": 0.1978, + "step": 3013 + }, + { + "epoch": 0.8020223523150612, + "grad_norm": 0.3826599717140198, + "learning_rate": 1.888330719841533e-07, + "loss": 0.2112, + "step": 3014 + }, + { + "epoch": 0.802288451303885, + "grad_norm": 0.4314957559108734, + "learning_rate": 1.8882531546166583e-07, + "loss": 0.2047, + "step": 3015 + }, + { + "epoch": 0.8025545502927088, + "grad_norm": 0.28757306933403015, + "learning_rate": 1.888175564056999e-07, + "loss": 0.2164, + "step": 3016 + }, + { + "epoch": 0.8028206492815327, + "grad_norm": 0.24695412814617157, + "learning_rate": 1.8880979481647676e-07, + "loss": 0.1979, + "step": 3017 + }, + { + "epoch": 0.8030867482703565, + "grad_norm": 0.28333529829978943, + "learning_rate": 1.888020306942178e-07, + "loss": 0.1993, + "step": 3018 + }, + { + "epoch": 0.8033528472591804, + "grad_norm": 0.27017098665237427, + "learning_rate": 1.8879426403914448e-07, + "loss": 0.2172, + "step": 3019 + }, + { + "epoch": 0.8036189462480042, + "grad_norm": 0.26450255513191223, + "learning_rate": 1.887864948514783e-07, + "loss": 0.2057, + "step": 3020 + }, + { + "epoch": 0.8038850452368281, + "grad_norm": 0.28802788257598877, + "learning_rate": 1.8877872313144088e-07, + "loss": 0.1952, + "step": 3021 + }, + { + "epoch": 0.8041511442256519, + "grad_norm": 0.36818477511405945, + "learning_rate": 1.8877094887925388e-07, + "loss": 0.2384, + "step": 3022 + }, + { + "epoch": 0.8044172432144758, + "grad_norm": 0.351765513420105, + "learning_rate": 1.88763172095139e-07, + "loss": 0.2423, + "step": 3023 + }, + { + "epoch": 0.8046833422032996, + "grad_norm": 0.2572828233242035, + "learning_rate": 1.8875539277931808e-07, + "loss": 0.2098, + "step": 3024 + }, + { + "epoch": 0.8049494411921234, + "grad_norm": 0.2705504596233368, + "learning_rate": 1.8874761093201302e-07, + "loss": 0.21, + "step": 3025 + }, + { + "epoch": 0.8052155401809473, + "grad_norm": 0.33286041021347046, + "learning_rate": 1.8873982655344573e-07, + "loss": 0.2212, + "step": 3026 + }, + { + "epoch": 0.8054816391697711, + "grad_norm": 0.3411310315132141, + "learning_rate": 1.8873203964383827e-07, + "loss": 0.2152, + "step": 3027 + }, + { + "epoch": 0.805747738158595, + "grad_norm": 0.3271733522415161, + "learning_rate": 1.887242502034127e-07, + "loss": 0.2056, + "step": 3028 + }, + { + "epoch": 0.8060138371474188, + "grad_norm": 0.30129343271255493, + "learning_rate": 1.8871645823239127e-07, + "loss": 0.1972, + "step": 3029 + }, + { + "epoch": 0.8062799361362427, + "grad_norm": 0.26826539635658264, + "learning_rate": 1.8870866373099615e-07, + "loss": 0.1912, + "step": 3030 + }, + { + "epoch": 0.8065460351250665, + "grad_norm": 0.36527085304260254, + "learning_rate": 1.8870086669944967e-07, + "loss": 0.1954, + "step": 3031 + }, + { + "epoch": 0.8068121341138904, + "grad_norm": 0.2970724403858185, + "learning_rate": 1.8869306713797422e-07, + "loss": 0.2144, + "step": 3032 + }, + { + "epoch": 0.8070782331027142, + "grad_norm": 0.36025378108024597, + "learning_rate": 1.8868526504679225e-07, + "loss": 0.1999, + "step": 3033 + }, + { + "epoch": 0.807344332091538, + "grad_norm": 0.2406112253665924, + "learning_rate": 1.886774604261263e-07, + "loss": 0.1828, + "step": 3034 + }, + { + "epoch": 0.8076104310803619, + "grad_norm": 0.2893064022064209, + "learning_rate": 1.8866965327619901e-07, + "loss": 0.1903, + "step": 3035 + }, + { + "epoch": 0.8078765300691857, + "grad_norm": 0.30838140845298767, + "learning_rate": 1.8866184359723302e-07, + "loss": 0.1789, + "step": 3036 + }, + { + "epoch": 0.8081426290580096, + "grad_norm": 0.28435418009757996, + "learning_rate": 1.8865403138945105e-07, + "loss": 0.1977, + "step": 3037 + }, + { + "epoch": 0.8084087280468334, + "grad_norm": 0.4057683050632477, + "learning_rate": 1.8864621665307598e-07, + "loss": 0.237, + "step": 3038 + }, + { + "epoch": 0.8086748270356573, + "grad_norm": 0.32162803411483765, + "learning_rate": 1.8863839938833066e-07, + "loss": 0.2239, + "step": 3039 + }, + { + "epoch": 0.8089409260244811, + "grad_norm": 0.2534119188785553, + "learning_rate": 1.8863057959543806e-07, + "loss": 0.1994, + "step": 3040 + }, + { + "epoch": 0.809207025013305, + "grad_norm": 0.27097028493881226, + "learning_rate": 1.8862275727462124e-07, + "loss": 0.2169, + "step": 3041 + }, + { + "epoch": 0.8094731240021288, + "grad_norm": 0.31385427713394165, + "learning_rate": 1.886149324261033e-07, + "loss": 0.2045, + "step": 3042 + }, + { + "epoch": 0.8097392229909527, + "grad_norm": 0.2485044300556183, + "learning_rate": 1.8860710505010742e-07, + "loss": 0.1925, + "step": 3043 + }, + { + "epoch": 0.8100053219797765, + "grad_norm": 0.30410364270210266, + "learning_rate": 1.885992751468568e-07, + "loss": 0.2094, + "step": 3044 + }, + { + "epoch": 0.8102714209686003, + "grad_norm": 0.3456786274909973, + "learning_rate": 1.8859144271657485e-07, + "loss": 0.2317, + "step": 3045 + }, + { + "epoch": 0.8105375199574242, + "grad_norm": 0.34550487995147705, + "learning_rate": 1.8858360775948489e-07, + "loss": 0.2175, + "step": 3046 + }, + { + "epoch": 0.810803618946248, + "grad_norm": 0.2659991681575775, + "learning_rate": 1.8857577027581046e-07, + "loss": 0.2087, + "step": 3047 + }, + { + "epoch": 0.8110697179350719, + "grad_norm": 0.27783477306365967, + "learning_rate": 1.8856793026577507e-07, + "loss": 0.2083, + "step": 3048 + }, + { + "epoch": 0.8113358169238957, + "grad_norm": 0.2852303683757782, + "learning_rate": 1.885600877296023e-07, + "loss": 0.22, + "step": 3049 + }, + { + "epoch": 0.8116019159127196, + "grad_norm": 0.2789546549320221, + "learning_rate": 1.885522426675159e-07, + "loss": 0.2215, + "step": 3050 + }, + { + "epoch": 0.8118680149015434, + "grad_norm": 0.28555211424827576, + "learning_rate": 1.8854439507973958e-07, + "loss": 0.2124, + "step": 3051 + }, + { + "epoch": 0.8121341138903673, + "grad_norm": 0.2780151963233948, + "learning_rate": 1.8853654496649715e-07, + "loss": 0.221, + "step": 3052 + }, + { + "epoch": 0.8124002128791911, + "grad_norm": 0.28839752078056335, + "learning_rate": 1.8852869232801258e-07, + "loss": 0.1951, + "step": 3053 + }, + { + "epoch": 0.8126663118680149, + "grad_norm": 0.3158338665962219, + "learning_rate": 1.8852083716450983e-07, + "loss": 0.2153, + "step": 3054 + }, + { + "epoch": 0.8129324108568388, + "grad_norm": 0.34862077236175537, + "learning_rate": 1.8851297947621285e-07, + "loss": 0.2096, + "step": 3055 + }, + { + "epoch": 0.8131985098456626, + "grad_norm": 0.4302910268306732, + "learning_rate": 1.8850511926334587e-07, + "loss": 0.2262, + "step": 3056 + }, + { + "epoch": 0.8134646088344865, + "grad_norm": 0.25817909836769104, + "learning_rate": 1.8849725652613303e-07, + "loss": 0.2211, + "step": 3057 + }, + { + "epoch": 0.8137307078233103, + "grad_norm": 0.28348812460899353, + "learning_rate": 1.884893912647986e-07, + "loss": 0.2112, + "step": 3058 + }, + { + "epoch": 0.8139968068121342, + "grad_norm": 0.3662128448486328, + "learning_rate": 1.8848152347956692e-07, + "loss": 0.1975, + "step": 3059 + }, + { + "epoch": 0.814262905800958, + "grad_norm": 0.30291590094566345, + "learning_rate": 1.8847365317066238e-07, + "loss": 0.2086, + "step": 3060 + }, + { + "epoch": 0.8145290047897819, + "grad_norm": 0.2848469316959381, + "learning_rate": 1.8846578033830947e-07, + "loss": 0.2133, + "step": 3061 + }, + { + "epoch": 0.8147951037786056, + "grad_norm": 0.4186241626739502, + "learning_rate": 1.8845790498273274e-07, + "loss": 0.219, + "step": 3062 + }, + { + "epoch": 0.8150612027674294, + "grad_norm": 0.2527655363082886, + "learning_rate": 1.8845002710415677e-07, + "loss": 0.2068, + "step": 3063 + }, + { + "epoch": 0.8153273017562533, + "grad_norm": 0.2685460150241852, + "learning_rate": 1.884421467028063e-07, + "loss": 0.206, + "step": 3064 + }, + { + "epoch": 0.8155934007450771, + "grad_norm": 0.2743474543094635, + "learning_rate": 1.8843426377890612e-07, + "loss": 0.2044, + "step": 3065 + }, + { + "epoch": 0.815859499733901, + "grad_norm": 0.263254314661026, + "learning_rate": 1.88426378332681e-07, + "loss": 0.206, + "step": 3066 + }, + { + "epoch": 0.8161255987227248, + "grad_norm": 0.2494724988937378, + "learning_rate": 1.884184903643559e-07, + "loss": 0.2081, + "step": 3067 + }, + { + "epoch": 0.8163916977115487, + "grad_norm": 0.26134827733039856, + "learning_rate": 1.8841059987415574e-07, + "loss": 0.2112, + "step": 3068 + }, + { + "epoch": 0.8166577967003725, + "grad_norm": 0.27136892080307007, + "learning_rate": 1.8840270686230563e-07, + "loss": 0.2144, + "step": 3069 + }, + { + "epoch": 0.8169238956891964, + "grad_norm": 0.35941118001937866, + "learning_rate": 1.883948113290307e-07, + "loss": 0.2099, + "step": 3070 + }, + { + "epoch": 0.8171899946780202, + "grad_norm": 0.3979637920856476, + "learning_rate": 1.883869132745561e-07, + "loss": 0.242, + "step": 3071 + }, + { + "epoch": 0.817456093666844, + "grad_norm": 0.275285005569458, + "learning_rate": 1.8837901269910713e-07, + "loss": 0.1985, + "step": 3072 + }, + { + "epoch": 0.8177221926556679, + "grad_norm": 0.349896639585495, + "learning_rate": 1.8837110960290912e-07, + "loss": 0.2153, + "step": 3073 + }, + { + "epoch": 0.8179882916444917, + "grad_norm": 0.24770982563495636, + "learning_rate": 1.883632039861875e-07, + "loss": 0.1976, + "step": 3074 + }, + { + "epoch": 0.8182543906333156, + "grad_norm": 0.25320395827293396, + "learning_rate": 1.8835529584916774e-07, + "loss": 0.1847, + "step": 3075 + }, + { + "epoch": 0.8185204896221394, + "grad_norm": 0.26829275488853455, + "learning_rate": 1.883473851920754e-07, + "loss": 0.1918, + "step": 3076 + }, + { + "epoch": 0.8187865886109633, + "grad_norm": 0.4106839895248413, + "learning_rate": 1.8833947201513608e-07, + "loss": 0.2518, + "step": 3077 + }, + { + "epoch": 0.8190526875997871, + "grad_norm": 0.2683716118335724, + "learning_rate": 1.883315563185755e-07, + "loss": 0.1813, + "step": 3078 + }, + { + "epoch": 0.819318786588611, + "grad_norm": 0.350109338760376, + "learning_rate": 1.8832363810261941e-07, + "loss": 0.2121, + "step": 3079 + }, + { + "epoch": 0.8195848855774348, + "grad_norm": 0.27035242319107056, + "learning_rate": 1.8831571736749375e-07, + "loss": 0.1747, + "step": 3080 + }, + { + "epoch": 0.8198509845662586, + "grad_norm": 0.4347514808177948, + "learning_rate": 1.8830779411342432e-07, + "loss": 0.2447, + "step": 3081 + }, + { + "epoch": 0.8201170835550825, + "grad_norm": 0.4784340560436249, + "learning_rate": 1.8829986834063717e-07, + "loss": 0.2081, + "step": 3082 + }, + { + "epoch": 0.8203831825439063, + "grad_norm": 0.40302377939224243, + "learning_rate": 1.8829194004935833e-07, + "loss": 0.2063, + "step": 3083 + }, + { + "epoch": 0.8206492815327302, + "grad_norm": 0.3600340485572815, + "learning_rate": 1.8828400923981395e-07, + "loss": 0.2037, + "step": 3084 + }, + { + "epoch": 0.820915380521554, + "grad_norm": 0.38856568932533264, + "learning_rate": 1.882760759122302e-07, + "loss": 0.2265, + "step": 3085 + }, + { + "epoch": 0.8211814795103779, + "grad_norm": 0.2864875793457031, + "learning_rate": 1.8826814006683342e-07, + "loss": 0.1901, + "step": 3086 + }, + { + "epoch": 0.8214475784992017, + "grad_norm": 0.2666982412338257, + "learning_rate": 1.882602017038499e-07, + "loss": 0.2095, + "step": 3087 + }, + { + "epoch": 0.8217136774880256, + "grad_norm": 0.24486184120178223, + "learning_rate": 1.8825226082350608e-07, + "loss": 0.1843, + "step": 3088 + }, + { + "epoch": 0.8219797764768494, + "grad_norm": 0.3335663676261902, + "learning_rate": 1.8824431742602842e-07, + "loss": 0.2232, + "step": 3089 + }, + { + "epoch": 0.8222458754656732, + "grad_norm": 0.28137505054473877, + "learning_rate": 1.8823637151164352e-07, + "loss": 0.1882, + "step": 3090 + }, + { + "epoch": 0.8225119744544971, + "grad_norm": 0.2889452278614044, + "learning_rate": 1.8822842308057801e-07, + "loss": 0.2015, + "step": 3091 + }, + { + "epoch": 0.8227780734433209, + "grad_norm": 0.2880636751651764, + "learning_rate": 1.8822047213305857e-07, + "loss": 0.2187, + "step": 3092 + }, + { + "epoch": 0.8230441724321448, + "grad_norm": 0.37237676978111267, + "learning_rate": 1.8821251866931198e-07, + "loss": 0.2065, + "step": 3093 + }, + { + "epoch": 0.8233102714209686, + "grad_norm": 0.2968027889728546, + "learning_rate": 1.8820456268956514e-07, + "loss": 0.209, + "step": 3094 + }, + { + "epoch": 0.8235763704097925, + "grad_norm": 0.2624080181121826, + "learning_rate": 1.881966041940449e-07, + "loss": 0.2036, + "step": 3095 + }, + { + "epoch": 0.8238424693986163, + "grad_norm": 0.3100726008415222, + "learning_rate": 1.8818864318297827e-07, + "loss": 0.188, + "step": 3096 + }, + { + "epoch": 0.8241085683874402, + "grad_norm": 0.2932191491127014, + "learning_rate": 1.8818067965659237e-07, + "loss": 0.2237, + "step": 3097 + }, + { + "epoch": 0.824374667376264, + "grad_norm": 0.33756887912750244, + "learning_rate": 1.8817271361511429e-07, + "loss": 0.2025, + "step": 3098 + }, + { + "epoch": 0.8246407663650878, + "grad_norm": 0.296474814414978, + "learning_rate": 1.881647450587712e-07, + "loss": 0.1995, + "step": 3099 + }, + { + "epoch": 0.8249068653539117, + "grad_norm": 0.2920937240123749, + "learning_rate": 1.8815677398779045e-07, + "loss": 0.2183, + "step": 3100 + }, + { + "epoch": 0.8251729643427355, + "grad_norm": 0.31715530157089233, + "learning_rate": 1.8814880040239938e-07, + "loss": 0.22, + "step": 3101 + }, + { + "epoch": 0.8254390633315594, + "grad_norm": 0.2427847683429718, + "learning_rate": 1.8814082430282538e-07, + "loss": 0.1968, + "step": 3102 + }, + { + "epoch": 0.8257051623203832, + "grad_norm": 0.4434344470500946, + "learning_rate": 1.8813284568929596e-07, + "loss": 0.217, + "step": 3103 + }, + { + "epoch": 0.8259712613092071, + "grad_norm": 0.2606218755245209, + "learning_rate": 1.881248645620387e-07, + "loss": 0.2005, + "step": 3104 + }, + { + "epoch": 0.8262373602980309, + "grad_norm": 0.308113157749176, + "learning_rate": 1.8811688092128119e-07, + "loss": 0.2121, + "step": 3105 + }, + { + "epoch": 0.8265034592868548, + "grad_norm": 0.24559184908866882, + "learning_rate": 1.881088947672512e-07, + "loss": 0.1923, + "step": 3106 + }, + { + "epoch": 0.8267695582756786, + "grad_norm": 0.27757376432418823, + "learning_rate": 1.8810090610017647e-07, + "loss": 0.201, + "step": 3107 + }, + { + "epoch": 0.8270356572645023, + "grad_norm": 0.33325687050819397, + "learning_rate": 1.8809291492028488e-07, + "loss": 0.2079, + "step": 3108 + }, + { + "epoch": 0.8273017562533262, + "grad_norm": 0.3883524537086487, + "learning_rate": 1.8808492122780436e-07, + "loss": 0.2203, + "step": 3109 + }, + { + "epoch": 0.82756785524215, + "grad_norm": 0.25379785895347595, + "learning_rate": 1.8807692502296284e-07, + "loss": 0.1854, + "step": 3110 + }, + { + "epoch": 0.827833954230974, + "grad_norm": 0.2576816976070404, + "learning_rate": 1.8806892630598847e-07, + "loss": 0.2042, + "step": 3111 + }, + { + "epoch": 0.8281000532197977, + "grad_norm": 0.27059873938560486, + "learning_rate": 1.8806092507710938e-07, + "loss": 0.2066, + "step": 3112 + }, + { + "epoch": 0.8283661522086216, + "grad_norm": 0.4954772889614105, + "learning_rate": 1.8805292133655372e-07, + "loss": 0.2354, + "step": 3113 + }, + { + "epoch": 0.8286322511974454, + "grad_norm": 0.3333410620689392, + "learning_rate": 1.8804491508454982e-07, + "loss": 0.2053, + "step": 3114 + }, + { + "epoch": 0.8288983501862693, + "grad_norm": 0.26223310828208923, + "learning_rate": 1.8803690632132603e-07, + "loss": 0.1843, + "step": 3115 + }, + { + "epoch": 0.8291644491750931, + "grad_norm": 0.44451218843460083, + "learning_rate": 1.8802889504711077e-07, + "loss": 0.2249, + "step": 3116 + }, + { + "epoch": 0.8294305481639169, + "grad_norm": 0.2707904875278473, + "learning_rate": 1.8802088126213256e-07, + "loss": 0.2196, + "step": 3117 + }, + { + "epoch": 0.8296966471527408, + "grad_norm": 0.24819175899028778, + "learning_rate": 1.8801286496661993e-07, + "loss": 0.1991, + "step": 3118 + }, + { + "epoch": 0.8299627461415646, + "grad_norm": 0.3478013575077057, + "learning_rate": 1.8800484616080152e-07, + "loss": 0.2185, + "step": 3119 + }, + { + "epoch": 0.8302288451303885, + "grad_norm": 0.2840394973754883, + "learning_rate": 1.879968248449061e-07, + "loss": 0.2196, + "step": 3120 + }, + { + "epoch": 0.8304949441192123, + "grad_norm": 0.28322577476501465, + "learning_rate": 1.8798880101916242e-07, + "loss": 0.2006, + "step": 3121 + }, + { + "epoch": 0.8307610431080362, + "grad_norm": 0.32486775517463684, + "learning_rate": 1.8798077468379932e-07, + "loss": 0.199, + "step": 3122 + }, + { + "epoch": 0.83102714209686, + "grad_norm": 0.26204365491867065, + "learning_rate": 1.8797274583904573e-07, + "loss": 0.1924, + "step": 3123 + }, + { + "epoch": 0.8312932410856839, + "grad_norm": 0.2856151759624481, + "learning_rate": 1.8796471448513068e-07, + "loss": 0.2025, + "step": 3124 + }, + { + "epoch": 0.8315593400745077, + "grad_norm": 0.2831627130508423, + "learning_rate": 1.8795668062228318e-07, + "loss": 0.213, + "step": 3125 + }, + { + "epoch": 0.8318254390633315, + "grad_norm": 0.25733673572540283, + "learning_rate": 1.8794864425073244e-07, + "loss": 0.2063, + "step": 3126 + }, + { + "epoch": 0.8320915380521554, + "grad_norm": 0.24903136491775513, + "learning_rate": 1.8794060537070765e-07, + "loss": 0.1726, + "step": 3127 + }, + { + "epoch": 0.8323576370409792, + "grad_norm": 0.2694866359233856, + "learning_rate": 1.879325639824381e-07, + "loss": 0.2106, + "step": 3128 + }, + { + "epoch": 0.8326237360298031, + "grad_norm": 0.3309517502784729, + "learning_rate": 1.879245200861531e-07, + "loss": 0.2017, + "step": 3129 + }, + { + "epoch": 0.8328898350186269, + "grad_norm": 0.29588282108306885, + "learning_rate": 1.8791647368208214e-07, + "loss": 0.2184, + "step": 3130 + }, + { + "epoch": 0.8331559340074508, + "grad_norm": 0.4584406912326813, + "learning_rate": 1.879084247704547e-07, + "loss": 0.2383, + "step": 3131 + }, + { + "epoch": 0.8334220329962746, + "grad_norm": 0.2532344162464142, + "learning_rate": 1.8790037335150034e-07, + "loss": 0.1953, + "step": 3132 + }, + { + "epoch": 0.8336881319850985, + "grad_norm": 0.2624218165874481, + "learning_rate": 1.8789231942544873e-07, + "loss": 0.2168, + "step": 3133 + }, + { + "epoch": 0.8339542309739223, + "grad_norm": 0.30386990308761597, + "learning_rate": 1.8788426299252953e-07, + "loss": 0.2011, + "step": 3134 + }, + { + "epoch": 0.8342203299627461, + "grad_norm": 0.25189733505249023, + "learning_rate": 1.8787620405297257e-07, + "loss": 0.2066, + "step": 3135 + }, + { + "epoch": 0.83448642895157, + "grad_norm": 0.3300282955169678, + "learning_rate": 1.8786814260700772e-07, + "loss": 0.2139, + "step": 3136 + }, + { + "epoch": 0.8347525279403938, + "grad_norm": 0.390593022108078, + "learning_rate": 1.8786007865486487e-07, + "loss": 0.2152, + "step": 3137 + }, + { + "epoch": 0.8350186269292177, + "grad_norm": 0.2612099349498749, + "learning_rate": 1.8785201219677404e-07, + "loss": 0.1848, + "step": 3138 + }, + { + "epoch": 0.8352847259180415, + "grad_norm": 0.7412945032119751, + "learning_rate": 1.878439432329653e-07, + "loss": 0.2253, + "step": 3139 + }, + { + "epoch": 0.8355508249068654, + "grad_norm": 0.32267361879348755, + "learning_rate": 1.8783587176366878e-07, + "loss": 0.2265, + "step": 3140 + }, + { + "epoch": 0.8358169238956892, + "grad_norm": 0.3054623305797577, + "learning_rate": 1.878277977891147e-07, + "loss": 0.2146, + "step": 3141 + }, + { + "epoch": 0.8360830228845131, + "grad_norm": 0.29831159114837646, + "learning_rate": 1.8781972130953336e-07, + "loss": 0.2127, + "step": 3142 + }, + { + "epoch": 0.8363491218733369, + "grad_norm": 0.26219940185546875, + "learning_rate": 1.8781164232515513e-07, + "loss": 0.2013, + "step": 3143 + }, + { + "epoch": 0.8366152208621607, + "grad_norm": 0.27327850461006165, + "learning_rate": 1.8780356083621038e-07, + "loss": 0.207, + "step": 3144 + }, + { + "epoch": 0.8368813198509846, + "grad_norm": 0.27628228068351746, + "learning_rate": 1.8779547684292968e-07, + "loss": 0.205, + "step": 3145 + }, + { + "epoch": 0.8371474188398084, + "grad_norm": 0.2735518515110016, + "learning_rate": 1.877873903455436e-07, + "loss": 0.2074, + "step": 3146 + }, + { + "epoch": 0.8374135178286323, + "grad_norm": 0.3128666579723358, + "learning_rate": 1.877793013442827e-07, + "loss": 0.2096, + "step": 3147 + }, + { + "epoch": 0.8376796168174561, + "grad_norm": 0.4854583442211151, + "learning_rate": 1.8777120983937777e-07, + "loss": 0.2136, + "step": 3148 + }, + { + "epoch": 0.83794571580628, + "grad_norm": 0.25802767276763916, + "learning_rate": 1.8776311583105958e-07, + "loss": 0.2001, + "step": 3149 + }, + { + "epoch": 0.8382118147951038, + "grad_norm": 0.32424142956733704, + "learning_rate": 1.8775501931955895e-07, + "loss": 0.2059, + "step": 3150 + }, + { + "epoch": 0.8384779137839277, + "grad_norm": 0.3798828423023224, + "learning_rate": 1.8774692030510687e-07, + "loss": 0.2144, + "step": 3151 + }, + { + "epoch": 0.8387440127727515, + "grad_norm": 0.44497063755989075, + "learning_rate": 1.8773881878793433e-07, + "loss": 0.233, + "step": 3152 + }, + { + "epoch": 0.8390101117615754, + "grad_norm": 0.38006171584129333, + "learning_rate": 1.8773071476827238e-07, + "loss": 0.2215, + "step": 3153 + }, + { + "epoch": 0.8392762107503992, + "grad_norm": 0.27157092094421387, + "learning_rate": 1.8772260824635215e-07, + "loss": 0.207, + "step": 3154 + }, + { + "epoch": 0.839542309739223, + "grad_norm": 0.2866951823234558, + "learning_rate": 1.8771449922240488e-07, + "loss": 0.2094, + "step": 3155 + }, + { + "epoch": 0.8398084087280469, + "grad_norm": 0.37730738520622253, + "learning_rate": 1.8770638769666186e-07, + "loss": 0.1947, + "step": 3156 + }, + { + "epoch": 0.8400745077168706, + "grad_norm": 0.27862659096717834, + "learning_rate": 1.876982736693544e-07, + "loss": 0.2099, + "step": 3157 + }, + { + "epoch": 0.8403406067056945, + "grad_norm": 0.28448641300201416, + "learning_rate": 1.8769015714071398e-07, + "loss": 0.1962, + "step": 3158 + }, + { + "epoch": 0.8406067056945183, + "grad_norm": 0.2864014804363251, + "learning_rate": 1.876820381109721e-07, + "loss": 0.2217, + "step": 3159 + }, + { + "epoch": 0.8408728046833422, + "grad_norm": 0.2873334586620331, + "learning_rate": 1.876739165803603e-07, + "loss": 0.189, + "step": 3160 + }, + { + "epoch": 0.841138903672166, + "grad_norm": 0.31945234537124634, + "learning_rate": 1.8766579254911023e-07, + "loss": 0.2067, + "step": 3161 + }, + { + "epoch": 0.8414050026609899, + "grad_norm": 0.2967507243156433, + "learning_rate": 1.8765766601745362e-07, + "loss": 0.2203, + "step": 3162 + }, + { + "epoch": 0.8416711016498137, + "grad_norm": 0.28185316920280457, + "learning_rate": 1.8764953698562223e-07, + "loss": 0.2066, + "step": 3163 + }, + { + "epoch": 0.8419372006386375, + "grad_norm": 0.31052809953689575, + "learning_rate": 1.8764140545384797e-07, + "loss": 0.2039, + "step": 3164 + }, + { + "epoch": 0.8422032996274614, + "grad_norm": 0.28413331508636475, + "learning_rate": 1.8763327142236268e-07, + "loss": 0.2241, + "step": 3165 + }, + { + "epoch": 0.8424693986162852, + "grad_norm": 0.30539098381996155, + "learning_rate": 1.8762513489139847e-07, + "loss": 0.2202, + "step": 3166 + }, + { + "epoch": 0.8427354976051091, + "grad_norm": 0.25442200899124146, + "learning_rate": 1.876169958611873e-07, + "loss": 0.2042, + "step": 3167 + }, + { + "epoch": 0.8430015965939329, + "grad_norm": 0.25398534536361694, + "learning_rate": 1.876088543319614e-07, + "loss": 0.187, + "step": 3168 + }, + { + "epoch": 0.8432676955827568, + "grad_norm": 0.3837607800960541, + "learning_rate": 1.8760071030395292e-07, + "loss": 0.2194, + "step": 3169 + }, + { + "epoch": 0.8435337945715806, + "grad_norm": 0.2721881568431854, + "learning_rate": 1.8759256377739417e-07, + "loss": 0.2068, + "step": 3170 + }, + { + "epoch": 0.8437998935604045, + "grad_norm": 0.2791810631752014, + "learning_rate": 1.875844147525175e-07, + "loss": 0.214, + "step": 3171 + }, + { + "epoch": 0.8440659925492283, + "grad_norm": 0.2830832898616791, + "learning_rate": 1.8757626322955536e-07, + "loss": 0.2279, + "step": 3172 + }, + { + "epoch": 0.8443320915380521, + "grad_norm": 0.3974735736846924, + "learning_rate": 1.8756810920874023e-07, + "loss": 0.2276, + "step": 3173 + }, + { + "epoch": 0.844598190526876, + "grad_norm": 0.2736077308654785, + "learning_rate": 1.8755995269030467e-07, + "loss": 0.201, + "step": 3174 + }, + { + "epoch": 0.8448642895156998, + "grad_norm": 0.3758902847766876, + "learning_rate": 1.875517936744813e-07, + "loss": 0.2227, + "step": 3175 + }, + { + "epoch": 0.8451303885045237, + "grad_norm": 0.3836982548236847, + "learning_rate": 1.8754363216150292e-07, + "loss": 0.2327, + "step": 3176 + }, + { + "epoch": 0.8453964874933475, + "grad_norm": 0.23347529768943787, + "learning_rate": 1.8753546815160222e-07, + "loss": 0.1831, + "step": 3177 + }, + { + "epoch": 0.8456625864821714, + "grad_norm": 0.3466804325580597, + "learning_rate": 1.8752730164501206e-07, + "loss": 0.2057, + "step": 3178 + }, + { + "epoch": 0.8459286854709952, + "grad_norm": 0.4357702136039734, + "learning_rate": 1.8751913264196543e-07, + "loss": 0.2151, + "step": 3179 + }, + { + "epoch": 0.8461947844598191, + "grad_norm": 0.3976533114910126, + "learning_rate": 1.8751096114269528e-07, + "loss": 0.2029, + "step": 3180 + }, + { + "epoch": 0.8464608834486429, + "grad_norm": 0.32784852385520935, + "learning_rate": 1.8750278714743472e-07, + "loss": 0.1876, + "step": 3181 + }, + { + "epoch": 0.8467269824374667, + "grad_norm": 0.2596149444580078, + "learning_rate": 1.874946106564168e-07, + "loss": 0.2132, + "step": 3182 + }, + { + "epoch": 0.8469930814262906, + "grad_norm": 0.36642131209373474, + "learning_rate": 1.874864316698748e-07, + "loss": 0.1997, + "step": 3183 + }, + { + "epoch": 0.8472591804151144, + "grad_norm": 0.25976449251174927, + "learning_rate": 1.8747825018804202e-07, + "loss": 0.1957, + "step": 3184 + }, + { + "epoch": 0.8475252794039383, + "grad_norm": 0.3173132836818695, + "learning_rate": 1.8747006621115176e-07, + "loss": 0.218, + "step": 3185 + }, + { + "epoch": 0.8477913783927621, + "grad_norm": 0.2524561882019043, + "learning_rate": 1.8746187973943747e-07, + "loss": 0.1929, + "step": 3186 + }, + { + "epoch": 0.848057477381586, + "grad_norm": 0.33913692831993103, + "learning_rate": 1.8745369077313265e-07, + "loss": 0.2189, + "step": 3187 + }, + { + "epoch": 0.8483235763704098, + "grad_norm": 0.3234288990497589, + "learning_rate": 1.8744549931247083e-07, + "loss": 0.2192, + "step": 3188 + }, + { + "epoch": 0.8485896753592337, + "grad_norm": 0.2935561537742615, + "learning_rate": 1.8743730535768566e-07, + "loss": 0.1953, + "step": 3189 + }, + { + "epoch": 0.8488557743480575, + "grad_norm": 0.31689485907554626, + "learning_rate": 1.874291089090109e-07, + "loss": 0.208, + "step": 3190 + }, + { + "epoch": 0.8491218733368813, + "grad_norm": 0.26019495725631714, + "learning_rate": 1.8742090996668025e-07, + "loss": 0.2025, + "step": 3191 + }, + { + "epoch": 0.8493879723257052, + "grad_norm": 0.2373671680688858, + "learning_rate": 1.8741270853092762e-07, + "loss": 0.1779, + "step": 3192 + }, + { + "epoch": 0.849654071314529, + "grad_norm": 0.36015117168426514, + "learning_rate": 1.874045046019869e-07, + "loss": 0.2326, + "step": 3193 + }, + { + "epoch": 0.8499201703033529, + "grad_norm": 0.33495208621025085, + "learning_rate": 1.873962981800921e-07, + "loss": 0.2095, + "step": 3194 + }, + { + "epoch": 0.8501862692921767, + "grad_norm": 0.2859368622303009, + "learning_rate": 1.8738808926547732e-07, + "loss": 0.2175, + "step": 3195 + }, + { + "epoch": 0.8504523682810006, + "grad_norm": 0.3947612941265106, + "learning_rate": 1.873798778583766e-07, + "loss": 0.2187, + "step": 3196 + }, + { + "epoch": 0.8507184672698244, + "grad_norm": 0.35280972719192505, + "learning_rate": 1.8737166395902422e-07, + "loss": 0.1956, + "step": 3197 + }, + { + "epoch": 0.8509845662586483, + "grad_norm": 0.2758723795413971, + "learning_rate": 1.8736344756765444e-07, + "loss": 0.1889, + "step": 3198 + }, + { + "epoch": 0.8512506652474721, + "grad_norm": 0.2651209533214569, + "learning_rate": 1.8735522868450157e-07, + "loss": 0.1938, + "step": 3199 + }, + { + "epoch": 0.8515167642362959, + "grad_norm": 0.38422486186027527, + "learning_rate": 1.8734700730980012e-07, + "loss": 0.2227, + "step": 3200 + }, + { + "epoch": 0.8517828632251198, + "grad_norm": 0.3304511308670044, + "learning_rate": 1.8733878344378448e-07, + "loss": 0.2014, + "step": 3201 + }, + { + "epoch": 0.8520489622139436, + "grad_norm": 0.3540647029876709, + "learning_rate": 1.8733055708668925e-07, + "loss": 0.2103, + "step": 3202 + }, + { + "epoch": 0.8523150612027675, + "grad_norm": 0.32136428356170654, + "learning_rate": 1.873223282387491e-07, + "loss": 0.2184, + "step": 3203 + }, + { + "epoch": 0.8525811601915912, + "grad_norm": 0.26667290925979614, + "learning_rate": 1.8731409690019867e-07, + "loss": 0.1972, + "step": 3204 + }, + { + "epoch": 0.8528472591804152, + "grad_norm": 0.39635229110717773, + "learning_rate": 1.873058630712728e-07, + "loss": 0.2383, + "step": 3205 + }, + { + "epoch": 0.8531133581692389, + "grad_norm": 0.2863321304321289, + "learning_rate": 1.8729762675220632e-07, + "loss": 0.2107, + "step": 3206 + }, + { + "epoch": 0.8533794571580628, + "grad_norm": 0.37099578976631165, + "learning_rate": 1.8728938794323406e-07, + "loss": 0.2363, + "step": 3207 + }, + { + "epoch": 0.8536455561468866, + "grad_norm": 0.2966608703136444, + "learning_rate": 1.8728114664459111e-07, + "loss": 0.1995, + "step": 3208 + }, + { + "epoch": 0.8539116551357104, + "grad_norm": 0.2864580452442169, + "learning_rate": 1.872729028565125e-07, + "loss": 0.2073, + "step": 3209 + }, + { + "epoch": 0.8541777541245343, + "grad_norm": 0.2772403955459595, + "learning_rate": 1.8726465657923334e-07, + "loss": 0.2044, + "step": 3210 + }, + { + "epoch": 0.8544438531133581, + "grad_norm": 0.33832597732543945, + "learning_rate": 1.8725640781298883e-07, + "loss": 0.2108, + "step": 3211 + }, + { + "epoch": 0.854709952102182, + "grad_norm": 0.3023662269115448, + "learning_rate": 1.872481565580143e-07, + "loss": 0.2024, + "step": 3212 + }, + { + "epoch": 0.8549760510910058, + "grad_norm": 0.3493943512439728, + "learning_rate": 1.8723990281454501e-07, + "loss": 0.2016, + "step": 3213 + }, + { + "epoch": 0.8552421500798297, + "grad_norm": 0.49400272965431213, + "learning_rate": 1.8723164658281643e-07, + "loss": 0.235, + "step": 3214 + }, + { + "epoch": 0.8555082490686535, + "grad_norm": 0.2818351686000824, + "learning_rate": 1.87223387863064e-07, + "loss": 0.1961, + "step": 3215 + }, + { + "epoch": 0.8557743480574774, + "grad_norm": 0.26858359575271606, + "learning_rate": 1.8721512665552336e-07, + "loss": 0.2087, + "step": 3216 + }, + { + "epoch": 0.8560404470463012, + "grad_norm": 0.27293866872787476, + "learning_rate": 1.8720686296043003e-07, + "loss": 0.2004, + "step": 3217 + }, + { + "epoch": 0.856306546035125, + "grad_norm": 0.257591187953949, + "learning_rate": 1.8719859677801977e-07, + "loss": 0.1853, + "step": 3218 + }, + { + "epoch": 0.8565726450239489, + "grad_norm": 0.3008226752281189, + "learning_rate": 1.8719032810852832e-07, + "loss": 0.2124, + "step": 3219 + }, + { + "epoch": 0.8568387440127727, + "grad_norm": 0.339061439037323, + "learning_rate": 1.8718205695219154e-07, + "loss": 0.2005, + "step": 3220 + }, + { + "epoch": 0.8571048430015966, + "grad_norm": 0.2596091330051422, + "learning_rate": 1.8717378330924534e-07, + "loss": 0.1814, + "step": 3221 + }, + { + "epoch": 0.8573709419904204, + "grad_norm": 0.2581021785736084, + "learning_rate": 1.871655071799257e-07, + "loss": 0.2129, + "step": 3222 + }, + { + "epoch": 0.8576370409792443, + "grad_norm": 0.274890661239624, + "learning_rate": 1.8715722856446865e-07, + "loss": 0.1947, + "step": 3223 + }, + { + "epoch": 0.8579031399680681, + "grad_norm": 0.34793809056282043, + "learning_rate": 1.8714894746311034e-07, + "loss": 0.2054, + "step": 3224 + }, + { + "epoch": 0.858169238956892, + "grad_norm": 0.4190228581428528, + "learning_rate": 1.8714066387608693e-07, + "loss": 0.2028, + "step": 3225 + }, + { + "epoch": 0.8584353379457158, + "grad_norm": 0.2737067937850952, + "learning_rate": 1.8713237780363477e-07, + "loss": 0.2211, + "step": 3226 + }, + { + "epoch": 0.8587014369345396, + "grad_norm": 0.28446653485298157, + "learning_rate": 1.8712408924599007e-07, + "loss": 0.1954, + "step": 3227 + }, + { + "epoch": 0.8589675359233635, + "grad_norm": 0.2738778591156006, + "learning_rate": 1.8711579820338931e-07, + "loss": 0.2016, + "step": 3228 + }, + { + "epoch": 0.8592336349121873, + "grad_norm": 0.26594865322113037, + "learning_rate": 1.8710750467606897e-07, + "loss": 0.2044, + "step": 3229 + }, + { + "epoch": 0.8594997339010112, + "grad_norm": 0.2653839588165283, + "learning_rate": 1.870992086642656e-07, + "loss": 0.1939, + "step": 3230 + }, + { + "epoch": 0.859765832889835, + "grad_norm": 0.2540951669216156, + "learning_rate": 1.8709091016821578e-07, + "loss": 0.1944, + "step": 3231 + }, + { + "epoch": 0.8600319318786589, + "grad_norm": 0.4525187015533447, + "learning_rate": 1.8708260918815624e-07, + "loss": 0.2334, + "step": 3232 + }, + { + "epoch": 0.8602980308674827, + "grad_norm": 0.339508980512619, + "learning_rate": 1.8707430572432373e-07, + "loss": 0.2034, + "step": 3233 + }, + { + "epoch": 0.8605641298563066, + "grad_norm": 0.2878727912902832, + "learning_rate": 1.8706599977695506e-07, + "loss": 0.2129, + "step": 3234 + }, + { + "epoch": 0.8608302288451304, + "grad_norm": 0.411178320646286, + "learning_rate": 1.8705769134628718e-07, + "loss": 0.2113, + "step": 3235 + }, + { + "epoch": 0.8610963278339542, + "grad_norm": 0.3952265977859497, + "learning_rate": 1.87049380432557e-07, + "loss": 0.2406, + "step": 3236 + }, + { + "epoch": 0.8613624268227781, + "grad_norm": 0.33466994762420654, + "learning_rate": 1.870410670360016e-07, + "loss": 0.2206, + "step": 3237 + }, + { + "epoch": 0.8616285258116019, + "grad_norm": 0.2715080678462982, + "learning_rate": 1.870327511568581e-07, + "loss": 0.2138, + "step": 3238 + }, + { + "epoch": 0.8618946248004258, + "grad_norm": 0.27624526619911194, + "learning_rate": 1.8702443279536367e-07, + "loss": 0.1947, + "step": 3239 + }, + { + "epoch": 0.8621607237892496, + "grad_norm": 0.272235244512558, + "learning_rate": 1.8701611195175557e-07, + "loss": 0.2001, + "step": 3240 + }, + { + "epoch": 0.8624268227780735, + "grad_norm": 0.31649965047836304, + "learning_rate": 1.8700778862627115e-07, + "loss": 0.2175, + "step": 3241 + }, + { + "epoch": 0.8626929217668973, + "grad_norm": 0.3190178871154785, + "learning_rate": 1.869994628191478e-07, + "loss": 0.2109, + "step": 3242 + }, + { + "epoch": 0.8629590207557212, + "grad_norm": 0.28721022605895996, + "learning_rate": 1.8699113453062294e-07, + "loss": 0.2106, + "step": 3243 + }, + { + "epoch": 0.863225119744545, + "grad_norm": 0.7642039656639099, + "learning_rate": 1.869828037609342e-07, + "loss": 0.2114, + "step": 3244 + }, + { + "epoch": 0.8634912187333688, + "grad_norm": 0.6939727663993835, + "learning_rate": 1.8697447051031911e-07, + "loss": 0.2465, + "step": 3245 + }, + { + "epoch": 0.8637573177221927, + "grad_norm": 0.26591289043426514, + "learning_rate": 1.8696613477901537e-07, + "loss": 0.2099, + "step": 3246 + }, + { + "epoch": 0.8640234167110165, + "grad_norm": 0.38016703724861145, + "learning_rate": 1.8695779656726077e-07, + "loss": 0.2221, + "step": 3247 + }, + { + "epoch": 0.8642895156998404, + "grad_norm": 0.40718117356300354, + "learning_rate": 1.8694945587529306e-07, + "loss": 0.2366, + "step": 3248 + }, + { + "epoch": 0.8645556146886642, + "grad_norm": 0.3670583963394165, + "learning_rate": 1.8694111270335023e-07, + "loss": 0.2046, + "step": 3249 + }, + { + "epoch": 0.8648217136774881, + "grad_norm": 0.2666459083557129, + "learning_rate": 1.8693276705167015e-07, + "loss": 0.2096, + "step": 3250 + }, + { + "epoch": 0.8650878126663119, + "grad_norm": 0.3260599672794342, + "learning_rate": 1.8692441892049092e-07, + "loss": 0.2065, + "step": 3251 + }, + { + "epoch": 0.8653539116551358, + "grad_norm": 0.3396673798561096, + "learning_rate": 1.8691606831005063e-07, + "loss": 0.2193, + "step": 3252 + }, + { + "epoch": 0.8656200106439595, + "grad_norm": 0.2564309537410736, + "learning_rate": 1.8690771522058746e-07, + "loss": 0.1933, + "step": 3253 + }, + { + "epoch": 0.8658861096327833, + "grad_norm": 0.45882806181907654, + "learning_rate": 1.868993596523396e-07, + "loss": 0.2323, + "step": 3254 + }, + { + "epoch": 0.8661522086216072, + "grad_norm": 0.28418606519699097, + "learning_rate": 1.8689100160554547e-07, + "loss": 0.2121, + "step": 3255 + }, + { + "epoch": 0.866418307610431, + "grad_norm": 0.343650758266449, + "learning_rate": 1.8688264108044337e-07, + "loss": 0.2062, + "step": 3256 + }, + { + "epoch": 0.8666844065992549, + "grad_norm": 0.2526421844959259, + "learning_rate": 1.868742780772718e-07, + "loss": 0.1849, + "step": 3257 + }, + { + "epoch": 0.8669505055880787, + "grad_norm": 0.2724807560443878, + "learning_rate": 1.8686591259626927e-07, + "loss": 0.2028, + "step": 3258 + }, + { + "epoch": 0.8672166045769026, + "grad_norm": 0.3235759735107422, + "learning_rate": 1.868575446376744e-07, + "loss": 0.1924, + "step": 3259 + }, + { + "epoch": 0.8674827035657264, + "grad_norm": 0.27810850739479065, + "learning_rate": 1.8684917420172588e-07, + "loss": 0.2187, + "step": 3260 + }, + { + "epoch": 0.8677488025545503, + "grad_norm": 0.41647815704345703, + "learning_rate": 1.8684080128866241e-07, + "loss": 0.2159, + "step": 3261 + }, + { + "epoch": 0.8680149015433741, + "grad_norm": 0.2793427109718323, + "learning_rate": 1.868324258987228e-07, + "loss": 0.2244, + "step": 3262 + }, + { + "epoch": 0.868281000532198, + "grad_norm": 0.2956774830818176, + "learning_rate": 1.8682404803214598e-07, + "loss": 0.2207, + "step": 3263 + }, + { + "epoch": 0.8685470995210218, + "grad_norm": 0.39516714215278625, + "learning_rate": 1.8681566768917084e-07, + "loss": 0.2368, + "step": 3264 + }, + { + "epoch": 0.8688131985098456, + "grad_norm": 0.2560863792896271, + "learning_rate": 1.8680728487003647e-07, + "loss": 0.1888, + "step": 3265 + }, + { + "epoch": 0.8690792974986695, + "grad_norm": 0.27027055621147156, + "learning_rate": 1.8679889957498193e-07, + "loss": 0.208, + "step": 3266 + }, + { + "epoch": 0.8693453964874933, + "grad_norm": 0.2635810077190399, + "learning_rate": 1.8679051180424637e-07, + "loss": 0.1955, + "step": 3267 + }, + { + "epoch": 0.8696114954763172, + "grad_norm": 0.2525891065597534, + "learning_rate": 1.8678212155806906e-07, + "loss": 0.1877, + "step": 3268 + }, + { + "epoch": 0.869877594465141, + "grad_norm": 0.25654932856559753, + "learning_rate": 1.867737288366893e-07, + "loss": 0.2112, + "step": 3269 + }, + { + "epoch": 0.8701436934539649, + "grad_norm": 0.24842193722724915, + "learning_rate": 1.8676533364034646e-07, + "loss": 0.187, + "step": 3270 + }, + { + "epoch": 0.8704097924427887, + "grad_norm": 0.2588275372982025, + "learning_rate": 1.8675693596927998e-07, + "loss": 0.2053, + "step": 3271 + }, + { + "epoch": 0.8706758914316126, + "grad_norm": 0.2543754279613495, + "learning_rate": 1.867485358237294e-07, + "loss": 0.1941, + "step": 3272 + }, + { + "epoch": 0.8709419904204364, + "grad_norm": 0.2813057601451874, + "learning_rate": 1.8674013320393427e-07, + "loss": 0.1926, + "step": 3273 + }, + { + "epoch": 0.8712080894092602, + "grad_norm": 0.3889528512954712, + "learning_rate": 1.8673172811013432e-07, + "loss": 0.2276, + "step": 3274 + }, + { + "epoch": 0.8714741883980841, + "grad_norm": 0.3300858438014984, + "learning_rate": 1.867233205425692e-07, + "loss": 0.215, + "step": 3275 + }, + { + "epoch": 0.8717402873869079, + "grad_norm": 0.34174829721450806, + "learning_rate": 1.8671491050147877e-07, + "loss": 0.2261, + "step": 3276 + }, + { + "epoch": 0.8720063863757318, + "grad_norm": 0.43757739663124084, + "learning_rate": 1.8670649798710288e-07, + "loss": 0.2128, + "step": 3277 + }, + { + "epoch": 0.8722724853645556, + "grad_norm": 0.26348015666007996, + "learning_rate": 1.8669808299968147e-07, + "loss": 0.1913, + "step": 3278 + }, + { + "epoch": 0.8725385843533795, + "grad_norm": 0.32361558079719543, + "learning_rate": 1.8668966553945452e-07, + "loss": 0.2183, + "step": 3279 + }, + { + "epoch": 0.8728046833422033, + "grad_norm": 0.26916489005088806, + "learning_rate": 1.8668124560666217e-07, + "loss": 0.2003, + "step": 3280 + }, + { + "epoch": 0.8730707823310272, + "grad_norm": 0.27192139625549316, + "learning_rate": 1.8667282320154457e-07, + "loss": 0.1976, + "step": 3281 + }, + { + "epoch": 0.873336881319851, + "grad_norm": 0.2672799825668335, + "learning_rate": 1.8666439832434188e-07, + "loss": 0.2038, + "step": 3282 + }, + { + "epoch": 0.8736029803086748, + "grad_norm": 0.2516234815120697, + "learning_rate": 1.8665597097529449e-07, + "loss": 0.197, + "step": 3283 + }, + { + "epoch": 0.8738690792974987, + "grad_norm": 0.3847910165786743, + "learning_rate": 1.866475411546427e-07, + "loss": 0.2068, + "step": 3284 + }, + { + "epoch": 0.8741351782863225, + "grad_norm": 0.2614700198173523, + "learning_rate": 1.8663910886262693e-07, + "loss": 0.179, + "step": 3285 + }, + { + "epoch": 0.8744012772751464, + "grad_norm": 0.24475356936454773, + "learning_rate": 1.8663067409948773e-07, + "loss": 0.1837, + "step": 3286 + }, + { + "epoch": 0.8746673762639702, + "grad_norm": 0.3166518211364746, + "learning_rate": 1.8662223686546568e-07, + "loss": 0.2194, + "step": 3287 + }, + { + "epoch": 0.8749334752527941, + "grad_norm": 0.3342883884906769, + "learning_rate": 1.866137971608014e-07, + "loss": 0.2282, + "step": 3288 + }, + { + "epoch": 0.8751995742416179, + "grad_norm": 0.37748950719833374, + "learning_rate": 1.866053549857356e-07, + "loss": 0.236, + "step": 3289 + }, + { + "epoch": 0.8754656732304418, + "grad_norm": 0.26777151226997375, + "learning_rate": 1.8659691034050913e-07, + "loss": 0.1945, + "step": 3290 + }, + { + "epoch": 0.8757317722192656, + "grad_norm": 0.2956629693508148, + "learning_rate": 1.8658846322536274e-07, + "loss": 0.1924, + "step": 3291 + }, + { + "epoch": 0.8759978712080894, + "grad_norm": 0.2805291414260864, + "learning_rate": 1.8658001364053747e-07, + "loss": 0.2207, + "step": 3292 + }, + { + "epoch": 0.8762639701969133, + "grad_norm": 0.3655954897403717, + "learning_rate": 1.8657156158627428e-07, + "loss": 0.1981, + "step": 3293 + }, + { + "epoch": 0.8765300691857371, + "grad_norm": 0.2683213949203491, + "learning_rate": 1.865631070628142e-07, + "loss": 0.2, + "step": 3294 + }, + { + "epoch": 0.876796168174561, + "grad_norm": 0.2646369934082031, + "learning_rate": 1.865546500703984e-07, + "loss": 0.1752, + "step": 3295 + }, + { + "epoch": 0.8770622671633848, + "grad_norm": 0.3748983144760132, + "learning_rate": 1.8654619060926813e-07, + "loss": 0.2453, + "step": 3296 + }, + { + "epoch": 0.8773283661522087, + "grad_norm": 0.250666081905365, + "learning_rate": 1.865377286796646e-07, + "loss": 0.1818, + "step": 3297 + }, + { + "epoch": 0.8775944651410325, + "grad_norm": 0.3670032024383545, + "learning_rate": 1.865292642818292e-07, + "loss": 0.2203, + "step": 3298 + }, + { + "epoch": 0.8778605641298564, + "grad_norm": 0.2691868245601654, + "learning_rate": 1.8652079741600338e-07, + "loss": 0.2047, + "step": 3299 + }, + { + "epoch": 0.8781266631186802, + "grad_norm": 0.24554947018623352, + "learning_rate": 1.8651232808242856e-07, + "loss": 0.1945, + "step": 3300 + }, + { + "epoch": 0.8783927621075039, + "grad_norm": 0.3189505636692047, + "learning_rate": 1.8650385628134636e-07, + "loss": 0.2165, + "step": 3301 + }, + { + "epoch": 0.8786588610963278, + "grad_norm": 0.29884812235832214, + "learning_rate": 1.8649538201299837e-07, + "loss": 0.2246, + "step": 3302 + }, + { + "epoch": 0.8789249600851516, + "grad_norm": 0.27087873220443726, + "learning_rate": 1.8648690527762634e-07, + "loss": 0.1969, + "step": 3303 + }, + { + "epoch": 0.8791910590739755, + "grad_norm": 0.35482847690582275, + "learning_rate": 1.86478426075472e-07, + "loss": 0.2141, + "step": 3304 + }, + { + "epoch": 0.8794571580627993, + "grad_norm": 0.37198591232299805, + "learning_rate": 1.8646994440677726e-07, + "loss": 0.2231, + "step": 3305 + }, + { + "epoch": 0.8797232570516232, + "grad_norm": 0.2713069021701813, + "learning_rate": 1.8646146027178396e-07, + "loss": 0.1965, + "step": 3306 + }, + { + "epoch": 0.879989356040447, + "grad_norm": 0.2782302796840668, + "learning_rate": 1.8645297367073412e-07, + "loss": 0.1952, + "step": 3307 + }, + { + "epoch": 0.8802554550292709, + "grad_norm": 0.2747561037540436, + "learning_rate": 1.8644448460386978e-07, + "loss": 0.2002, + "step": 3308 + }, + { + "epoch": 0.8805215540180947, + "grad_norm": 0.2811580300331116, + "learning_rate": 1.8643599307143308e-07, + "loss": 0.1911, + "step": 3309 + }, + { + "epoch": 0.8807876530069185, + "grad_norm": 0.3724557161331177, + "learning_rate": 1.864274990736662e-07, + "loss": 0.1896, + "step": 3310 + }, + { + "epoch": 0.8810537519957424, + "grad_norm": 0.2678873836994171, + "learning_rate": 1.8641900261081145e-07, + "loss": 0.2128, + "step": 3311 + }, + { + "epoch": 0.8813198509845662, + "grad_norm": 0.2642453908920288, + "learning_rate": 1.8641050368311112e-07, + "loss": 0.2036, + "step": 3312 + }, + { + "epoch": 0.8815859499733901, + "grad_norm": 0.26755490899086, + "learning_rate": 1.8640200229080762e-07, + "loss": 0.2082, + "step": 3313 + }, + { + "epoch": 0.8818520489622139, + "grad_norm": 0.32979485392570496, + "learning_rate": 1.8639349843414345e-07, + "loss": 0.1988, + "step": 3314 + }, + { + "epoch": 0.8821181479510378, + "grad_norm": 0.2769794464111328, + "learning_rate": 1.8638499211336113e-07, + "loss": 0.2013, + "step": 3315 + }, + { + "epoch": 0.8823842469398616, + "grad_norm": 0.2649977505207062, + "learning_rate": 1.8637648332870332e-07, + "loss": 0.2024, + "step": 3316 + }, + { + "epoch": 0.8826503459286855, + "grad_norm": 0.39293351769447327, + "learning_rate": 1.8636797208041265e-07, + "loss": 0.2043, + "step": 3317 + }, + { + "epoch": 0.8829164449175093, + "grad_norm": 0.30290234088897705, + "learning_rate": 1.8635945836873192e-07, + "loss": 0.208, + "step": 3318 + }, + { + "epoch": 0.8831825439063331, + "grad_norm": 0.279057115316391, + "learning_rate": 1.8635094219390392e-07, + "loss": 0.1881, + "step": 3319 + }, + { + "epoch": 0.883448642895157, + "grad_norm": 0.23869504034519196, + "learning_rate": 1.8634242355617161e-07, + "loss": 0.1751, + "step": 3320 + }, + { + "epoch": 0.8837147418839808, + "grad_norm": 0.29911136627197266, + "learning_rate": 1.8633390245577792e-07, + "loss": 0.2017, + "step": 3321 + }, + { + "epoch": 0.8839808408728047, + "grad_norm": 0.392115980386734, + "learning_rate": 1.863253788929659e-07, + "loss": 0.1949, + "step": 3322 + }, + { + "epoch": 0.8842469398616285, + "grad_norm": 0.3656608462333679, + "learning_rate": 1.8631685286797865e-07, + "loss": 0.2217, + "step": 3323 + }, + { + "epoch": 0.8845130388504524, + "grad_norm": 0.2935267984867096, + "learning_rate": 1.8630832438105933e-07, + "loss": 0.1994, + "step": 3324 + }, + { + "epoch": 0.8847791378392762, + "grad_norm": 0.29464587569236755, + "learning_rate": 1.8629979343245123e-07, + "loss": 0.2156, + "step": 3325 + }, + { + "epoch": 0.8850452368281001, + "grad_norm": 0.28764787316322327, + "learning_rate": 1.8629126002239764e-07, + "loss": 0.1976, + "step": 3326 + }, + { + "epoch": 0.8853113358169239, + "grad_norm": 0.3700929284095764, + "learning_rate": 1.8628272415114197e-07, + "loss": 0.2204, + "step": 3327 + }, + { + "epoch": 0.8855774348057477, + "grad_norm": 0.30232441425323486, + "learning_rate": 1.8627418581892765e-07, + "loss": 0.222, + "step": 3328 + }, + { + "epoch": 0.8858435337945716, + "grad_norm": 0.35232263803482056, + "learning_rate": 1.8626564502599826e-07, + "loss": 0.2168, + "step": 3329 + }, + { + "epoch": 0.8861096327833954, + "grad_norm": 0.4029355049133301, + "learning_rate": 1.8625710177259738e-07, + "loss": 0.1986, + "step": 3330 + }, + { + "epoch": 0.8863757317722193, + "grad_norm": 0.2739047110080719, + "learning_rate": 1.8624855605896865e-07, + "loss": 0.1984, + "step": 3331 + }, + { + "epoch": 0.8866418307610431, + "grad_norm": 0.27529093623161316, + "learning_rate": 1.8624000788535582e-07, + "loss": 0.1904, + "step": 3332 + }, + { + "epoch": 0.886907929749867, + "grad_norm": 0.5193217992782593, + "learning_rate": 1.8623145725200277e-07, + "loss": 0.2166, + "step": 3333 + }, + { + "epoch": 0.8871740287386908, + "grad_norm": 0.2729797065258026, + "learning_rate": 1.862229041591533e-07, + "loss": 0.2018, + "step": 3334 + }, + { + "epoch": 0.8874401277275147, + "grad_norm": 0.43449845910072327, + "learning_rate": 1.862143486070514e-07, + "loss": 0.2034, + "step": 3335 + }, + { + "epoch": 0.8877062267163385, + "grad_norm": 0.34657227993011475, + "learning_rate": 1.8620579059594107e-07, + "loss": 0.2229, + "step": 3336 + }, + { + "epoch": 0.8879723257051623, + "grad_norm": 0.356340616941452, + "learning_rate": 1.861972301260664e-07, + "loss": 0.2149, + "step": 3337 + }, + { + "epoch": 0.8882384246939862, + "grad_norm": 0.2627890110015869, + "learning_rate": 1.861886671976716e-07, + "loss": 0.1987, + "step": 3338 + }, + { + "epoch": 0.88850452368281, + "grad_norm": 0.2708023488521576, + "learning_rate": 1.8618010181100084e-07, + "loss": 0.2006, + "step": 3339 + }, + { + "epoch": 0.8887706226716339, + "grad_norm": 0.29810214042663574, + "learning_rate": 1.8617153396629848e-07, + "loss": 0.2231, + "step": 3340 + }, + { + "epoch": 0.8890367216604577, + "grad_norm": 0.2567444443702698, + "learning_rate": 1.8616296366380884e-07, + "loss": 0.1978, + "step": 3341 + }, + { + "epoch": 0.8893028206492816, + "grad_norm": 0.3283185362815857, + "learning_rate": 1.8615439090377637e-07, + "loss": 0.2089, + "step": 3342 + }, + { + "epoch": 0.8895689196381054, + "grad_norm": 0.37672746181488037, + "learning_rate": 1.8614581568644563e-07, + "loss": 0.221, + "step": 3343 + }, + { + "epoch": 0.8898350186269293, + "grad_norm": 0.27769163250923157, + "learning_rate": 1.8613723801206115e-07, + "loss": 0.205, + "step": 3344 + }, + { + "epoch": 0.8901011176157531, + "grad_norm": 0.2843565344810486, + "learning_rate": 1.861286578808676e-07, + "loss": 0.1808, + "step": 3345 + }, + { + "epoch": 0.8903672166045768, + "grad_norm": 0.3848264515399933, + "learning_rate": 1.8612007529310973e-07, + "loss": 0.1996, + "step": 3346 + }, + { + "epoch": 0.8906333155934008, + "grad_norm": 0.3448012173175812, + "learning_rate": 1.861114902490323e-07, + "loss": 0.2144, + "step": 3347 + }, + { + "epoch": 0.8908994145822245, + "grad_norm": 0.2916563153266907, + "learning_rate": 1.8610290274888017e-07, + "loss": 0.2065, + "step": 3348 + }, + { + "epoch": 0.8911655135710485, + "grad_norm": 0.29947033524513245, + "learning_rate": 1.8609431279289827e-07, + "loss": 0.2108, + "step": 3349 + }, + { + "epoch": 0.8914316125598722, + "grad_norm": 0.46834084391593933, + "learning_rate": 1.8608572038133165e-07, + "loss": 0.2266, + "step": 3350 + }, + { + "epoch": 0.8916977115486961, + "grad_norm": 0.2634826898574829, + "learning_rate": 1.8607712551442533e-07, + "loss": 0.2078, + "step": 3351 + }, + { + "epoch": 0.8919638105375199, + "grad_norm": 0.32863301038742065, + "learning_rate": 1.8606852819242448e-07, + "loss": 0.1841, + "step": 3352 + }, + { + "epoch": 0.8922299095263438, + "grad_norm": 0.3593713343143463, + "learning_rate": 1.8605992841557429e-07, + "loss": 0.2143, + "step": 3353 + }, + { + "epoch": 0.8924960085151676, + "grad_norm": 0.36456024646759033, + "learning_rate": 1.8605132618412008e-07, + "loss": 0.2101, + "step": 3354 + }, + { + "epoch": 0.8927621075039914, + "grad_norm": 0.28541314601898193, + "learning_rate": 1.8604272149830717e-07, + "loss": 0.2057, + "step": 3355 + }, + { + "epoch": 0.8930282064928153, + "grad_norm": 0.34414562582969666, + "learning_rate": 1.8603411435838102e-07, + "loss": 0.2344, + "step": 3356 + }, + { + "epoch": 0.8932943054816391, + "grad_norm": 0.2931491732597351, + "learning_rate": 1.8602550476458708e-07, + "loss": 0.2007, + "step": 3357 + }, + { + "epoch": 0.893560404470463, + "grad_norm": 0.4498755633831024, + "learning_rate": 1.8601689271717093e-07, + "loss": 0.229, + "step": 3358 + }, + { + "epoch": 0.8938265034592868, + "grad_norm": 0.2853013575077057, + "learning_rate": 1.860082782163782e-07, + "loss": 0.2171, + "step": 3359 + }, + { + "epoch": 0.8940926024481107, + "grad_norm": 0.2993173599243164, + "learning_rate": 1.859996612624546e-07, + "loss": 0.1785, + "step": 3360 + }, + { + "epoch": 0.8943587014369345, + "grad_norm": 0.42382484674453735, + "learning_rate": 1.8599104185564593e-07, + "loss": 0.2147, + "step": 3361 + }, + { + "epoch": 0.8946248004257584, + "grad_norm": 0.25436505675315857, + "learning_rate": 1.8598241999619798e-07, + "loss": 0.192, + "step": 3362 + }, + { + "epoch": 0.8948908994145822, + "grad_norm": 0.27968496084213257, + "learning_rate": 1.8597379568435666e-07, + "loss": 0.1985, + "step": 3363 + }, + { + "epoch": 0.895156998403406, + "grad_norm": 0.2715419828891754, + "learning_rate": 1.85965168920368e-07, + "loss": 0.2199, + "step": 3364 + }, + { + "epoch": 0.8954230973922299, + "grad_norm": 0.3509947657585144, + "learning_rate": 1.8595653970447804e-07, + "loss": 0.2265, + "step": 3365 + }, + { + "epoch": 0.8956891963810537, + "grad_norm": 0.3643444776535034, + "learning_rate": 1.8594790803693292e-07, + "loss": 0.2088, + "step": 3366 + }, + { + "epoch": 0.8959552953698776, + "grad_norm": 0.3251781463623047, + "learning_rate": 1.8593927391797878e-07, + "loss": 0.1992, + "step": 3367 + }, + { + "epoch": 0.8962213943587014, + "grad_norm": 0.3308534026145935, + "learning_rate": 1.859306373478619e-07, + "loss": 0.2105, + "step": 3368 + }, + { + "epoch": 0.8964874933475253, + "grad_norm": 0.2905437648296356, + "learning_rate": 1.8592199832682868e-07, + "loss": 0.2091, + "step": 3369 + }, + { + "epoch": 0.8967535923363491, + "grad_norm": 0.26257452368736267, + "learning_rate": 1.859133568551254e-07, + "loss": 0.2001, + "step": 3370 + }, + { + "epoch": 0.897019691325173, + "grad_norm": 0.2629469633102417, + "learning_rate": 1.8590471293299862e-07, + "loss": 0.1818, + "step": 3371 + }, + { + "epoch": 0.8972857903139968, + "grad_norm": 0.28844451904296875, + "learning_rate": 1.8589606656069488e-07, + "loss": 0.2131, + "step": 3372 + }, + { + "epoch": 0.8975518893028207, + "grad_norm": 0.36703962087631226, + "learning_rate": 1.8588741773846075e-07, + "loss": 0.2181, + "step": 3373 + }, + { + "epoch": 0.8978179882916445, + "grad_norm": 0.2861939072608948, + "learning_rate": 1.8587876646654293e-07, + "loss": 0.2059, + "step": 3374 + }, + { + "epoch": 0.8980840872804683, + "grad_norm": 0.2978915870189667, + "learning_rate": 1.8587011274518819e-07, + "loss": 0.2056, + "step": 3375 + }, + { + "epoch": 0.8983501862692922, + "grad_norm": 0.3359217047691345, + "learning_rate": 1.8586145657464334e-07, + "loss": 0.2126, + "step": 3376 + }, + { + "epoch": 0.898616285258116, + "grad_norm": 0.42895880341529846, + "learning_rate": 1.8585279795515527e-07, + "loss": 0.2053, + "step": 3377 + }, + { + "epoch": 0.8988823842469399, + "grad_norm": 0.2741157114505768, + "learning_rate": 1.8584413688697094e-07, + "loss": 0.1941, + "step": 3378 + }, + { + "epoch": 0.8991484832357637, + "grad_norm": 0.2649787664413452, + "learning_rate": 1.8583547337033735e-07, + "loss": 0.2101, + "step": 3379 + }, + { + "epoch": 0.8994145822245876, + "grad_norm": 0.26809632778167725, + "learning_rate": 1.8582680740550167e-07, + "loss": 0.1983, + "step": 3380 + }, + { + "epoch": 0.8996806812134114, + "grad_norm": 0.4139702320098877, + "learning_rate": 1.8581813899271102e-07, + "loss": 0.2084, + "step": 3381 + }, + { + "epoch": 0.8999467802022353, + "grad_norm": 0.25576019287109375, + "learning_rate": 1.8580946813221265e-07, + "loss": 0.2105, + "step": 3382 + }, + { + "epoch": 0.9002128791910591, + "grad_norm": 0.2635625898838043, + "learning_rate": 1.858007948242539e-07, + "loss": 0.2102, + "step": 3383 + }, + { + "epoch": 0.9004789781798829, + "grad_norm": 0.2602601647377014, + "learning_rate": 1.857921190690821e-07, + "loss": 0.1839, + "step": 3384 + }, + { + "epoch": 0.9007450771687068, + "grad_norm": 0.32465970516204834, + "learning_rate": 1.8578344086694476e-07, + "loss": 0.2017, + "step": 3385 + }, + { + "epoch": 0.9010111761575306, + "grad_norm": 0.38612034916877747, + "learning_rate": 1.8577476021808934e-07, + "loss": 0.2137, + "step": 3386 + }, + { + "epoch": 0.9012772751463545, + "grad_norm": 0.2911206781864166, + "learning_rate": 1.8576607712276346e-07, + "loss": 0.2082, + "step": 3387 + }, + { + "epoch": 0.9015433741351783, + "grad_norm": 0.3821903467178345, + "learning_rate": 1.8575739158121477e-07, + "loss": 0.2145, + "step": 3388 + }, + { + "epoch": 0.9018094731240022, + "grad_norm": 0.26856762170791626, + "learning_rate": 1.8574870359369102e-07, + "loss": 0.2025, + "step": 3389 + }, + { + "epoch": 0.902075572112826, + "grad_norm": 0.27145177125930786, + "learning_rate": 1.8574001316044e-07, + "loss": 0.206, + "step": 3390 + }, + { + "epoch": 0.9023416711016499, + "grad_norm": 0.2621195614337921, + "learning_rate": 1.8573132028170956e-07, + "loss": 0.21, + "step": 3391 + }, + { + "epoch": 0.9026077700904737, + "grad_norm": 0.3752131164073944, + "learning_rate": 1.8572262495774763e-07, + "loss": 0.2087, + "step": 3392 + }, + { + "epoch": 0.9028738690792975, + "grad_norm": 0.4314408302307129, + "learning_rate": 1.8571392718880229e-07, + "loss": 0.2251, + "step": 3393 + }, + { + "epoch": 0.9031399680681214, + "grad_norm": 0.35722601413726807, + "learning_rate": 1.857052269751215e-07, + "loss": 0.2287, + "step": 3394 + }, + { + "epoch": 0.9034060670569452, + "grad_norm": 0.35289013385772705, + "learning_rate": 1.8569652431695353e-07, + "loss": 0.2199, + "step": 3395 + }, + { + "epoch": 0.903672166045769, + "grad_norm": 0.2879377007484436, + "learning_rate": 1.8568781921454653e-07, + "loss": 0.1992, + "step": 3396 + }, + { + "epoch": 0.9039382650345928, + "grad_norm": 0.2755813002586365, + "learning_rate": 1.8567911166814878e-07, + "loss": 0.2078, + "step": 3397 + }, + { + "epoch": 0.9042043640234168, + "grad_norm": 0.2819230854511261, + "learning_rate": 1.8567040167800868e-07, + "loss": 0.2114, + "step": 3398 + }, + { + "epoch": 0.9044704630122405, + "grad_norm": 0.2872898578643799, + "learning_rate": 1.856616892443746e-07, + "loss": 0.1966, + "step": 3399 + }, + { + "epoch": 0.9047365620010644, + "grad_norm": 0.25057217478752136, + "learning_rate": 1.856529743674951e-07, + "loss": 0.1773, + "step": 3400 + }, + { + "epoch": 0.9050026609898882, + "grad_norm": 0.4702950119972229, + "learning_rate": 1.856442570476187e-07, + "loss": 0.215, + "step": 3401 + }, + { + "epoch": 0.905268759978712, + "grad_norm": 0.33038589358329773, + "learning_rate": 1.8563553728499408e-07, + "loss": 0.1899, + "step": 3402 + }, + { + "epoch": 0.9055348589675359, + "grad_norm": 0.30531853437423706, + "learning_rate": 1.8562681507986988e-07, + "loss": 0.19, + "step": 3403 + }, + { + "epoch": 0.9058009579563597, + "grad_norm": 0.3539304733276367, + "learning_rate": 1.8561809043249494e-07, + "loss": 0.2021, + "step": 3404 + }, + { + "epoch": 0.9060670569451836, + "grad_norm": 0.27724766731262207, + "learning_rate": 1.8560936334311806e-07, + "loss": 0.1729, + "step": 3405 + }, + { + "epoch": 0.9063331559340074, + "grad_norm": 0.35543882846832275, + "learning_rate": 1.8560063381198822e-07, + "loss": 0.1929, + "step": 3406 + }, + { + "epoch": 0.9065992549228313, + "grad_norm": 0.29542291164398193, + "learning_rate": 1.8559190183935427e-07, + "loss": 0.2, + "step": 3407 + }, + { + "epoch": 0.9068653539116551, + "grad_norm": 0.244664266705513, + "learning_rate": 1.8558316742546542e-07, + "loss": 0.1791, + "step": 3408 + }, + { + "epoch": 0.907131452900479, + "grad_norm": 0.2524752616882324, + "learning_rate": 1.855744305705707e-07, + "loss": 0.1919, + "step": 3409 + }, + { + "epoch": 0.9073975518893028, + "grad_norm": 0.34346067905426025, + "learning_rate": 1.8556569127491929e-07, + "loss": 0.2196, + "step": 3410 + }, + { + "epoch": 0.9076636508781266, + "grad_norm": 0.26751402020454407, + "learning_rate": 1.8555694953876052e-07, + "loss": 0.1919, + "step": 3411 + }, + { + "epoch": 0.9079297498669505, + "grad_norm": 0.31526854634284973, + "learning_rate": 1.855482053623437e-07, + "loss": 0.1964, + "step": 3412 + }, + { + "epoch": 0.9081958488557743, + "grad_norm": 0.3305530548095703, + "learning_rate": 1.8553945874591818e-07, + "loss": 0.2204, + "step": 3413 + }, + { + "epoch": 0.9084619478445982, + "grad_norm": 0.41045942902565, + "learning_rate": 1.8553070968973353e-07, + "loss": 0.1808, + "step": 3414 + }, + { + "epoch": 0.908728046833422, + "grad_norm": 0.3762493133544922, + "learning_rate": 1.8552195819403917e-07, + "loss": 0.1894, + "step": 3415 + }, + { + "epoch": 0.9089941458222459, + "grad_norm": 0.2664561867713928, + "learning_rate": 1.855132042590848e-07, + "loss": 0.2104, + "step": 3416 + }, + { + "epoch": 0.9092602448110697, + "grad_norm": 0.26582300662994385, + "learning_rate": 1.8550444788512007e-07, + "loss": 0.1968, + "step": 3417 + }, + { + "epoch": 0.9095263437998936, + "grad_norm": 0.2762686312198639, + "learning_rate": 1.8549568907239473e-07, + "loss": 0.2061, + "step": 3418 + }, + { + "epoch": 0.9097924427887174, + "grad_norm": 0.43579885363578796, + "learning_rate": 1.8548692782115866e-07, + "loss": 0.2335, + "step": 3419 + }, + { + "epoch": 0.9100585417775412, + "grad_norm": 0.38013866543769836, + "learning_rate": 1.854781641316616e-07, + "loss": 0.2169, + "step": 3420 + }, + { + "epoch": 0.9103246407663651, + "grad_norm": 0.31659263372421265, + "learning_rate": 1.8546939800415363e-07, + "loss": 0.2112, + "step": 3421 + }, + { + "epoch": 0.9105907397551889, + "grad_norm": 0.36547067761421204, + "learning_rate": 1.8546062943888477e-07, + "loss": 0.2233, + "step": 3422 + }, + { + "epoch": 0.9108568387440128, + "grad_norm": 0.31424421072006226, + "learning_rate": 1.854518584361051e-07, + "loss": 0.2055, + "step": 3423 + }, + { + "epoch": 0.9111229377328366, + "grad_norm": 0.3907301127910614, + "learning_rate": 1.8544308499606476e-07, + "loss": 0.1974, + "step": 3424 + }, + { + "epoch": 0.9113890367216605, + "grad_norm": 0.33448997139930725, + "learning_rate": 1.85434309119014e-07, + "loss": 0.2179, + "step": 3425 + }, + { + "epoch": 0.9116551357104843, + "grad_norm": 0.251142680644989, + "learning_rate": 1.8542553080520315e-07, + "loss": 0.21, + "step": 3426 + }, + { + "epoch": 0.9119212346993082, + "grad_norm": 0.2726818323135376, + "learning_rate": 1.8541675005488257e-07, + "loss": 0.2093, + "step": 3427 + }, + { + "epoch": 0.912187333688132, + "grad_norm": 0.2898302674293518, + "learning_rate": 1.8540796686830268e-07, + "loss": 0.2064, + "step": 3428 + }, + { + "epoch": 0.9124534326769558, + "grad_norm": 0.2788359224796295, + "learning_rate": 1.8539918124571404e-07, + "loss": 0.203, + "step": 3429 + }, + { + "epoch": 0.9127195316657797, + "grad_norm": 0.32445019483566284, + "learning_rate": 1.8539039318736723e-07, + "loss": 0.2158, + "step": 3430 + }, + { + "epoch": 0.9129856306546035, + "grad_norm": 0.2967442572116852, + "learning_rate": 1.8538160269351283e-07, + "loss": 0.2081, + "step": 3431 + }, + { + "epoch": 0.9132517296434274, + "grad_norm": 0.28663092851638794, + "learning_rate": 1.8537280976440165e-07, + "loss": 0.1982, + "step": 3432 + }, + { + "epoch": 0.9135178286322512, + "grad_norm": 0.451618492603302, + "learning_rate": 1.8536401440028447e-07, + "loss": 0.215, + "step": 3433 + }, + { + "epoch": 0.9137839276210751, + "grad_norm": 0.26151329278945923, + "learning_rate": 1.8535521660141208e-07, + "loss": 0.1986, + "step": 3434 + }, + { + "epoch": 0.9140500266098989, + "grad_norm": 0.2905513644218445, + "learning_rate": 1.853464163680355e-07, + "loss": 0.2214, + "step": 3435 + }, + { + "epoch": 0.9143161255987228, + "grad_norm": 0.2982380986213684, + "learning_rate": 1.8533761370040568e-07, + "loss": 0.1912, + "step": 3436 + }, + { + "epoch": 0.9145822245875466, + "grad_norm": 0.36427903175354004, + "learning_rate": 1.853288085987737e-07, + "loss": 0.2215, + "step": 3437 + }, + { + "epoch": 0.9148483235763704, + "grad_norm": 0.28542816638946533, + "learning_rate": 1.8532000106339073e-07, + "loss": 0.2083, + "step": 3438 + }, + { + "epoch": 0.9151144225651943, + "grad_norm": 0.3402915894985199, + "learning_rate": 1.853111910945079e-07, + "loss": 0.208, + "step": 3439 + }, + { + "epoch": 0.9153805215540181, + "grad_norm": 0.2547129988670349, + "learning_rate": 1.8530237869237658e-07, + "loss": 0.2022, + "step": 3440 + }, + { + "epoch": 0.915646620542842, + "grad_norm": 0.34949254989624023, + "learning_rate": 1.8529356385724805e-07, + "loss": 0.2118, + "step": 3441 + }, + { + "epoch": 0.9159127195316658, + "grad_norm": 0.37295231223106384, + "learning_rate": 1.852847465893738e-07, + "loss": 0.2045, + "step": 3442 + }, + { + "epoch": 0.9161788185204897, + "grad_norm": 0.26529109477996826, + "learning_rate": 1.8527592688900522e-07, + "loss": 0.1988, + "step": 3443 + }, + { + "epoch": 0.9164449175093135, + "grad_norm": 0.43909379839897156, + "learning_rate": 1.8526710475639393e-07, + "loss": 0.2352, + "step": 3444 + }, + { + "epoch": 0.9167110164981374, + "grad_norm": 1.7540667057037354, + "learning_rate": 1.8525828019179154e-07, + "loss": 0.2236, + "step": 3445 + }, + { + "epoch": 0.9169771154869611, + "grad_norm": 0.2778792083263397, + "learning_rate": 1.8524945319544978e-07, + "loss": 0.2033, + "step": 3446 + }, + { + "epoch": 0.9172432144757849, + "grad_norm": 0.38022851943969727, + "learning_rate": 1.8524062376762032e-07, + "loss": 0.1943, + "step": 3447 + }, + { + "epoch": 0.9175093134646088, + "grad_norm": 0.34378695487976074, + "learning_rate": 1.852317919085551e-07, + "loss": 0.2061, + "step": 3448 + }, + { + "epoch": 0.9177754124534326, + "grad_norm": 0.2810015082359314, + "learning_rate": 1.8522295761850597e-07, + "loss": 0.2166, + "step": 3449 + }, + { + "epoch": 0.9180415114422565, + "grad_norm": 0.32513535022735596, + "learning_rate": 1.852141208977249e-07, + "loss": 0.2091, + "step": 3450 + }, + { + "epoch": 0.9183076104310803, + "grad_norm": 0.2850893437862396, + "learning_rate": 1.8520528174646394e-07, + "loss": 0.2054, + "step": 3451 + }, + { + "epoch": 0.9185737094199042, + "grad_norm": 0.27635806798934937, + "learning_rate": 1.851964401649752e-07, + "loss": 0.1988, + "step": 3452 + }, + { + "epoch": 0.918839808408728, + "grad_norm": 0.29104503989219666, + "learning_rate": 1.8518759615351085e-07, + "loss": 0.2125, + "step": 3453 + }, + { + "epoch": 0.9191059073975519, + "grad_norm": 0.26064130663871765, + "learning_rate": 1.8517874971232317e-07, + "loss": 0.2086, + "step": 3454 + }, + { + "epoch": 0.9193720063863757, + "grad_norm": 0.313640832901001, + "learning_rate": 1.8516990084166442e-07, + "loss": 0.1894, + "step": 3455 + }, + { + "epoch": 0.9196381053751995, + "grad_norm": 0.26010191440582275, + "learning_rate": 1.8516104954178707e-07, + "loss": 0.1987, + "step": 3456 + }, + { + "epoch": 0.9199042043640234, + "grad_norm": 0.2609744071960449, + "learning_rate": 1.8515219581294348e-07, + "loss": 0.2149, + "step": 3457 + }, + { + "epoch": 0.9201703033528472, + "grad_norm": 0.34155988693237305, + "learning_rate": 1.8514333965538625e-07, + "loss": 0.2062, + "step": 3458 + }, + { + "epoch": 0.9204364023416711, + "grad_norm": 0.2999574542045593, + "learning_rate": 1.8513448106936797e-07, + "loss": 0.2141, + "step": 3459 + }, + { + "epoch": 0.9207025013304949, + "grad_norm": 0.2788478434085846, + "learning_rate": 1.8512562005514127e-07, + "loss": 0.2109, + "step": 3460 + }, + { + "epoch": 0.9209686003193188, + "grad_norm": 0.2615709900856018, + "learning_rate": 1.851167566129589e-07, + "loss": 0.2018, + "step": 3461 + }, + { + "epoch": 0.9212346993081426, + "grad_norm": 0.2650910019874573, + "learning_rate": 1.851078907430737e-07, + "loss": 0.1871, + "step": 3462 + }, + { + "epoch": 0.9215007982969665, + "grad_norm": 0.33179494738578796, + "learning_rate": 1.8509902244573847e-07, + "loss": 0.2072, + "step": 3463 + }, + { + "epoch": 0.9217668972857903, + "grad_norm": 0.28215327858924866, + "learning_rate": 1.850901517212062e-07, + "loss": 0.2141, + "step": 3464 + }, + { + "epoch": 0.9220329962746141, + "grad_norm": 0.2678569257259369, + "learning_rate": 1.8508127856972992e-07, + "loss": 0.2181, + "step": 3465 + }, + { + "epoch": 0.922299095263438, + "grad_norm": 0.38702982664108276, + "learning_rate": 1.8507240299156264e-07, + "loss": 0.2088, + "step": 3466 + }, + { + "epoch": 0.9225651942522618, + "grad_norm": 0.4305088222026825, + "learning_rate": 1.8506352498695756e-07, + "loss": 0.224, + "step": 3467 + }, + { + "epoch": 0.9228312932410857, + "grad_norm": 0.34692996740341187, + "learning_rate": 1.850546445561679e-07, + "loss": 0.2092, + "step": 3468 + }, + { + "epoch": 0.9230973922299095, + "grad_norm": 0.2894788384437561, + "learning_rate": 1.8504576169944691e-07, + "loss": 0.2139, + "step": 3469 + }, + { + "epoch": 0.9233634912187334, + "grad_norm": 0.3758470416069031, + "learning_rate": 1.8503687641704802e-07, + "loss": 0.2245, + "step": 3470 + }, + { + "epoch": 0.9236295902075572, + "grad_norm": 0.34377479553222656, + "learning_rate": 1.850279887092246e-07, + "loss": 0.2007, + "step": 3471 + }, + { + "epoch": 0.9238956891963811, + "grad_norm": 0.26221904158592224, + "learning_rate": 1.8501909857623014e-07, + "loss": 0.1909, + "step": 3472 + }, + { + "epoch": 0.9241617881852049, + "grad_norm": 0.3016570806503296, + "learning_rate": 1.8501020601831825e-07, + "loss": 0.2299, + "step": 3473 + }, + { + "epoch": 0.9244278871740287, + "grad_norm": 0.3567301034927368, + "learning_rate": 1.850013110357425e-07, + "loss": 0.1993, + "step": 3474 + }, + { + "epoch": 0.9246939861628526, + "grad_norm": 0.28474143147468567, + "learning_rate": 1.8499241362875667e-07, + "loss": 0.1971, + "step": 3475 + }, + { + "epoch": 0.9249600851516764, + "grad_norm": 0.2583165168762207, + "learning_rate": 1.8498351379761444e-07, + "loss": 0.1994, + "step": 3476 + }, + { + "epoch": 0.9252261841405003, + "grad_norm": 0.2772411108016968, + "learning_rate": 1.8497461154256973e-07, + "loss": 0.2077, + "step": 3477 + }, + { + "epoch": 0.9254922831293241, + "grad_norm": 0.2599528133869171, + "learning_rate": 1.8496570686387642e-07, + "loss": 0.1789, + "step": 3478 + }, + { + "epoch": 0.925758382118148, + "grad_norm": 0.2871329188346863, + "learning_rate": 1.849567997617885e-07, + "loss": 0.2149, + "step": 3479 + }, + { + "epoch": 0.9260244811069718, + "grad_norm": 0.262491911649704, + "learning_rate": 1.8494789023656e-07, + "loss": 0.2061, + "step": 3480 + }, + { + "epoch": 0.9262905800957957, + "grad_norm": 0.2865854501724243, + "learning_rate": 1.8493897828844504e-07, + "loss": 0.2312, + "step": 3481 + }, + { + "epoch": 0.9265566790846195, + "grad_norm": 0.2862778902053833, + "learning_rate": 1.8493006391769783e-07, + "loss": 0.1993, + "step": 3482 + }, + { + "epoch": 0.9268227780734434, + "grad_norm": 0.28779667615890503, + "learning_rate": 1.8492114712457258e-07, + "loss": 0.2017, + "step": 3483 + }, + { + "epoch": 0.9270888770622672, + "grad_norm": 0.27458158135414124, + "learning_rate": 1.8491222790932368e-07, + "loss": 0.2054, + "step": 3484 + }, + { + "epoch": 0.927354976051091, + "grad_norm": 0.2705831825733185, + "learning_rate": 1.849033062722055e-07, + "loss": 0.214, + "step": 3485 + }, + { + "epoch": 0.9276210750399149, + "grad_norm": 0.27485933899879456, + "learning_rate": 1.8489438221347247e-07, + "loss": 0.2108, + "step": 3486 + }, + { + "epoch": 0.9278871740287387, + "grad_norm": 0.2880268394947052, + "learning_rate": 1.8488545573337914e-07, + "loss": 0.2086, + "step": 3487 + }, + { + "epoch": 0.9281532730175626, + "grad_norm": 0.2708705961704254, + "learning_rate": 1.8487652683218014e-07, + "loss": 0.2013, + "step": 3488 + }, + { + "epoch": 0.9284193720063864, + "grad_norm": 0.43291163444519043, + "learning_rate": 1.848675955101301e-07, + "loss": 0.1914, + "step": 3489 + }, + { + "epoch": 0.9286854709952103, + "grad_norm": 0.3250850737094879, + "learning_rate": 1.8485866176748378e-07, + "loss": 0.2067, + "step": 3490 + }, + { + "epoch": 0.928951569984034, + "grad_norm": 0.39230072498321533, + "learning_rate": 1.84849725604496e-07, + "loss": 0.2063, + "step": 3491 + }, + { + "epoch": 0.929217668972858, + "grad_norm": 0.37971529364585876, + "learning_rate": 1.848407870214216e-07, + "loss": 0.23, + "step": 3492 + }, + { + "epoch": 0.9294837679616818, + "grad_norm": 0.2748142182826996, + "learning_rate": 1.8483184601851555e-07, + "loss": 0.192, + "step": 3493 + }, + { + "epoch": 0.9297498669505055, + "grad_norm": 0.4599490165710449, + "learning_rate": 1.848229025960329e-07, + "loss": 0.2179, + "step": 3494 + }, + { + "epoch": 0.9300159659393294, + "grad_norm": 0.3134431540966034, + "learning_rate": 1.8481395675422863e-07, + "loss": 0.2122, + "step": 3495 + }, + { + "epoch": 0.9302820649281532, + "grad_norm": 0.2842048108577728, + "learning_rate": 1.8480500849335801e-07, + "loss": 0.2063, + "step": 3496 + }, + { + "epoch": 0.9305481639169771, + "grad_norm": 0.3924690783023834, + "learning_rate": 1.847960578136762e-07, + "loss": 0.2097, + "step": 3497 + }, + { + "epoch": 0.9308142629058009, + "grad_norm": 0.26276057958602905, + "learning_rate": 1.8478710471543855e-07, + "loss": 0.2059, + "step": 3498 + }, + { + "epoch": 0.9310803618946248, + "grad_norm": 0.5693216919898987, + "learning_rate": 1.8477814919890036e-07, + "loss": 0.1994, + "step": 3499 + }, + { + "epoch": 0.9313464608834486, + "grad_norm": 0.42996761202812195, + "learning_rate": 1.8476919126431704e-07, + "loss": 0.2109, + "step": 3500 + }, + { + "epoch": 0.9316125598722725, + "grad_norm": 0.2815895974636078, + "learning_rate": 1.8476023091194416e-07, + "loss": 0.204, + "step": 3501 + }, + { + "epoch": 0.9318786588610963, + "grad_norm": 0.35597267746925354, + "learning_rate": 1.8475126814203724e-07, + "loss": 0.2152, + "step": 3502 + }, + { + "epoch": 0.9321447578499201, + "grad_norm": 0.4136335253715515, + "learning_rate": 1.8474230295485194e-07, + "loss": 0.2118, + "step": 3503 + }, + { + "epoch": 0.932410856838744, + "grad_norm": 0.30961883068084717, + "learning_rate": 1.8473333535064394e-07, + "loss": 0.1737, + "step": 3504 + }, + { + "epoch": 0.9326769558275678, + "grad_norm": 0.2632218599319458, + "learning_rate": 1.8472436532966904e-07, + "loss": 0.204, + "step": 3505 + }, + { + "epoch": 0.9329430548163917, + "grad_norm": 0.29814833402633667, + "learning_rate": 1.8471539289218305e-07, + "loss": 0.2116, + "step": 3506 + }, + { + "epoch": 0.9332091538052155, + "grad_norm": 0.28293511271476746, + "learning_rate": 1.8470641803844192e-07, + "loss": 0.2051, + "step": 3507 + }, + { + "epoch": 0.9334752527940394, + "grad_norm": 0.398531049489975, + "learning_rate": 1.8469744076870163e-07, + "loss": 0.2172, + "step": 3508 + }, + { + "epoch": 0.9337413517828632, + "grad_norm": 0.25758638978004456, + "learning_rate": 1.8468846108321823e-07, + "loss": 0.1904, + "step": 3509 + }, + { + "epoch": 0.9340074507716871, + "grad_norm": 0.2758156657218933, + "learning_rate": 1.846794789822478e-07, + "loss": 0.2107, + "step": 3510 + }, + { + "epoch": 0.9342735497605109, + "grad_norm": 0.26757198572158813, + "learning_rate": 1.8467049446604657e-07, + "loss": 0.1937, + "step": 3511 + }, + { + "epoch": 0.9345396487493347, + "grad_norm": 0.4480312168598175, + "learning_rate": 1.8466150753487076e-07, + "loss": 0.2233, + "step": 3512 + }, + { + "epoch": 0.9348057477381586, + "grad_norm": 0.2694166600704193, + "learning_rate": 1.8465251818897668e-07, + "loss": 0.1978, + "step": 3513 + }, + { + "epoch": 0.9350718467269824, + "grad_norm": 0.45584553480148315, + "learning_rate": 1.8464352642862082e-07, + "loss": 0.2208, + "step": 3514 + }, + { + "epoch": 0.9353379457158063, + "grad_norm": 0.39059996604919434, + "learning_rate": 1.8463453225405957e-07, + "loss": 0.2098, + "step": 3515 + }, + { + "epoch": 0.9356040447046301, + "grad_norm": 0.2889675199985504, + "learning_rate": 1.8462553566554945e-07, + "loss": 0.1993, + "step": 3516 + }, + { + "epoch": 0.935870143693454, + "grad_norm": 0.2858288884162903, + "learning_rate": 1.846165366633471e-07, + "loss": 0.2031, + "step": 3517 + }, + { + "epoch": 0.9361362426822778, + "grad_norm": 0.26278555393218994, + "learning_rate": 1.846075352477092e-07, + "loss": 0.2123, + "step": 3518 + }, + { + "epoch": 0.9364023416711017, + "grad_norm": 0.3197155296802521, + "learning_rate": 1.8459853141889243e-07, + "loss": 0.1981, + "step": 3519 + }, + { + "epoch": 0.9366684406599255, + "grad_norm": 0.2556213438510895, + "learning_rate": 1.8458952517715362e-07, + "loss": 0.1998, + "step": 3520 + }, + { + "epoch": 0.9369345396487493, + "grad_norm": 0.2781357765197754, + "learning_rate": 1.845805165227497e-07, + "loss": 0.1888, + "step": 3521 + }, + { + "epoch": 0.9372006386375732, + "grad_norm": 0.2657802999019623, + "learning_rate": 1.8457150545593753e-07, + "loss": 0.2035, + "step": 3522 + }, + { + "epoch": 0.937466737626397, + "grad_norm": 0.2599506676197052, + "learning_rate": 1.845624919769742e-07, + "loss": 0.2059, + "step": 3523 + }, + { + "epoch": 0.9377328366152209, + "grad_norm": 0.25518062710762024, + "learning_rate": 1.8455347608611676e-07, + "loss": 0.1935, + "step": 3524 + }, + { + "epoch": 0.9379989356040447, + "grad_norm": 0.26547771692276, + "learning_rate": 1.8454445778362232e-07, + "loss": 0.2044, + "step": 3525 + }, + { + "epoch": 0.9382650345928686, + "grad_norm": 0.28222259879112244, + "learning_rate": 1.845354370697482e-07, + "loss": 0.1987, + "step": 3526 + }, + { + "epoch": 0.9385311335816924, + "grad_norm": 0.32635319232940674, + "learning_rate": 1.845264139447516e-07, + "loss": 0.2074, + "step": 3527 + }, + { + "epoch": 0.9387972325705163, + "grad_norm": 0.32133910059928894, + "learning_rate": 1.845173884088899e-07, + "loss": 0.1956, + "step": 3528 + }, + { + "epoch": 0.9390633315593401, + "grad_norm": 0.27477651834487915, + "learning_rate": 1.8450836046242055e-07, + "loss": 0.1922, + "step": 3529 + }, + { + "epoch": 0.9393294305481639, + "grad_norm": 0.38004446029663086, + "learning_rate": 1.8449933010560102e-07, + "loss": 0.2323, + "step": 3530 + }, + { + "epoch": 0.9395955295369878, + "grad_norm": 1.2113099098205566, + "learning_rate": 1.8449029733868888e-07, + "loss": 0.2212, + "step": 3531 + }, + { + "epoch": 0.9398616285258116, + "grad_norm": 0.32414475083351135, + "learning_rate": 1.844812621619418e-07, + "loss": 0.2146, + "step": 3532 + }, + { + "epoch": 0.9401277275146355, + "grad_norm": 0.29270386695861816, + "learning_rate": 1.844722245756174e-07, + "loss": 0.1865, + "step": 3533 + }, + { + "epoch": 0.9403938265034593, + "grad_norm": 0.27960795164108276, + "learning_rate": 1.8446318457997355e-07, + "loss": 0.2098, + "step": 3534 + }, + { + "epoch": 0.9406599254922832, + "grad_norm": 0.25804319977760315, + "learning_rate": 1.84454142175268e-07, + "loss": 0.1981, + "step": 3535 + }, + { + "epoch": 0.940926024481107, + "grad_norm": 0.34702423214912415, + "learning_rate": 1.844450973617587e-07, + "loss": 0.2079, + "step": 3536 + }, + { + "epoch": 0.9411921234699309, + "grad_norm": 0.32451963424682617, + "learning_rate": 1.8443605013970365e-07, + "loss": 0.2098, + "step": 3537 + }, + { + "epoch": 0.9414582224587547, + "grad_norm": 0.3359982371330261, + "learning_rate": 1.8442700050936084e-07, + "loss": 0.2012, + "step": 3538 + }, + { + "epoch": 0.9417243214475784, + "grad_norm": 0.46837738156318665, + "learning_rate": 1.844179484709884e-07, + "loss": 0.2184, + "step": 3539 + }, + { + "epoch": 0.9419904204364024, + "grad_norm": 0.2764146029949188, + "learning_rate": 1.844088940248446e-07, + "loss": 0.1969, + "step": 3540 + }, + { + "epoch": 0.9422565194252261, + "grad_norm": 0.28583958745002747, + "learning_rate": 1.8439983717118756e-07, + "loss": 0.2204, + "step": 3541 + }, + { + "epoch": 0.94252261841405, + "grad_norm": 0.25721660256385803, + "learning_rate": 1.8439077791027561e-07, + "loss": 0.1615, + "step": 3542 + }, + { + "epoch": 0.9427887174028738, + "grad_norm": 0.38062137365341187, + "learning_rate": 1.8438171624236725e-07, + "loss": 0.197, + "step": 3543 + }, + { + "epoch": 0.9430548163916977, + "grad_norm": 0.27432653307914734, + "learning_rate": 1.8437265216772085e-07, + "loss": 0.2034, + "step": 3544 + }, + { + "epoch": 0.9433209153805215, + "grad_norm": 0.2765968143939972, + "learning_rate": 1.8436358568659495e-07, + "loss": 0.2028, + "step": 3545 + }, + { + "epoch": 0.9435870143693454, + "grad_norm": 0.2587282657623291, + "learning_rate": 1.8435451679924817e-07, + "loss": 0.1965, + "step": 3546 + }, + { + "epoch": 0.9438531133581692, + "grad_norm": 0.26214250922203064, + "learning_rate": 1.8434544550593913e-07, + "loss": 0.2031, + "step": 3547 + }, + { + "epoch": 0.944119212346993, + "grad_norm": 0.4317483901977539, + "learning_rate": 1.843363718069266e-07, + "loss": 0.1973, + "step": 3548 + }, + { + "epoch": 0.9443853113358169, + "grad_norm": 0.31342118978500366, + "learning_rate": 1.8432729570246938e-07, + "loss": 0.2019, + "step": 3549 + }, + { + "epoch": 0.9446514103246407, + "grad_norm": 0.2868804931640625, + "learning_rate": 1.843182171928263e-07, + "loss": 0.2093, + "step": 3550 + }, + { + "epoch": 0.9449175093134646, + "grad_norm": 0.3551959693431854, + "learning_rate": 1.8430913627825633e-07, + "loss": 0.1922, + "step": 3551 + }, + { + "epoch": 0.9451836083022884, + "grad_norm": 0.28245288133621216, + "learning_rate": 1.843000529590185e-07, + "loss": 0.2148, + "step": 3552 + }, + { + "epoch": 0.9454497072911123, + "grad_norm": 0.2712455093860626, + "learning_rate": 1.842909672353718e-07, + "loss": 0.2025, + "step": 3553 + }, + { + "epoch": 0.9457158062799361, + "grad_norm": 0.281281977891922, + "learning_rate": 1.8428187910757545e-07, + "loss": 0.1809, + "step": 3554 + }, + { + "epoch": 0.94598190526876, + "grad_norm": 0.5191128253936768, + "learning_rate": 1.8427278857588867e-07, + "loss": 0.2125, + "step": 3555 + }, + { + "epoch": 0.9462480042575838, + "grad_norm": 0.3434670567512512, + "learning_rate": 1.8426369564057067e-07, + "loss": 0.2122, + "step": 3556 + }, + { + "epoch": 0.9465141032464076, + "grad_norm": 0.4421411454677582, + "learning_rate": 1.8425460030188083e-07, + "loss": 0.2167, + "step": 3557 + }, + { + "epoch": 0.9467802022352315, + "grad_norm": 0.2736130356788635, + "learning_rate": 1.842455025600786e-07, + "loss": 0.1934, + "step": 3558 + }, + { + "epoch": 0.9470463012240553, + "grad_norm": 0.41176700592041016, + "learning_rate": 1.8423640241542343e-07, + "loss": 0.2128, + "step": 3559 + }, + { + "epoch": 0.9473124002128792, + "grad_norm": 0.3230496048927307, + "learning_rate": 1.8422729986817491e-07, + "loss": 0.211, + "step": 3560 + }, + { + "epoch": 0.947578499201703, + "grad_norm": 0.27369704842567444, + "learning_rate": 1.8421819491859262e-07, + "loss": 0.2169, + "step": 3561 + }, + { + "epoch": 0.9478445981905269, + "grad_norm": 0.27903658151626587, + "learning_rate": 1.842090875669363e-07, + "loss": 0.2038, + "step": 3562 + }, + { + "epoch": 0.9481106971793507, + "grad_norm": 0.2792099416255951, + "learning_rate": 1.8419997781346566e-07, + "loss": 0.2137, + "step": 3563 + }, + { + "epoch": 0.9483767961681746, + "grad_norm": 0.28273794054985046, + "learning_rate": 1.8419086565844053e-07, + "loss": 0.1965, + "step": 3564 + }, + { + "epoch": 0.9486428951569984, + "grad_norm": 0.2837531268596649, + "learning_rate": 1.8418175110212083e-07, + "loss": 0.2063, + "step": 3565 + }, + { + "epoch": 0.9489089941458222, + "grad_norm": 0.3883967399597168, + "learning_rate": 1.8417263414476655e-07, + "loss": 0.1961, + "step": 3566 + }, + { + "epoch": 0.9491750931346461, + "grad_norm": 0.2859798073768616, + "learning_rate": 1.841635147866377e-07, + "loss": 0.1921, + "step": 3567 + }, + { + "epoch": 0.9494411921234699, + "grad_norm": 0.35088205337524414, + "learning_rate": 1.8415439302799433e-07, + "loss": 0.1914, + "step": 3568 + }, + { + "epoch": 0.9497072911122938, + "grad_norm": 0.3711570203304291, + "learning_rate": 1.841452688690967e-07, + "loss": 0.2085, + "step": 3569 + }, + { + "epoch": 0.9499733901011176, + "grad_norm": 0.2599192261695862, + "learning_rate": 1.8413614231020503e-07, + "loss": 0.1757, + "step": 3570 + }, + { + "epoch": 0.9502394890899415, + "grad_norm": 0.38796448707580566, + "learning_rate": 1.841270133515796e-07, + "loss": 0.2095, + "step": 3571 + }, + { + "epoch": 0.9505055880787653, + "grad_norm": 0.3360874056816101, + "learning_rate": 1.8411788199348076e-07, + "loss": 0.2036, + "step": 3572 + }, + { + "epoch": 0.9507716870675892, + "grad_norm": 0.3064013123512268, + "learning_rate": 1.84108748236169e-07, + "loss": 0.2214, + "step": 3573 + }, + { + "epoch": 0.951037786056413, + "grad_norm": 0.27381056547164917, + "learning_rate": 1.8409961207990486e-07, + "loss": 0.2073, + "step": 3574 + }, + { + "epoch": 0.9513038850452368, + "grad_norm": 0.3562329113483429, + "learning_rate": 1.8409047352494885e-07, + "loss": 0.2143, + "step": 3575 + }, + { + "epoch": 0.9515699840340607, + "grad_norm": 0.3143787980079651, + "learning_rate": 1.8408133257156164e-07, + "loss": 0.1962, + "step": 3576 + }, + { + "epoch": 0.9518360830228845, + "grad_norm": 0.24717077612876892, + "learning_rate": 1.8407218922000398e-07, + "loss": 0.1843, + "step": 3577 + }, + { + "epoch": 0.9521021820117084, + "grad_norm": 0.3037233054637909, + "learning_rate": 1.8406304347053667e-07, + "loss": 0.1844, + "step": 3578 + }, + { + "epoch": 0.9523682810005322, + "grad_norm": 0.31879922747612, + "learning_rate": 1.8405389532342048e-07, + "loss": 0.2174, + "step": 3579 + }, + { + "epoch": 0.9526343799893561, + "grad_norm": 0.2847050428390503, + "learning_rate": 1.8404474477891644e-07, + "loss": 0.2043, + "step": 3580 + }, + { + "epoch": 0.9529004789781799, + "grad_norm": 0.2476840615272522, + "learning_rate": 1.8403559183728547e-07, + "loss": 0.2046, + "step": 3581 + }, + { + "epoch": 0.9531665779670038, + "grad_norm": 0.2794705033302307, + "learning_rate": 1.8402643649878867e-07, + "loss": 0.2233, + "step": 3582 + }, + { + "epoch": 0.9534326769558276, + "grad_norm": 0.2976905405521393, + "learning_rate": 1.8401727876368713e-07, + "loss": 0.207, + "step": 3583 + }, + { + "epoch": 0.9536987759446514, + "grad_norm": 0.32327908277511597, + "learning_rate": 1.8400811863224206e-07, + "loss": 0.1928, + "step": 3584 + }, + { + "epoch": 0.9539648749334753, + "grad_norm": 0.4015810489654541, + "learning_rate": 1.8399895610471477e-07, + "loss": 0.2202, + "step": 3585 + }, + { + "epoch": 0.954230973922299, + "grad_norm": 0.2851676642894745, + "learning_rate": 1.8398979118136651e-07, + "loss": 0.1985, + "step": 3586 + }, + { + "epoch": 0.954497072911123, + "grad_norm": 0.36499103903770447, + "learning_rate": 1.839806238624588e-07, + "loss": 0.2101, + "step": 3587 + }, + { + "epoch": 0.9547631718999467, + "grad_norm": 0.2719111740589142, + "learning_rate": 1.8397145414825298e-07, + "loss": 0.2016, + "step": 3588 + }, + { + "epoch": 0.9550292708887707, + "grad_norm": 0.2732781171798706, + "learning_rate": 1.8396228203901067e-07, + "loss": 0.2178, + "step": 3589 + }, + { + "epoch": 0.9552953698775944, + "grad_norm": 0.285473495721817, + "learning_rate": 1.8395310753499347e-07, + "loss": 0.2071, + "step": 3590 + }, + { + "epoch": 0.9555614688664184, + "grad_norm": 0.5601904392242432, + "learning_rate": 1.83943930636463e-07, + "loss": 0.1957, + "step": 3591 + }, + { + "epoch": 0.9558275678552421, + "grad_norm": 0.2676280736923218, + "learning_rate": 1.8393475134368109e-07, + "loss": 0.1938, + "step": 3592 + }, + { + "epoch": 0.956093666844066, + "grad_norm": 0.2532208263874054, + "learning_rate": 1.839255696569095e-07, + "loss": 0.1869, + "step": 3593 + }, + { + "epoch": 0.9563597658328898, + "grad_norm": 0.2854382395744324, + "learning_rate": 1.839163855764101e-07, + "loss": 0.211, + "step": 3594 + }, + { + "epoch": 0.9566258648217136, + "grad_norm": 0.2929840683937073, + "learning_rate": 1.8390719910244486e-07, + "loss": 0.1961, + "step": 3595 + }, + { + "epoch": 0.9568919638105375, + "grad_norm": 0.25489896535873413, + "learning_rate": 1.838980102352758e-07, + "loss": 0.1832, + "step": 3596 + }, + { + "epoch": 0.9571580627993613, + "grad_norm": 0.36055248975753784, + "learning_rate": 1.8388881897516499e-07, + "loss": 0.2158, + "step": 3597 + }, + { + "epoch": 0.9574241617881852, + "grad_norm": 0.35349130630493164, + "learning_rate": 1.8387962532237462e-07, + "loss": 0.2031, + "step": 3598 + }, + { + "epoch": 0.957690260777009, + "grad_norm": 0.3326471447944641, + "learning_rate": 1.8387042927716685e-07, + "loss": 0.2435, + "step": 3599 + }, + { + "epoch": 0.9579563597658329, + "grad_norm": 0.2774471640586853, + "learning_rate": 1.8386123083980402e-07, + "loss": 0.1934, + "step": 3600 + }, + { + "epoch": 0.9582224587546567, + "grad_norm": 0.38724377751350403, + "learning_rate": 1.8385203001054847e-07, + "loss": 0.1904, + "step": 3601 + }, + { + "epoch": 0.9584885577434806, + "grad_norm": 0.2648245692253113, + "learning_rate": 1.8384282678966266e-07, + "loss": 0.1963, + "step": 3602 + }, + { + "epoch": 0.9587546567323044, + "grad_norm": 0.26858627796173096, + "learning_rate": 1.8383362117740902e-07, + "loss": 0.1978, + "step": 3603 + }, + { + "epoch": 0.9590207557211282, + "grad_norm": 0.27289077639579773, + "learning_rate": 1.8382441317405016e-07, + "loss": 0.1948, + "step": 3604 + }, + { + "epoch": 0.9592868547099521, + "grad_norm": 0.42153018712997437, + "learning_rate": 1.8381520277984865e-07, + "loss": 0.2153, + "step": 3605 + }, + { + "epoch": 0.9595529536987759, + "grad_norm": 0.2582976520061493, + "learning_rate": 1.838059899950673e-07, + "loss": 0.1939, + "step": 3606 + }, + { + "epoch": 0.9598190526875998, + "grad_norm": 0.25666359066963196, + "learning_rate": 1.8379677481996878e-07, + "loss": 0.1944, + "step": 3607 + }, + { + "epoch": 0.9600851516764236, + "grad_norm": 0.2749585211277008, + "learning_rate": 1.8378755725481595e-07, + "loss": 0.1996, + "step": 3608 + }, + { + "epoch": 0.9603512506652475, + "grad_norm": 0.31785309314727783, + "learning_rate": 1.8377833729987176e-07, + "loss": 0.2082, + "step": 3609 + }, + { + "epoch": 0.9606173496540713, + "grad_norm": 0.28615817427635193, + "learning_rate": 1.8376911495539914e-07, + "loss": 0.2132, + "step": 3610 + }, + { + "epoch": 0.9608834486428952, + "grad_norm": 0.2906321585178375, + "learning_rate": 1.837598902216611e-07, + "loss": 0.2253, + "step": 3611 + }, + { + "epoch": 0.961149547631719, + "grad_norm": 0.3666772246360779, + "learning_rate": 1.837506630989208e-07, + "loss": 0.1853, + "step": 3612 + }, + { + "epoch": 0.9614156466205428, + "grad_norm": 0.4559297263622284, + "learning_rate": 1.837414335874414e-07, + "loss": 0.2064, + "step": 3613 + }, + { + "epoch": 0.9616817456093667, + "grad_norm": 0.2746466398239136, + "learning_rate": 1.8373220168748612e-07, + "loss": 0.2141, + "step": 3614 + }, + { + "epoch": 0.9619478445981905, + "grad_norm": 0.41063642501831055, + "learning_rate": 1.8372296739931835e-07, + "loss": 0.2132, + "step": 3615 + }, + { + "epoch": 0.9622139435870144, + "grad_norm": 0.3251335918903351, + "learning_rate": 1.837137307232014e-07, + "loss": 0.1975, + "step": 3616 + }, + { + "epoch": 0.9624800425758382, + "grad_norm": 0.28247174620628357, + "learning_rate": 1.837044916593987e-07, + "loss": 0.211, + "step": 3617 + }, + { + "epoch": 0.9627461415646621, + "grad_norm": 0.3778824210166931, + "learning_rate": 1.836952502081738e-07, + "loss": 0.2212, + "step": 3618 + }, + { + "epoch": 0.9630122405534859, + "grad_norm": 0.2574740946292877, + "learning_rate": 1.8368600636979033e-07, + "loss": 0.1899, + "step": 3619 + }, + { + "epoch": 0.9632783395423098, + "grad_norm": 0.2693413197994232, + "learning_rate": 1.8367676014451185e-07, + "loss": 0.2035, + "step": 3620 + }, + { + "epoch": 0.9635444385311336, + "grad_norm": 0.4591482877731323, + "learning_rate": 1.836675115326022e-07, + "loss": 0.182, + "step": 3621 + }, + { + "epoch": 0.9638105375199574, + "grad_norm": 0.2526170313358307, + "learning_rate": 1.8365826053432504e-07, + "loss": 0.1898, + "step": 3622 + }, + { + "epoch": 0.9640766365087813, + "grad_norm": 0.345365047454834, + "learning_rate": 1.8364900714994432e-07, + "loss": 0.2167, + "step": 3623 + }, + { + "epoch": 0.9643427354976051, + "grad_norm": 0.4218215346336365, + "learning_rate": 1.836397513797239e-07, + "loss": 0.2313, + "step": 3624 + }, + { + "epoch": 0.964608834486429, + "grad_norm": 0.5372536778450012, + "learning_rate": 1.8363049322392785e-07, + "loss": 0.2109, + "step": 3625 + }, + { + "epoch": 0.9648749334752528, + "grad_norm": 0.40563133358955383, + "learning_rate": 1.8362123268282017e-07, + "loss": 0.2232, + "step": 3626 + }, + { + "epoch": 0.9651410324640767, + "grad_norm": 0.3385651111602783, + "learning_rate": 1.8361196975666502e-07, + "loss": 0.2071, + "step": 3627 + }, + { + "epoch": 0.9654071314529005, + "grad_norm": 0.25179436802864075, + "learning_rate": 1.8360270444572654e-07, + "loss": 0.1823, + "step": 3628 + }, + { + "epoch": 0.9656732304417244, + "grad_norm": 0.3697410523891449, + "learning_rate": 1.8359343675026908e-07, + "loss": 0.2262, + "step": 3629 + }, + { + "epoch": 0.9659393294305482, + "grad_norm": 0.4059831202030182, + "learning_rate": 1.8358416667055692e-07, + "loss": 0.2201, + "step": 3630 + }, + { + "epoch": 0.966205428419372, + "grad_norm": 0.2779991626739502, + "learning_rate": 1.8357489420685449e-07, + "loss": 0.221, + "step": 3631 + }, + { + "epoch": 0.9664715274081959, + "grad_norm": 0.26365482807159424, + "learning_rate": 1.8356561935942624e-07, + "loss": 0.1953, + "step": 3632 + }, + { + "epoch": 0.9667376263970197, + "grad_norm": 0.2651229500770569, + "learning_rate": 1.835563421285367e-07, + "loss": 0.1837, + "step": 3633 + }, + { + "epoch": 0.9670037253858436, + "grad_norm": 0.2729814350605011, + "learning_rate": 1.835470625144505e-07, + "loss": 0.1996, + "step": 3634 + }, + { + "epoch": 0.9672698243746674, + "grad_norm": 0.28484147787094116, + "learning_rate": 1.835377805174323e-07, + "loss": 0.1914, + "step": 3635 + }, + { + "epoch": 0.9675359233634913, + "grad_norm": 0.2844192385673523, + "learning_rate": 1.8352849613774683e-07, + "loss": 0.1923, + "step": 3636 + }, + { + "epoch": 0.967802022352315, + "grad_norm": 0.2790581285953522, + "learning_rate": 1.8351920937565893e-07, + "loss": 0.1925, + "step": 3637 + }, + { + "epoch": 0.968068121341139, + "grad_norm": 0.2848733365535736, + "learning_rate": 1.8350992023143347e-07, + "loss": 0.1864, + "step": 3638 + }, + { + "epoch": 0.9683342203299627, + "grad_norm": 0.40992051362991333, + "learning_rate": 1.8350062870533534e-07, + "loss": 0.2097, + "step": 3639 + }, + { + "epoch": 0.9686003193187865, + "grad_norm": 0.31588447093963623, + "learning_rate": 1.8349133479762962e-07, + "loss": 0.2045, + "step": 3640 + }, + { + "epoch": 0.9688664183076104, + "grad_norm": 0.2695743143558502, + "learning_rate": 1.8348203850858138e-07, + "loss": 0.2064, + "step": 3641 + }, + { + "epoch": 0.9691325172964342, + "grad_norm": 0.3200497031211853, + "learning_rate": 1.8347273983845573e-07, + "loss": 0.22, + "step": 3642 + }, + { + "epoch": 0.9693986162852581, + "grad_norm": 0.2792840600013733, + "learning_rate": 1.8346343878751796e-07, + "loss": 0.2092, + "step": 3643 + }, + { + "epoch": 0.9696647152740819, + "grad_norm": 0.33136555552482605, + "learning_rate": 1.8345413535603326e-07, + "loss": 0.2259, + "step": 3644 + }, + { + "epoch": 0.9699308142629058, + "grad_norm": 0.2867850065231323, + "learning_rate": 1.8344482954426708e-07, + "loss": 0.222, + "step": 3645 + }, + { + "epoch": 0.9701969132517296, + "grad_norm": 0.37427079677581787, + "learning_rate": 1.8343552135248478e-07, + "loss": 0.2095, + "step": 3646 + }, + { + "epoch": 0.9704630122405535, + "grad_norm": 0.279346764087677, + "learning_rate": 1.8342621078095184e-07, + "loss": 0.2015, + "step": 3647 + }, + { + "epoch": 0.9707291112293773, + "grad_norm": 0.2681523859500885, + "learning_rate": 1.8341689782993384e-07, + "loss": 0.1924, + "step": 3648 + }, + { + "epoch": 0.9709952102182011, + "grad_norm": 0.2606600522994995, + "learning_rate": 1.8340758249969644e-07, + "loss": 0.2047, + "step": 3649 + }, + { + "epoch": 0.971261309207025, + "grad_norm": 0.3077443838119507, + "learning_rate": 1.8339826479050524e-07, + "loss": 0.2161, + "step": 3650 + }, + { + "epoch": 0.9715274081958488, + "grad_norm": 0.3523707985877991, + "learning_rate": 1.833889447026261e-07, + "loss": 0.2279, + "step": 3651 + }, + { + "epoch": 0.9717935071846727, + "grad_norm": 0.270003080368042, + "learning_rate": 1.8337962223632478e-07, + "loss": 0.2146, + "step": 3652 + }, + { + "epoch": 0.9720596061734965, + "grad_norm": 0.28101804852485657, + "learning_rate": 1.833702973918672e-07, + "loss": 0.1997, + "step": 3653 + }, + { + "epoch": 0.9723257051623204, + "grad_norm": 0.4058057367801666, + "learning_rate": 1.8336097016951934e-07, + "loss": 0.2479, + "step": 3654 + }, + { + "epoch": 0.9725918041511442, + "grad_norm": 0.29106807708740234, + "learning_rate": 1.833516405695472e-07, + "loss": 0.2134, + "step": 3655 + }, + { + "epoch": 0.9728579031399681, + "grad_norm": 0.3047497868537903, + "learning_rate": 1.8334230859221688e-07, + "loss": 0.2019, + "step": 3656 + }, + { + "epoch": 0.9731240021287919, + "grad_norm": 0.28219011425971985, + "learning_rate": 1.8333297423779458e-07, + "loss": 0.2118, + "step": 3657 + }, + { + "epoch": 0.9733901011176157, + "grad_norm": 0.35072585940361023, + "learning_rate": 1.8332363750654652e-07, + "loss": 0.2204, + "step": 3658 + }, + { + "epoch": 0.9736562001064396, + "grad_norm": 0.3254748284816742, + "learning_rate": 1.8331429839873898e-07, + "loss": 0.204, + "step": 3659 + }, + { + "epoch": 0.9739222990952634, + "grad_norm": 0.2572984993457794, + "learning_rate": 1.833049569146383e-07, + "loss": 0.2087, + "step": 3660 + }, + { + "epoch": 0.9741883980840873, + "grad_norm": 0.264816015958786, + "learning_rate": 1.8329561305451104e-07, + "loss": 0.2016, + "step": 3661 + }, + { + "epoch": 0.9744544970729111, + "grad_norm": 0.26408007740974426, + "learning_rate": 1.832862668186236e-07, + "loss": 0.1961, + "step": 3662 + }, + { + "epoch": 0.974720596061735, + "grad_norm": 0.35072553157806396, + "learning_rate": 1.832769182072426e-07, + "loss": 0.2252, + "step": 3663 + }, + { + "epoch": 0.9749866950505588, + "grad_norm": 0.248180091381073, + "learning_rate": 1.8326756722063465e-07, + "loss": 0.1902, + "step": 3664 + }, + { + "epoch": 0.9752527940393827, + "grad_norm": 0.27402088046073914, + "learning_rate": 1.832582138590665e-07, + "loss": 0.2017, + "step": 3665 + }, + { + "epoch": 0.9755188930282065, + "grad_norm": 0.4878603219985962, + "learning_rate": 1.832488581228049e-07, + "loss": 0.1998, + "step": 3666 + }, + { + "epoch": 0.9757849920170303, + "grad_norm": 0.2580772042274475, + "learning_rate": 1.8323950001211665e-07, + "loss": 0.2037, + "step": 3667 + }, + { + "epoch": 0.9760510910058542, + "grad_norm": 0.2888212203979492, + "learning_rate": 1.8323013952726873e-07, + "loss": 0.2112, + "step": 3668 + }, + { + "epoch": 0.976317189994678, + "grad_norm": 0.2666975259780884, + "learning_rate": 1.8322077666852812e-07, + "loss": 0.1969, + "step": 3669 + }, + { + "epoch": 0.9765832889835019, + "grad_norm": 0.2840885519981384, + "learning_rate": 1.8321141143616182e-07, + "loss": 0.2175, + "step": 3670 + }, + { + "epoch": 0.9768493879723257, + "grad_norm": 0.2800719141960144, + "learning_rate": 1.83202043830437e-07, + "loss": 0.1962, + "step": 3671 + }, + { + "epoch": 0.9771154869611496, + "grad_norm": 0.429043710231781, + "learning_rate": 1.831926738516208e-07, + "loss": 0.2415, + "step": 3672 + }, + { + "epoch": 0.9773815859499734, + "grad_norm": 0.26703405380249023, + "learning_rate": 1.8318330149998048e-07, + "loss": 0.2045, + "step": 3673 + }, + { + "epoch": 0.9776476849387973, + "grad_norm": 0.3696446716785431, + "learning_rate": 1.831739267757834e-07, + "loss": 0.2041, + "step": 3674 + }, + { + "epoch": 0.9779137839276211, + "grad_norm": 0.28739863634109497, + "learning_rate": 1.8316454967929683e-07, + "loss": 0.2171, + "step": 3675 + }, + { + "epoch": 0.9781798829164449, + "grad_norm": 0.24869012832641602, + "learning_rate": 1.831551702107884e-07, + "loss": 0.1849, + "step": 3676 + }, + { + "epoch": 0.9784459819052688, + "grad_norm": 0.2824556231498718, + "learning_rate": 1.8314578837052547e-07, + "loss": 0.205, + "step": 3677 + }, + { + "epoch": 0.9787120808940926, + "grad_norm": 0.2705800533294678, + "learning_rate": 1.8313640415877573e-07, + "loss": 0.1836, + "step": 3678 + }, + { + "epoch": 0.9789781798829165, + "grad_norm": 0.3446887135505676, + "learning_rate": 1.831270175758068e-07, + "loss": 0.2328, + "step": 3679 + }, + { + "epoch": 0.9792442788717403, + "grad_norm": 0.2514229118824005, + "learning_rate": 1.831176286218864e-07, + "loss": 0.1932, + "step": 3680 + }, + { + "epoch": 0.9795103778605642, + "grad_norm": 0.26970529556274414, + "learning_rate": 1.8310823729728232e-07, + "loss": 0.2187, + "step": 3681 + }, + { + "epoch": 0.979776476849388, + "grad_norm": 0.4373980760574341, + "learning_rate": 1.8309884360226243e-07, + "loss": 0.2198, + "step": 3682 + }, + { + "epoch": 0.9800425758382119, + "grad_norm": 0.297911137342453, + "learning_rate": 1.8308944753709468e-07, + "loss": 0.206, + "step": 3683 + }, + { + "epoch": 0.9803086748270357, + "grad_norm": 0.38753101229667664, + "learning_rate": 1.83080049102047e-07, + "loss": 0.2099, + "step": 3684 + }, + { + "epoch": 0.9805747738158594, + "grad_norm": 0.26433172821998596, + "learning_rate": 1.8307064829738753e-07, + "loss": 0.1992, + "step": 3685 + }, + { + "epoch": 0.9808408728046834, + "grad_norm": 0.6050260663032532, + "learning_rate": 1.8306124512338431e-07, + "loss": 0.2004, + "step": 3686 + }, + { + "epoch": 0.9811069717935071, + "grad_norm": 0.27110159397125244, + "learning_rate": 1.8305183958030565e-07, + "loss": 0.1977, + "step": 3687 + }, + { + "epoch": 0.981373070782331, + "grad_norm": 0.25683385133743286, + "learning_rate": 1.8304243166841973e-07, + "loss": 0.2055, + "step": 3688 + }, + { + "epoch": 0.9816391697711548, + "grad_norm": 0.2922229468822479, + "learning_rate": 1.830330213879949e-07, + "loss": 0.2369, + "step": 3689 + }, + { + "epoch": 0.9819052687599787, + "grad_norm": 0.2635367512702942, + "learning_rate": 1.830236087392996e-07, + "loss": 0.2101, + "step": 3690 + }, + { + "epoch": 0.9821713677488025, + "grad_norm": 0.2911183834075928, + "learning_rate": 1.8301419372260223e-07, + "loss": 0.2107, + "step": 3691 + }, + { + "epoch": 0.9824374667376264, + "grad_norm": 0.27590250968933105, + "learning_rate": 1.8300477633817136e-07, + "loss": 0.2038, + "step": 3692 + }, + { + "epoch": 0.9827035657264502, + "grad_norm": 0.9183558225631714, + "learning_rate": 1.8299535658627563e-07, + "loss": 0.2139, + "step": 3693 + }, + { + "epoch": 0.982969664715274, + "grad_norm": 0.27317675948143005, + "learning_rate": 1.8298593446718366e-07, + "loss": 0.2025, + "step": 3694 + }, + { + "epoch": 0.9832357637040979, + "grad_norm": 0.35046935081481934, + "learning_rate": 1.8297650998116422e-07, + "loss": 0.193, + "step": 3695 + }, + { + "epoch": 0.9835018626929217, + "grad_norm": 0.2832682430744171, + "learning_rate": 1.8296708312848607e-07, + "loss": 0.199, + "step": 3696 + }, + { + "epoch": 0.9837679616817456, + "grad_norm": 0.43805432319641113, + "learning_rate": 1.8295765390941814e-07, + "loss": 0.186, + "step": 3697 + }, + { + "epoch": 0.9840340606705694, + "grad_norm": 0.26703670620918274, + "learning_rate": 1.829482223242293e-07, + "loss": 0.2072, + "step": 3698 + }, + { + "epoch": 0.9843001596593933, + "grad_norm": 0.3321874439716339, + "learning_rate": 1.8293878837318863e-07, + "loss": 0.2072, + "step": 3699 + }, + { + "epoch": 0.9845662586482171, + "grad_norm": 0.25797873735427856, + "learning_rate": 1.8292935205656517e-07, + "loss": 0.1992, + "step": 3700 + }, + { + "epoch": 0.984832357637041, + "grad_norm": 0.3399753272533417, + "learning_rate": 1.829199133746281e-07, + "loss": 0.1997, + "step": 3701 + }, + { + "epoch": 0.9850984566258648, + "grad_norm": 0.31862056255340576, + "learning_rate": 1.8291047232764656e-07, + "loss": 0.2092, + "step": 3702 + }, + { + "epoch": 0.9853645556146887, + "grad_norm": 0.27591291069984436, + "learning_rate": 1.8290102891588987e-07, + "loss": 0.2044, + "step": 3703 + }, + { + "epoch": 0.9856306546035125, + "grad_norm": 0.32350191473960876, + "learning_rate": 1.828915831396274e-07, + "loss": 0.2143, + "step": 3704 + }, + { + "epoch": 0.9858967535923363, + "grad_norm": 0.26337048411369324, + "learning_rate": 1.828821349991285e-07, + "loss": 0.2029, + "step": 3705 + }, + { + "epoch": 0.9861628525811602, + "grad_norm": 0.282805860042572, + "learning_rate": 1.8287268449466268e-07, + "loss": 0.2113, + "step": 3706 + }, + { + "epoch": 0.986428951569984, + "grad_norm": 0.2728693187236786, + "learning_rate": 1.8286323162649953e-07, + "loss": 0.2074, + "step": 3707 + }, + { + "epoch": 0.9866950505588079, + "grad_norm": 0.4577121138572693, + "learning_rate": 1.828537763949086e-07, + "loss": 0.2262, + "step": 3708 + }, + { + "epoch": 0.9869611495476317, + "grad_norm": 0.4063203036785126, + "learning_rate": 1.8284431880015963e-07, + "loss": 0.2154, + "step": 3709 + }, + { + "epoch": 0.9872272485364556, + "grad_norm": 0.2435389906167984, + "learning_rate": 1.828348588425223e-07, + "loss": 0.181, + "step": 3710 + }, + { + "epoch": 0.9874933475252794, + "grad_norm": 0.3390055298805237, + "learning_rate": 1.828253965222665e-07, + "loss": 0.1862, + "step": 3711 + }, + { + "epoch": 0.9877594465141033, + "grad_norm": 0.27144238352775574, + "learning_rate": 1.828159318396621e-07, + "loss": 0.2079, + "step": 3712 + }, + { + "epoch": 0.9880255455029271, + "grad_norm": 0.3171105682849884, + "learning_rate": 1.8280646479497895e-07, + "loss": 0.1938, + "step": 3713 + }, + { + "epoch": 0.9882916444917509, + "grad_norm": 0.24565881490707397, + "learning_rate": 1.8279699538848724e-07, + "loss": 0.1934, + "step": 3714 + }, + { + "epoch": 0.9885577434805748, + "grad_norm": 0.3989403545856476, + "learning_rate": 1.8278752362045693e-07, + "loss": 0.2305, + "step": 3715 + }, + { + "epoch": 0.9888238424693986, + "grad_norm": 0.28576794266700745, + "learning_rate": 1.8277804949115824e-07, + "loss": 0.211, + "step": 3716 + }, + { + "epoch": 0.9890899414582225, + "grad_norm": 0.3413444459438324, + "learning_rate": 1.8276857300086133e-07, + "loss": 0.228, + "step": 3717 + }, + { + "epoch": 0.9893560404470463, + "grad_norm": 0.3950311243534088, + "learning_rate": 1.8275909414983655e-07, + "loss": 0.1909, + "step": 3718 + }, + { + "epoch": 0.9896221394358702, + "grad_norm": 0.2615435719490051, + "learning_rate": 1.8274961293835423e-07, + "loss": 0.2058, + "step": 3719 + }, + { + "epoch": 0.989888238424694, + "grad_norm": 0.25855717062950134, + "learning_rate": 1.8274012936668477e-07, + "loss": 0.1962, + "step": 3720 + }, + { + "epoch": 0.9901543374135179, + "grad_norm": 0.40375518798828125, + "learning_rate": 1.8273064343509872e-07, + "loss": 0.197, + "step": 3721 + }, + { + "epoch": 0.9904204364023417, + "grad_norm": 0.2548274099826813, + "learning_rate": 1.8272115514386658e-07, + "loss": 0.2048, + "step": 3722 + }, + { + "epoch": 0.9906865353911655, + "grad_norm": 0.3111441433429718, + "learning_rate": 1.82711664493259e-07, + "loss": 0.1928, + "step": 3723 + }, + { + "epoch": 0.9909526343799894, + "grad_norm": 0.26431846618652344, + "learning_rate": 1.8270217148354665e-07, + "loss": 0.1905, + "step": 3724 + }, + { + "epoch": 0.9912187333688132, + "grad_norm": 0.31914639472961426, + "learning_rate": 1.8269267611500034e-07, + "loss": 0.2007, + "step": 3725 + }, + { + "epoch": 0.9914848323576371, + "grad_norm": 0.28264081478118896, + "learning_rate": 1.8268317838789086e-07, + "loss": 0.198, + "step": 3726 + }, + { + "epoch": 0.9917509313464609, + "grad_norm": 0.35777539014816284, + "learning_rate": 1.826736783024891e-07, + "loss": 0.196, + "step": 3727 + }, + { + "epoch": 0.9920170303352848, + "grad_norm": 0.27067238092422485, + "learning_rate": 1.8266417585906606e-07, + "loss": 0.1919, + "step": 3728 + }, + { + "epoch": 0.9922831293241086, + "grad_norm": 0.455025851726532, + "learning_rate": 1.826546710578927e-07, + "loss": 0.1997, + "step": 3729 + }, + { + "epoch": 0.9925492283129325, + "grad_norm": 0.328774094581604, + "learning_rate": 1.826451638992402e-07, + "loss": 0.1967, + "step": 3730 + }, + { + "epoch": 0.9928153273017563, + "grad_norm": 0.36599844694137573, + "learning_rate": 1.8263565438337967e-07, + "loss": 0.2017, + "step": 3731 + }, + { + "epoch": 0.99308142629058, + "grad_norm": 0.4293357729911804, + "learning_rate": 1.8262614251058238e-07, + "loss": 0.2337, + "step": 3732 + }, + { + "epoch": 0.993347525279404, + "grad_norm": 0.3327883183956146, + "learning_rate": 1.8261662828111957e-07, + "loss": 0.2024, + "step": 3733 + }, + { + "epoch": 0.9936136242682277, + "grad_norm": 0.4572966992855072, + "learning_rate": 1.8260711169526264e-07, + "loss": 0.2252, + "step": 3734 + }, + { + "epoch": 0.9938797232570517, + "grad_norm": 0.283425897359848, + "learning_rate": 1.82597592753283e-07, + "loss": 0.2132, + "step": 3735 + }, + { + "epoch": 0.9941458222458754, + "grad_norm": 0.28806206583976746, + "learning_rate": 1.825880714554522e-07, + "loss": 0.2442, + "step": 3736 + }, + { + "epoch": 0.9944119212346993, + "grad_norm": 0.43685442209243774, + "learning_rate": 1.8257854780204177e-07, + "loss": 0.195, + "step": 3737 + }, + { + "epoch": 0.9946780202235231, + "grad_norm": 0.40181395411491394, + "learning_rate": 1.8256902179332335e-07, + "loss": 0.2088, + "step": 3738 + }, + { + "epoch": 0.994944119212347, + "grad_norm": 0.3687134087085724, + "learning_rate": 1.8255949342956863e-07, + "loss": 0.1932, + "step": 3739 + }, + { + "epoch": 0.9952102182011708, + "grad_norm": 0.49878188967704773, + "learning_rate": 1.8254996271104938e-07, + "loss": 0.2126, + "step": 3740 + }, + { + "epoch": 0.9954763171899946, + "grad_norm": 0.27835240960121155, + "learning_rate": 1.8254042963803745e-07, + "loss": 0.1998, + "step": 3741 + }, + { + "epoch": 0.9957424161788185, + "grad_norm": 0.4032785892486572, + "learning_rate": 1.8253089421080474e-07, + "loss": 0.1934, + "step": 3742 + }, + { + "epoch": 0.9960085151676423, + "grad_norm": 0.3569347858428955, + "learning_rate": 1.8252135642962323e-07, + "loss": 0.19, + "step": 3743 + }, + { + "epoch": 0.9962746141564662, + "grad_norm": 0.34073978662490845, + "learning_rate": 1.8251181629476493e-07, + "loss": 0.2149, + "step": 3744 + }, + { + "epoch": 0.99654071314529, + "grad_norm": 0.3590311110019684, + "learning_rate": 1.8250227380650193e-07, + "loss": 0.223, + "step": 3745 + }, + { + "epoch": 0.9968068121341139, + "grad_norm": 0.26704055070877075, + "learning_rate": 1.824927289651065e-07, + "loss": 0.2113, + "step": 3746 + }, + { + "epoch": 0.9970729111229377, + "grad_norm": 0.40497884154319763, + "learning_rate": 1.8248318177085075e-07, + "loss": 0.2192, + "step": 3747 + }, + { + "epoch": 0.9973390101117616, + "grad_norm": 0.37979692220687866, + "learning_rate": 1.8247363222400703e-07, + "loss": 0.2218, + "step": 3748 + }, + { + "epoch": 0.9976051091005854, + "grad_norm": 0.36727145314216614, + "learning_rate": 1.8246408032484776e-07, + "loss": 0.2054, + "step": 3749 + }, + { + "epoch": 0.9978712080894092, + "grad_norm": 0.264194518327713, + "learning_rate": 1.8245452607364534e-07, + "loss": 0.1927, + "step": 3750 + }, + { + "epoch": 0.9981373070782331, + "grad_norm": 0.28998294472694397, + "learning_rate": 1.8244496947067226e-07, + "loss": 0.1982, + "step": 3751 + }, + { + "epoch": 0.9984034060670569, + "grad_norm": 0.3445931375026703, + "learning_rate": 1.8243541051620116e-07, + "loss": 0.2044, + "step": 3752 + }, + { + "epoch": 0.9986695050558808, + "grad_norm": 0.42425504326820374, + "learning_rate": 1.8242584921050458e-07, + "loss": 0.202, + "step": 3753 + }, + { + "epoch": 0.9989356040447046, + "grad_norm": 0.2919238209724426, + "learning_rate": 1.8241628555385532e-07, + "loss": 0.2104, + "step": 3754 + }, + { + "epoch": 0.9992017030335285, + "grad_norm": 0.273322731256485, + "learning_rate": 1.824067195465261e-07, + "loss": 0.1892, + "step": 3755 + }, + { + "epoch": 0.9994678020223523, + "grad_norm": 0.2639957070350647, + "learning_rate": 1.823971511887898e-07, + "loss": 0.1846, + "step": 3756 + }, + { + "epoch": 0.9997339010111762, + "grad_norm": 0.29162323474884033, + "learning_rate": 1.8238758048091928e-07, + "loss": 0.195, + "step": 3757 + }, + { + "epoch": 1.0, + "grad_norm": 0.26841017603874207, + "learning_rate": 1.8237800742318755e-07, + "loss": 0.1902, + "step": 3758 + }, + { + "epoch": 1.000266098988824, + "grad_norm": 0.265520840883255, + "learning_rate": 1.8236843201586767e-07, + "loss": 0.1975, + "step": 3759 + }, + { + "epoch": 1.0005321979776476, + "grad_norm": 0.37553057074546814, + "learning_rate": 1.823588542592327e-07, + "loss": 0.1942, + "step": 3760 + }, + { + "epoch": 1.0007982969664715, + "grad_norm": 0.250739723443985, + "learning_rate": 1.8234927415355588e-07, + "loss": 0.1826, + "step": 3761 + }, + { + "epoch": 1.0010643959552954, + "grad_norm": 0.2604050040245056, + "learning_rate": 1.8233969169911042e-07, + "loss": 0.1846, + "step": 3762 + }, + { + "epoch": 1.0013304949441193, + "grad_norm": 0.2596757113933563, + "learning_rate": 1.823301068961696e-07, + "loss": 0.1958, + "step": 3763 + }, + { + "epoch": 1.001596593932943, + "grad_norm": 0.25380879640579224, + "learning_rate": 1.8232051974500683e-07, + "loss": 0.1843, + "step": 3764 + }, + { + "epoch": 1.0018626929217669, + "grad_norm": 0.3928976058959961, + "learning_rate": 1.8231093024589558e-07, + "loss": 0.2118, + "step": 3765 + }, + { + "epoch": 1.0021287919105908, + "grad_norm": 0.26711878180503845, + "learning_rate": 1.823013383991093e-07, + "loss": 0.204, + "step": 3766 + }, + { + "epoch": 1.0023948908994147, + "grad_norm": 0.32553261518478394, + "learning_rate": 1.8229174420492162e-07, + "loss": 0.1978, + "step": 3767 + }, + { + "epoch": 1.0026609898882384, + "grad_norm": 0.25994592905044556, + "learning_rate": 1.8228214766360618e-07, + "loss": 0.1935, + "step": 3768 + }, + { + "epoch": 1.0029270888770623, + "grad_norm": 0.3091168999671936, + "learning_rate": 1.8227254877543668e-07, + "loss": 0.2188, + "step": 3769 + }, + { + "epoch": 1.0031931878658862, + "grad_norm": 0.27039167284965515, + "learning_rate": 1.822629475406869e-07, + "loss": 0.1919, + "step": 3770 + }, + { + "epoch": 1.0034592868547099, + "grad_norm": 0.2699425220489502, + "learning_rate": 1.822533439596307e-07, + "loss": 0.202, + "step": 3771 + }, + { + "epoch": 1.0037253858435338, + "grad_norm": 0.29818400740623474, + "learning_rate": 1.8224373803254198e-07, + "loss": 0.2077, + "step": 3772 + }, + { + "epoch": 1.0039914848323577, + "grad_norm": 0.25337889790534973, + "learning_rate": 1.822341297596947e-07, + "loss": 0.188, + "step": 3773 + }, + { + "epoch": 1.0042575838211816, + "grad_norm": 0.2800453007221222, + "learning_rate": 1.8222451914136295e-07, + "loss": 0.187, + "step": 3774 + }, + { + "epoch": 1.0045236828100053, + "grad_norm": 0.29518285393714905, + "learning_rate": 1.8221490617782083e-07, + "loss": 0.219, + "step": 3775 + }, + { + "epoch": 1.0047897817988292, + "grad_norm": 0.28980717062950134, + "learning_rate": 1.822052908693425e-07, + "loss": 0.215, + "step": 3776 + }, + { + "epoch": 1.005055880787653, + "grad_norm": 0.2970407009124756, + "learning_rate": 1.8219567321620225e-07, + "loss": 0.1993, + "step": 3777 + }, + { + "epoch": 1.0053219797764767, + "grad_norm": 0.3714858889579773, + "learning_rate": 1.8218605321867436e-07, + "loss": 0.2095, + "step": 3778 + }, + { + "epoch": 1.0055880787653007, + "grad_norm": 0.34098702669143677, + "learning_rate": 1.8217643087703321e-07, + "loss": 0.2145, + "step": 3779 + }, + { + "epoch": 1.0058541777541246, + "grad_norm": 0.3390212059020996, + "learning_rate": 1.821668061915533e-07, + "loss": 0.1869, + "step": 3780 + }, + { + "epoch": 1.0061202767429485, + "grad_norm": 0.27589643001556396, + "learning_rate": 1.8215717916250906e-07, + "loss": 0.2072, + "step": 3781 + }, + { + "epoch": 1.0063863757317721, + "grad_norm": 0.2569311261177063, + "learning_rate": 1.8214754979017512e-07, + "loss": 0.2105, + "step": 3782 + }, + { + "epoch": 1.006652474720596, + "grad_norm": 0.33314988017082214, + "learning_rate": 1.8213791807482616e-07, + "loss": 0.2089, + "step": 3783 + }, + { + "epoch": 1.00691857370942, + "grad_norm": 0.35049787163734436, + "learning_rate": 1.8212828401673687e-07, + "loss": 0.2182, + "step": 3784 + }, + { + "epoch": 1.0071846726982439, + "grad_norm": 0.33307120203971863, + "learning_rate": 1.8211864761618196e-07, + "loss": 0.2103, + "step": 3785 + }, + { + "epoch": 1.0074507716870675, + "grad_norm": 0.5153335928916931, + "learning_rate": 1.821090088734364e-07, + "loss": 0.2126, + "step": 3786 + }, + { + "epoch": 1.0077168706758914, + "grad_norm": 0.3106221854686737, + "learning_rate": 1.8209936778877506e-07, + "loss": 0.2037, + "step": 3787 + }, + { + "epoch": 1.0079829696647153, + "grad_norm": 0.2552943229675293, + "learning_rate": 1.8208972436247292e-07, + "loss": 0.1888, + "step": 3788 + }, + { + "epoch": 1.008249068653539, + "grad_norm": 0.3601953089237213, + "learning_rate": 1.8208007859480498e-07, + "loss": 0.2009, + "step": 3789 + }, + { + "epoch": 1.008515167642363, + "grad_norm": 0.27647536993026733, + "learning_rate": 1.8207043048604644e-07, + "loss": 0.2079, + "step": 3790 + }, + { + "epoch": 1.0087812666311868, + "grad_norm": 0.26739996671676636, + "learning_rate": 1.8206078003647245e-07, + "loss": 0.1871, + "step": 3791 + }, + { + "epoch": 1.0090473656200107, + "grad_norm": 0.26837456226348877, + "learning_rate": 1.8205112724635824e-07, + "loss": 0.2161, + "step": 3792 + }, + { + "epoch": 1.0093134646088344, + "grad_norm": 0.449358731508255, + "learning_rate": 1.8204147211597917e-07, + "loss": 0.231, + "step": 3793 + }, + { + "epoch": 1.0095795635976583, + "grad_norm": 0.388216108083725, + "learning_rate": 1.8203181464561056e-07, + "loss": 0.1948, + "step": 3794 + }, + { + "epoch": 1.0098456625864822, + "grad_norm": 0.36147555708885193, + "learning_rate": 1.8202215483552795e-07, + "loss": 0.198, + "step": 3795 + }, + { + "epoch": 1.010111761575306, + "grad_norm": 0.27455589175224304, + "learning_rate": 1.8201249268600676e-07, + "loss": 0.1999, + "step": 3796 + }, + { + "epoch": 1.0103778605641298, + "grad_norm": 0.26435649394989014, + "learning_rate": 1.8200282819732265e-07, + "loss": 0.2141, + "step": 3797 + }, + { + "epoch": 1.0106439595529537, + "grad_norm": 0.2655662000179291, + "learning_rate": 1.8199316136975122e-07, + "loss": 0.1967, + "step": 3798 + }, + { + "epoch": 1.0109100585417776, + "grad_norm": 0.3194880783557892, + "learning_rate": 1.8198349220356823e-07, + "loss": 0.223, + "step": 3799 + }, + { + "epoch": 1.0111761575306013, + "grad_norm": 0.35995951294898987, + "learning_rate": 1.8197382069904946e-07, + "loss": 0.2146, + "step": 3800 + }, + { + "epoch": 1.0114422565194252, + "grad_norm": 0.3148493766784668, + "learning_rate": 1.819641468564707e-07, + "loss": 0.2038, + "step": 3801 + }, + { + "epoch": 1.0117083555082491, + "grad_norm": 0.30482321977615356, + "learning_rate": 1.8195447067610795e-07, + "loss": 0.2021, + "step": 3802 + }, + { + "epoch": 1.011974454497073, + "grad_norm": 0.3770957291126251, + "learning_rate": 1.8194479215823713e-07, + "loss": 0.2249, + "step": 3803 + }, + { + "epoch": 1.0122405534858967, + "grad_norm": 0.7411269545555115, + "learning_rate": 1.8193511130313432e-07, + "loss": 0.2287, + "step": 3804 + }, + { + "epoch": 1.0125066524747206, + "grad_norm": 0.28607580065727234, + "learning_rate": 1.8192542811107564e-07, + "loss": 0.1953, + "step": 3805 + }, + { + "epoch": 1.0127727514635445, + "grad_norm": 0.25232183933258057, + "learning_rate": 1.8191574258233729e-07, + "loss": 0.1679, + "step": 3806 + }, + { + "epoch": 1.0130388504523682, + "grad_norm": 0.3418407738208771, + "learning_rate": 1.8190605471719547e-07, + "loss": 0.1933, + "step": 3807 + }, + { + "epoch": 1.013304949441192, + "grad_norm": 0.28798961639404297, + "learning_rate": 1.8189636451592655e-07, + "loss": 0.1987, + "step": 3808 + }, + { + "epoch": 1.013571048430016, + "grad_norm": 0.27087467908859253, + "learning_rate": 1.8188667197880689e-07, + "loss": 0.198, + "step": 3809 + }, + { + "epoch": 1.01383714741884, + "grad_norm": 0.2617466449737549, + "learning_rate": 1.8187697710611296e-07, + "loss": 0.1892, + "step": 3810 + }, + { + "epoch": 1.0141032464076636, + "grad_norm": 0.263156533241272, + "learning_rate": 1.8186727989812124e-07, + "loss": 0.2011, + "step": 3811 + }, + { + "epoch": 1.0143693453964875, + "grad_norm": 0.2873097062110901, + "learning_rate": 1.8185758035510832e-07, + "loss": 0.2158, + "step": 3812 + }, + { + "epoch": 1.0146354443853114, + "grad_norm": 0.40078186988830566, + "learning_rate": 1.8184787847735088e-07, + "loss": 0.2029, + "step": 3813 + }, + { + "epoch": 1.014901543374135, + "grad_norm": 0.28821274638175964, + "learning_rate": 1.818381742651256e-07, + "loss": 0.2248, + "step": 3814 + }, + { + "epoch": 1.015167642362959, + "grad_norm": 0.3080149292945862, + "learning_rate": 1.8182846771870931e-07, + "loss": 0.1973, + "step": 3815 + }, + { + "epoch": 1.0154337413517829, + "grad_norm": 0.3699595034122467, + "learning_rate": 1.8181875883837888e-07, + "loss": 0.217, + "step": 3816 + }, + { + "epoch": 1.0156998403406068, + "grad_norm": 0.37228912115097046, + "learning_rate": 1.8180904762441114e-07, + "loss": 0.1899, + "step": 3817 + }, + { + "epoch": 1.0159659393294305, + "grad_norm": 0.46246621012687683, + "learning_rate": 1.817993340770831e-07, + "loss": 0.2133, + "step": 3818 + }, + { + "epoch": 1.0162320383182544, + "grad_norm": 0.26787975430488586, + "learning_rate": 1.8178961819667187e-07, + "loss": 0.2126, + "step": 3819 + }, + { + "epoch": 1.0164981373070783, + "grad_norm": 0.28322526812553406, + "learning_rate": 1.817798999834545e-07, + "loss": 0.1818, + "step": 3820 + }, + { + "epoch": 1.0167642362959022, + "grad_norm": 0.2818073630332947, + "learning_rate": 1.8177017943770817e-07, + "loss": 0.2122, + "step": 3821 + }, + { + "epoch": 1.0170303352847259, + "grad_norm": 0.25759702920913696, + "learning_rate": 1.817604565597102e-07, + "loss": 0.1942, + "step": 3822 + }, + { + "epoch": 1.0172964342735498, + "grad_norm": 0.2508583664894104, + "learning_rate": 1.8175073134973787e-07, + "loss": 0.1831, + "step": 3823 + }, + { + "epoch": 1.0175625332623737, + "grad_norm": 0.40199658274650574, + "learning_rate": 1.8174100380806853e-07, + "loss": 0.2066, + "step": 3824 + }, + { + "epoch": 1.0178286322511974, + "grad_norm": 0.4193482995033264, + "learning_rate": 1.8173127393497968e-07, + "loss": 0.2082, + "step": 3825 + }, + { + "epoch": 1.0180947312400213, + "grad_norm": 0.27529624104499817, + "learning_rate": 1.817215417307488e-07, + "loss": 0.1912, + "step": 3826 + }, + { + "epoch": 1.0183608302288452, + "grad_norm": 0.3179660141468048, + "learning_rate": 1.8171180719565346e-07, + "loss": 0.2148, + "step": 3827 + }, + { + "epoch": 1.018626929217669, + "grad_norm": 0.28123971819877625, + "learning_rate": 1.8170207032997136e-07, + "loss": 0.1996, + "step": 3828 + }, + { + "epoch": 1.0188930282064927, + "grad_norm": 0.6620848178863525, + "learning_rate": 1.8169233113398017e-07, + "loss": 0.1853, + "step": 3829 + }, + { + "epoch": 1.0191591271953166, + "grad_norm": 0.32983720302581787, + "learning_rate": 1.816825896079577e-07, + "loss": 0.1959, + "step": 3830 + }, + { + "epoch": 1.0194252261841406, + "grad_norm": 0.2844213545322418, + "learning_rate": 1.8167284575218178e-07, + "loss": 0.1766, + "step": 3831 + }, + { + "epoch": 1.0196913251729642, + "grad_norm": 0.2629064917564392, + "learning_rate": 1.8166309956693034e-07, + "loss": 0.1872, + "step": 3832 + }, + { + "epoch": 1.0199574241617881, + "grad_norm": 0.27273833751678467, + "learning_rate": 1.8165335105248136e-07, + "loss": 0.2068, + "step": 3833 + }, + { + "epoch": 1.020223523150612, + "grad_norm": 0.2559123933315277, + "learning_rate": 1.8164360020911286e-07, + "loss": 0.1865, + "step": 3834 + }, + { + "epoch": 1.020489622139436, + "grad_norm": 0.3812136650085449, + "learning_rate": 1.81633847037103e-07, + "loss": 0.2091, + "step": 3835 + }, + { + "epoch": 1.0207557211282596, + "grad_norm": 0.2741689682006836, + "learning_rate": 1.8162409153672992e-07, + "loss": 0.2005, + "step": 3836 + }, + { + "epoch": 1.0210218201170835, + "grad_norm": 0.25644221901893616, + "learning_rate": 1.816143337082719e-07, + "loss": 0.1966, + "step": 3837 + }, + { + "epoch": 1.0212879191059074, + "grad_norm": 0.2987830638885498, + "learning_rate": 1.8160457355200717e-07, + "loss": 0.2182, + "step": 3838 + }, + { + "epoch": 1.0215540180947313, + "grad_norm": 0.30307185649871826, + "learning_rate": 1.8159481106821427e-07, + "loss": 0.2127, + "step": 3839 + }, + { + "epoch": 1.021820117083555, + "grad_norm": 0.37773334980010986, + "learning_rate": 1.815850462571715e-07, + "loss": 0.2041, + "step": 3840 + }, + { + "epoch": 1.022086216072379, + "grad_norm": 0.27076613903045654, + "learning_rate": 1.815752791191574e-07, + "loss": 0.1901, + "step": 3841 + }, + { + "epoch": 1.0223523150612028, + "grad_norm": 0.3225579559803009, + "learning_rate": 1.8156550965445058e-07, + "loss": 0.2037, + "step": 3842 + }, + { + "epoch": 1.0226184140500265, + "grad_norm": 0.27784451842308044, + "learning_rate": 1.815557378633297e-07, + "loss": 0.195, + "step": 3843 + }, + { + "epoch": 1.0228845130388504, + "grad_norm": 0.34294018149375916, + "learning_rate": 1.8154596374607342e-07, + "loss": 0.1963, + "step": 3844 + }, + { + "epoch": 1.0231506120276743, + "grad_norm": 0.2632116377353668, + "learning_rate": 1.8153618730296056e-07, + "loss": 0.1932, + "step": 3845 + }, + { + "epoch": 1.0234167110164982, + "grad_norm": 0.26653653383255005, + "learning_rate": 1.8152640853426995e-07, + "loss": 0.1997, + "step": 3846 + }, + { + "epoch": 1.023682810005322, + "grad_norm": 0.3542065918445587, + "learning_rate": 1.815166274402805e-07, + "loss": 0.1956, + "step": 3847 + }, + { + "epoch": 1.0239489089941458, + "grad_norm": 0.27966180443763733, + "learning_rate": 1.815068440212712e-07, + "loss": 0.2045, + "step": 3848 + }, + { + "epoch": 1.0242150079829697, + "grad_norm": 0.27304649353027344, + "learning_rate": 1.8149705827752105e-07, + "loss": 0.2026, + "step": 3849 + }, + { + "epoch": 1.0244811069717934, + "grad_norm": 0.3371782898902893, + "learning_rate": 1.814872702093092e-07, + "loss": 0.2189, + "step": 3850 + }, + { + "epoch": 1.0247472059606173, + "grad_norm": 0.33050617575645447, + "learning_rate": 1.814774798169148e-07, + "loss": 0.2086, + "step": 3851 + }, + { + "epoch": 1.0250133049494412, + "grad_norm": 0.39283376932144165, + "learning_rate": 1.8146768710061713e-07, + "loss": 0.1897, + "step": 3852 + }, + { + "epoch": 1.025279403938265, + "grad_norm": 0.27384185791015625, + "learning_rate": 1.8145789206069546e-07, + "loss": 0.1952, + "step": 3853 + }, + { + "epoch": 1.0255455029270888, + "grad_norm": 0.2697853744029999, + "learning_rate": 1.814480946974292e-07, + "loss": 0.1906, + "step": 3854 + }, + { + "epoch": 1.0258116019159127, + "grad_norm": 0.24143116176128387, + "learning_rate": 1.8143829501109774e-07, + "loss": 0.1954, + "step": 3855 + }, + { + "epoch": 1.0260777009047366, + "grad_norm": 0.31152400374412537, + "learning_rate": 1.8142849300198062e-07, + "loss": 0.2001, + "step": 3856 + }, + { + "epoch": 1.0263437998935605, + "grad_norm": 0.3124155104160309, + "learning_rate": 1.8141868867035744e-07, + "loss": 0.193, + "step": 3857 + }, + { + "epoch": 1.0266098988823842, + "grad_norm": 0.281058132648468, + "learning_rate": 1.8140888201650775e-07, + "loss": 0.2062, + "step": 3858 + }, + { + "epoch": 1.026875997871208, + "grad_norm": 0.27487504482269287, + "learning_rate": 1.8139907304071134e-07, + "loss": 0.2029, + "step": 3859 + }, + { + "epoch": 1.027142096860032, + "grad_norm": 0.2472769320011139, + "learning_rate": 1.81389261743248e-07, + "loss": 0.1876, + "step": 3860 + }, + { + "epoch": 1.0274081958488557, + "grad_norm": 0.29288822412490845, + "learning_rate": 1.8137944812439744e-07, + "loss": 0.2129, + "step": 3861 + }, + { + "epoch": 1.0276742948376796, + "grad_norm": 0.3265572488307953, + "learning_rate": 1.813696321844397e-07, + "loss": 0.2286, + "step": 3862 + }, + { + "epoch": 1.0279403938265035, + "grad_norm": 0.27661439776420593, + "learning_rate": 1.813598139236547e-07, + "loss": 0.1829, + "step": 3863 + }, + { + "epoch": 1.0282064928153274, + "grad_norm": 0.39921867847442627, + "learning_rate": 1.8134999334232248e-07, + "loss": 0.2337, + "step": 3864 + }, + { + "epoch": 1.028472591804151, + "grad_norm": 0.31225305795669556, + "learning_rate": 1.8134017044072314e-07, + "loss": 0.2133, + "step": 3865 + }, + { + "epoch": 1.028738690792975, + "grad_norm": 0.2790337800979614, + "learning_rate": 1.8133034521913682e-07, + "loss": 0.1715, + "step": 3866 + }, + { + "epoch": 1.0290047897817989, + "grad_norm": 0.35349586606025696, + "learning_rate": 1.813205176778438e-07, + "loss": 0.2128, + "step": 3867 + }, + { + "epoch": 1.0292708887706228, + "grad_norm": 0.2849845290184021, + "learning_rate": 1.8131068781712438e-07, + "loss": 0.2045, + "step": 3868 + }, + { + "epoch": 1.0295369877594465, + "grad_norm": 0.338146835565567, + "learning_rate": 1.8130085563725888e-07, + "loss": 0.214, + "step": 3869 + }, + { + "epoch": 1.0298030867482704, + "grad_norm": 0.4894586503505707, + "learning_rate": 1.812910211385278e-07, + "loss": 0.2042, + "step": 3870 + }, + { + "epoch": 1.0300691857370943, + "grad_norm": 0.33835074305534363, + "learning_rate": 1.8128118432121157e-07, + "loss": 0.2192, + "step": 3871 + }, + { + "epoch": 1.030335284725918, + "grad_norm": 0.3514658212661743, + "learning_rate": 1.812713451855908e-07, + "loss": 0.2193, + "step": 3872 + }, + { + "epoch": 1.0306013837147419, + "grad_norm": 0.34141337871551514, + "learning_rate": 1.8126150373194615e-07, + "loss": 0.1983, + "step": 3873 + }, + { + "epoch": 1.0308674827035658, + "grad_norm": 0.3959517180919647, + "learning_rate": 1.8125165996055825e-07, + "loss": 0.1992, + "step": 3874 + }, + { + "epoch": 1.0311335816923897, + "grad_norm": 0.292021781206131, + "learning_rate": 1.812418138717079e-07, + "loss": 0.2152, + "step": 3875 + }, + { + "epoch": 1.0313996806812133, + "grad_norm": 0.34938985109329224, + "learning_rate": 1.8123196546567595e-07, + "loss": 0.2078, + "step": 3876 + }, + { + "epoch": 1.0316657796700373, + "grad_norm": 0.24335214495658875, + "learning_rate": 1.8122211474274326e-07, + "loss": 0.1715, + "step": 3877 + }, + { + "epoch": 1.0319318786588612, + "grad_norm": 0.3995177149772644, + "learning_rate": 1.8121226170319078e-07, + "loss": 0.2022, + "step": 3878 + }, + { + "epoch": 1.0321979776476848, + "grad_norm": 0.28033560514450073, + "learning_rate": 1.8120240634729958e-07, + "loss": 0.208, + "step": 3879 + }, + { + "epoch": 1.0324640766365087, + "grad_norm": 0.27771562337875366, + "learning_rate": 1.8119254867535075e-07, + "loss": 0.19, + "step": 3880 + }, + { + "epoch": 1.0327301756253326, + "grad_norm": 0.2637884318828583, + "learning_rate": 1.8118268868762544e-07, + "loss": 0.1808, + "step": 3881 + }, + { + "epoch": 1.0329962746141566, + "grad_norm": 0.32658836245536804, + "learning_rate": 1.8117282638440487e-07, + "loss": 0.1815, + "step": 3882 + }, + { + "epoch": 1.0332623736029802, + "grad_norm": 0.25819316506385803, + "learning_rate": 1.8116296176597034e-07, + "loss": 0.1977, + "step": 3883 + }, + { + "epoch": 1.0335284725918041, + "grad_norm": 0.41676393151283264, + "learning_rate": 1.8115309483260322e-07, + "loss": 0.2035, + "step": 3884 + }, + { + "epoch": 1.033794571580628, + "grad_norm": 0.28796055912971497, + "learning_rate": 1.8114322558458492e-07, + "loss": 0.2155, + "step": 3885 + }, + { + "epoch": 1.0340606705694517, + "grad_norm": 0.2581664025783539, + "learning_rate": 1.8113335402219692e-07, + "loss": 0.1879, + "step": 3886 + }, + { + "epoch": 1.0343267695582756, + "grad_norm": 0.33124563097953796, + "learning_rate": 1.8112348014572084e-07, + "loss": 0.2199, + "step": 3887 + }, + { + "epoch": 1.0345928685470995, + "grad_norm": 0.3113897144794464, + "learning_rate": 1.811136039554382e-07, + "loss": 0.1926, + "step": 3888 + }, + { + "epoch": 1.0348589675359234, + "grad_norm": 0.25805431604385376, + "learning_rate": 1.8110372545163078e-07, + "loss": 0.1758, + "step": 3889 + }, + { + "epoch": 1.0351250665247471, + "grad_norm": 0.26743146777153015, + "learning_rate": 1.8109384463458026e-07, + "loss": 0.1954, + "step": 3890 + }, + { + "epoch": 1.035391165513571, + "grad_norm": 0.31486785411834717, + "learning_rate": 1.8108396150456855e-07, + "loss": 0.2109, + "step": 3891 + }, + { + "epoch": 1.035657264502395, + "grad_norm": 0.24073123931884766, + "learning_rate": 1.8107407606187746e-07, + "loss": 0.176, + "step": 3892 + }, + { + "epoch": 1.0359233634912188, + "grad_norm": 0.4097626507282257, + "learning_rate": 1.81064188306789e-07, + "loss": 0.1823, + "step": 3893 + }, + { + "epoch": 1.0361894624800425, + "grad_norm": 0.42220011353492737, + "learning_rate": 1.810542982395851e-07, + "loss": 0.2157, + "step": 3894 + }, + { + "epoch": 1.0364555614688664, + "grad_norm": 0.3515620529651642, + "learning_rate": 1.8104440586054794e-07, + "loss": 0.2009, + "step": 3895 + }, + { + "epoch": 1.0367216604576903, + "grad_norm": 0.25343313813209534, + "learning_rate": 1.810345111699596e-07, + "loss": 0.1793, + "step": 3896 + }, + { + "epoch": 1.036987759446514, + "grad_norm": 0.27576181292533875, + "learning_rate": 1.810246141681024e-07, + "loss": 0.1972, + "step": 3897 + }, + { + "epoch": 1.037253858435338, + "grad_norm": 0.2659960091114044, + "learning_rate": 1.810147148552585e-07, + "loss": 0.1944, + "step": 3898 + }, + { + "epoch": 1.0375199574241618, + "grad_norm": 0.28081879019737244, + "learning_rate": 1.8100481323171032e-07, + "loss": 0.2069, + "step": 3899 + }, + { + "epoch": 1.0377860564129857, + "grad_norm": 0.29594749212265015, + "learning_rate": 1.8099490929774026e-07, + "loss": 0.199, + "step": 3900 + }, + { + "epoch": 1.0380521554018094, + "grad_norm": 0.3164302706718445, + "learning_rate": 1.809850030536308e-07, + "loss": 0.2061, + "step": 3901 + }, + { + "epoch": 1.0383182543906333, + "grad_norm": 0.26296085119247437, + "learning_rate": 1.8097509449966446e-07, + "loss": 0.1888, + "step": 3902 + }, + { + "epoch": 1.0385843533794572, + "grad_norm": 0.28564006090164185, + "learning_rate": 1.809651836361239e-07, + "loss": 0.1842, + "step": 3903 + }, + { + "epoch": 1.038850452368281, + "grad_norm": 0.2704889476299286, + "learning_rate": 1.8095527046329179e-07, + "loss": 0.2155, + "step": 3904 + }, + { + "epoch": 1.0391165513571048, + "grad_norm": 0.2816019058227539, + "learning_rate": 1.809453549814508e-07, + "loss": 0.1933, + "step": 3905 + }, + { + "epoch": 1.0393826503459287, + "grad_norm": 0.3226528763771057, + "learning_rate": 1.8093543719088385e-07, + "loss": 0.1999, + "step": 3906 + }, + { + "epoch": 1.0396487493347526, + "grad_norm": 0.3667486310005188, + "learning_rate": 1.8092551709187373e-07, + "loss": 0.1957, + "step": 3907 + }, + { + "epoch": 1.0399148483235763, + "grad_norm": 0.2758514881134033, + "learning_rate": 1.8091559468470344e-07, + "loss": 0.2057, + "step": 3908 + }, + { + "epoch": 1.0401809473124002, + "grad_norm": 0.24267511069774628, + "learning_rate": 1.8090566996965593e-07, + "loss": 0.181, + "step": 3909 + }, + { + "epoch": 1.040447046301224, + "grad_norm": 0.2713896930217743, + "learning_rate": 1.8089574294701433e-07, + "loss": 0.2194, + "step": 3910 + }, + { + "epoch": 1.040713145290048, + "grad_norm": 0.2610212564468384, + "learning_rate": 1.8088581361706176e-07, + "loss": 0.1886, + "step": 3911 + }, + { + "epoch": 1.0409792442788717, + "grad_norm": 0.3727656602859497, + "learning_rate": 1.808758819800814e-07, + "loss": 0.2162, + "step": 3912 + }, + { + "epoch": 1.0412453432676956, + "grad_norm": 0.29267337918281555, + "learning_rate": 1.8086594803635653e-07, + "loss": 0.2113, + "step": 3913 + }, + { + "epoch": 1.0415114422565195, + "grad_norm": 0.28067660331726074, + "learning_rate": 1.808560117861705e-07, + "loss": 0.1946, + "step": 3914 + }, + { + "epoch": 1.0417775412453432, + "grad_norm": 0.32504335045814514, + "learning_rate": 1.808460732298067e-07, + "loss": 0.2086, + "step": 3915 + }, + { + "epoch": 1.042043640234167, + "grad_norm": 0.4251376688480377, + "learning_rate": 1.8083613236754863e-07, + "loss": 0.22, + "step": 3916 + }, + { + "epoch": 1.042309739222991, + "grad_norm": 0.3880545198917389, + "learning_rate": 1.808261891996798e-07, + "loss": 0.2064, + "step": 3917 + }, + { + "epoch": 1.0425758382118149, + "grad_norm": 0.36425042152404785, + "learning_rate": 1.8081624372648378e-07, + "loss": 0.2029, + "step": 3918 + }, + { + "epoch": 1.0428419372006386, + "grad_norm": 0.28274622559547424, + "learning_rate": 1.8080629594824427e-07, + "loss": 0.2018, + "step": 3919 + }, + { + "epoch": 1.0431080361894625, + "grad_norm": 0.402662456035614, + "learning_rate": 1.80796345865245e-07, + "loss": 0.221, + "step": 3920 + }, + { + "epoch": 1.0433741351782864, + "grad_norm": 0.3612014651298523, + "learning_rate": 1.8078639347776977e-07, + "loss": 0.2054, + "step": 3921 + }, + { + "epoch": 1.0436402341671103, + "grad_norm": 0.28548678755760193, + "learning_rate": 1.8077643878610243e-07, + "loss": 0.1922, + "step": 3922 + }, + { + "epoch": 1.043906333155934, + "grad_norm": 0.32099613547325134, + "learning_rate": 1.807664817905269e-07, + "loss": 0.1945, + "step": 3923 + }, + { + "epoch": 1.0441724321447579, + "grad_norm": 0.27859383821487427, + "learning_rate": 1.8075652249132717e-07, + "loss": 0.1953, + "step": 3924 + }, + { + "epoch": 1.0444385311335818, + "grad_norm": 0.2557529807090759, + "learning_rate": 1.8074656088878735e-07, + "loss": 0.2032, + "step": 3925 + }, + { + "epoch": 1.0447046301224054, + "grad_norm": 0.2760503888130188, + "learning_rate": 1.807365969831915e-07, + "loss": 0.2131, + "step": 3926 + }, + { + "epoch": 1.0449707291112293, + "grad_norm": 0.2801801562309265, + "learning_rate": 1.8072663077482385e-07, + "loss": 0.2008, + "step": 3927 + }, + { + "epoch": 1.0452368281000533, + "grad_norm": 0.2552754878997803, + "learning_rate": 1.8071666226396864e-07, + "loss": 0.1846, + "step": 3928 + }, + { + "epoch": 1.0455029270888772, + "grad_norm": 0.25444507598876953, + "learning_rate": 1.807066914509102e-07, + "loss": 0.1869, + "step": 3929 + }, + { + "epoch": 1.0457690260777008, + "grad_norm": 0.26814669370651245, + "learning_rate": 1.8069671833593293e-07, + "loss": 0.171, + "step": 3930 + }, + { + "epoch": 1.0460351250665247, + "grad_norm": 0.2598755955696106, + "learning_rate": 1.8068674291932125e-07, + "loss": 0.1896, + "step": 3931 + }, + { + "epoch": 1.0463012240553486, + "grad_norm": 0.2612512409687042, + "learning_rate": 1.8067676520135968e-07, + "loss": 0.1957, + "step": 3932 + }, + { + "epoch": 1.0465673230441723, + "grad_norm": 0.2692943811416626, + "learning_rate": 1.8066678518233287e-07, + "loss": 0.1944, + "step": 3933 + }, + { + "epoch": 1.0468334220329962, + "grad_norm": 0.2624310553073883, + "learning_rate": 1.8065680286252542e-07, + "loss": 0.1924, + "step": 3934 + }, + { + "epoch": 1.0470995210218201, + "grad_norm": 0.38877353072166443, + "learning_rate": 1.80646818242222e-07, + "loss": 0.1995, + "step": 3935 + }, + { + "epoch": 1.047365620010644, + "grad_norm": 0.2424582540988922, + "learning_rate": 1.8063683132170748e-07, + "loss": 0.1797, + "step": 3936 + }, + { + "epoch": 1.0476317189994677, + "grad_norm": 0.35941481590270996, + "learning_rate": 1.8062684210126666e-07, + "loss": 0.2196, + "step": 3937 + }, + { + "epoch": 1.0478978179882916, + "grad_norm": 0.28989294171333313, + "learning_rate": 1.8061685058118448e-07, + "loss": 0.1919, + "step": 3938 + }, + { + "epoch": 1.0481639169771155, + "grad_norm": 0.28303036093711853, + "learning_rate": 1.8060685676174584e-07, + "loss": 0.184, + "step": 3939 + }, + { + "epoch": 1.0484300159659394, + "grad_norm": 0.3254125118255615, + "learning_rate": 1.8059686064323593e-07, + "loss": 0.1905, + "step": 3940 + }, + { + "epoch": 1.0486961149547631, + "grad_norm": 0.3132189214229584, + "learning_rate": 1.8058686222593972e-07, + "loss": 0.2037, + "step": 3941 + }, + { + "epoch": 1.048962213943587, + "grad_norm": 0.4109899401664734, + "learning_rate": 1.8057686151014244e-07, + "loss": 0.2189, + "step": 3942 + }, + { + "epoch": 1.049228312932411, + "grad_norm": 0.2698252201080322, + "learning_rate": 1.8056685849612935e-07, + "loss": 0.1934, + "step": 3943 + }, + { + "epoch": 1.0494944119212346, + "grad_norm": 0.29102835059165955, + "learning_rate": 1.8055685318418572e-07, + "loss": 0.1851, + "step": 3944 + }, + { + "epoch": 1.0497605109100585, + "grad_norm": 0.37105226516723633, + "learning_rate": 1.8054684557459695e-07, + "loss": 0.1929, + "step": 3945 + }, + { + "epoch": 1.0500266098988824, + "grad_norm": 0.36004456877708435, + "learning_rate": 1.8053683566764848e-07, + "loss": 0.1995, + "step": 3946 + }, + { + "epoch": 1.0502927088877063, + "grad_norm": 0.2612389326095581, + "learning_rate": 1.8052682346362577e-07, + "loss": 0.1977, + "step": 3947 + }, + { + "epoch": 1.05055880787653, + "grad_norm": 0.28361156582832336, + "learning_rate": 1.805168089628144e-07, + "loss": 0.194, + "step": 3948 + }, + { + "epoch": 1.050824906865354, + "grad_norm": 0.2558179199695587, + "learning_rate": 1.8050679216550005e-07, + "loss": 0.1871, + "step": 3949 + }, + { + "epoch": 1.0510910058541778, + "grad_norm": 0.29322928190231323, + "learning_rate": 1.8049677307196837e-07, + "loss": 0.1958, + "step": 3950 + }, + { + "epoch": 1.0513571048430017, + "grad_norm": 0.23690755665302277, + "learning_rate": 1.8048675168250518e-07, + "loss": 0.1698, + "step": 3951 + }, + { + "epoch": 1.0516232038318254, + "grad_norm": 0.27247801423072815, + "learning_rate": 1.8047672799739625e-07, + "loss": 0.1938, + "step": 3952 + }, + { + "epoch": 1.0518893028206493, + "grad_norm": 0.4520096182823181, + "learning_rate": 1.804667020169275e-07, + "loss": 0.2067, + "step": 3953 + }, + { + "epoch": 1.0521554018094732, + "grad_norm": 0.36335861682891846, + "learning_rate": 1.804566737413849e-07, + "loss": 0.2171, + "step": 3954 + }, + { + "epoch": 1.0524215007982969, + "grad_norm": 0.2586638033390045, + "learning_rate": 1.8044664317105447e-07, + "loss": 0.1812, + "step": 3955 + }, + { + "epoch": 1.0526875997871208, + "grad_norm": 0.34718480706214905, + "learning_rate": 1.804366103062223e-07, + "loss": 0.1993, + "step": 3956 + }, + { + "epoch": 1.0529536987759447, + "grad_norm": 0.33448198437690735, + "learning_rate": 1.8042657514717457e-07, + "loss": 0.2086, + "step": 3957 + }, + { + "epoch": 1.0532197977647686, + "grad_norm": 0.2563773989677429, + "learning_rate": 1.8041653769419743e-07, + "loss": 0.187, + "step": 3958 + }, + { + "epoch": 1.0534858967535923, + "grad_norm": 0.3381064236164093, + "learning_rate": 1.8040649794757728e-07, + "loss": 0.2094, + "step": 3959 + }, + { + "epoch": 1.0537519957424162, + "grad_norm": 0.35160019993782043, + "learning_rate": 1.803964559076004e-07, + "loss": 0.1977, + "step": 3960 + }, + { + "epoch": 1.05401809473124, + "grad_norm": 0.38972553610801697, + "learning_rate": 1.8038641157455316e-07, + "loss": 0.1897, + "step": 3961 + }, + { + "epoch": 1.0542841937200638, + "grad_norm": 0.3481815457344055, + "learning_rate": 1.8037636494872216e-07, + "loss": 0.2253, + "step": 3962 + }, + { + "epoch": 1.0545502927088877, + "grad_norm": 0.2810094654560089, + "learning_rate": 1.8036631603039392e-07, + "loss": 0.1938, + "step": 3963 + }, + { + "epoch": 1.0548163916977116, + "grad_norm": 0.35427799820899963, + "learning_rate": 1.80356264819855e-07, + "loss": 0.2174, + "step": 3964 + }, + { + "epoch": 1.0550824906865355, + "grad_norm": 0.2613235116004944, + "learning_rate": 1.8034621131739213e-07, + "loss": 0.1932, + "step": 3965 + }, + { + "epoch": 1.0553485896753592, + "grad_norm": 0.415336012840271, + "learning_rate": 1.8033615552329205e-07, + "loss": 0.2025, + "step": 3966 + }, + { + "epoch": 1.055614688664183, + "grad_norm": 0.2888806164264679, + "learning_rate": 1.8032609743784153e-07, + "loss": 0.2192, + "step": 3967 + }, + { + "epoch": 1.055880787653007, + "grad_norm": 0.36064791679382324, + "learning_rate": 1.8031603706132753e-07, + "loss": 0.1905, + "step": 3968 + }, + { + "epoch": 1.0561468866418307, + "grad_norm": 0.3427048921585083, + "learning_rate": 1.803059743940369e-07, + "loss": 0.2021, + "step": 3969 + }, + { + "epoch": 1.0564129856306546, + "grad_norm": 0.2725357711315155, + "learning_rate": 1.802959094362567e-07, + "loss": 0.1984, + "step": 3970 + }, + { + "epoch": 1.0566790846194785, + "grad_norm": 0.411103218793869, + "learning_rate": 1.80285842188274e-07, + "loss": 0.2281, + "step": 3971 + }, + { + "epoch": 1.0569451836083024, + "grad_norm": 0.46222174167633057, + "learning_rate": 1.8027577265037594e-07, + "loss": 0.2264, + "step": 3972 + }, + { + "epoch": 1.057211282597126, + "grad_norm": 0.2622698247432709, + "learning_rate": 1.802657008228497e-07, + "loss": 0.1989, + "step": 3973 + }, + { + "epoch": 1.05747738158595, + "grad_norm": 0.2879634499549866, + "learning_rate": 1.8025562670598258e-07, + "loss": 0.1987, + "step": 3974 + }, + { + "epoch": 1.0577434805747739, + "grad_norm": 0.2687787115573883, + "learning_rate": 1.802455503000619e-07, + "loss": 0.1988, + "step": 3975 + }, + { + "epoch": 1.0580095795635978, + "grad_norm": 0.6007152795791626, + "learning_rate": 1.8023547160537504e-07, + "loss": 0.243, + "step": 3976 + }, + { + "epoch": 1.0582756785524214, + "grad_norm": 0.32203176617622375, + "learning_rate": 1.802253906222095e-07, + "loss": 0.2077, + "step": 3977 + }, + { + "epoch": 1.0585417775412453, + "grad_norm": 0.3778374493122101, + "learning_rate": 1.8021530735085278e-07, + "loss": 0.218, + "step": 3978 + }, + { + "epoch": 1.0588078765300692, + "grad_norm": 0.2562639117240906, + "learning_rate": 1.802052217915925e-07, + "loss": 0.2022, + "step": 3979 + }, + { + "epoch": 1.059073975518893, + "grad_norm": 0.30773934721946716, + "learning_rate": 1.801951339447163e-07, + "loss": 0.194, + "step": 3980 + }, + { + "epoch": 1.0593400745077168, + "grad_norm": 0.3600577116012573, + "learning_rate": 1.8018504381051195e-07, + "loss": 0.2086, + "step": 3981 + }, + { + "epoch": 1.0596061734965407, + "grad_norm": 0.2931470274925232, + "learning_rate": 1.8017495138926716e-07, + "loss": 0.2007, + "step": 3982 + }, + { + "epoch": 1.0598722724853646, + "grad_norm": 0.27822577953338623, + "learning_rate": 1.8016485668126985e-07, + "loss": 0.1902, + "step": 3983 + }, + { + "epoch": 1.0601383714741883, + "grad_norm": 0.3511017858982086, + "learning_rate": 1.8015475968680794e-07, + "loss": 0.201, + "step": 3984 + }, + { + "epoch": 1.0604044704630122, + "grad_norm": 0.2781083583831787, + "learning_rate": 1.8014466040616938e-07, + "loss": 0.2046, + "step": 3985 + }, + { + "epoch": 1.0606705694518361, + "grad_norm": 0.36474961042404175, + "learning_rate": 1.8013455883964227e-07, + "loss": 0.2149, + "step": 3986 + }, + { + "epoch": 1.06093666844066, + "grad_norm": 0.3565638065338135, + "learning_rate": 1.8012445498751467e-07, + "loss": 0.2199, + "step": 3987 + }, + { + "epoch": 1.0612027674294837, + "grad_norm": 0.35407620668411255, + "learning_rate": 1.801143488500748e-07, + "loss": 0.2068, + "step": 3988 + }, + { + "epoch": 1.0614688664183076, + "grad_norm": 0.2709994614124298, + "learning_rate": 1.8010424042761093e-07, + "loss": 0.1795, + "step": 3989 + }, + { + "epoch": 1.0617349654071315, + "grad_norm": 0.28785738348960876, + "learning_rate": 1.800941297204113e-07, + "loss": 0.1973, + "step": 3990 + }, + { + "epoch": 1.0620010643959552, + "grad_norm": 0.47926586866378784, + "learning_rate": 1.8008401672876438e-07, + "loss": 0.1927, + "step": 3991 + }, + { + "epoch": 1.0622671633847791, + "grad_norm": 0.32468047738075256, + "learning_rate": 1.800739014529585e-07, + "loss": 0.209, + "step": 3992 + }, + { + "epoch": 1.062533262373603, + "grad_norm": 0.272582083940506, + "learning_rate": 1.8006378389328227e-07, + "loss": 0.1977, + "step": 3993 + }, + { + "epoch": 1.062799361362427, + "grad_norm": 0.3472599983215332, + "learning_rate": 1.8005366405002423e-07, + "loss": 0.2029, + "step": 3994 + }, + { + "epoch": 1.0630654603512506, + "grad_norm": 0.2616603672504425, + "learning_rate": 1.80043541923473e-07, + "loss": 0.197, + "step": 3995 + }, + { + "epoch": 1.0633315593400745, + "grad_norm": 0.2929458022117615, + "learning_rate": 1.8003341751391732e-07, + "loss": 0.1828, + "step": 3996 + }, + { + "epoch": 1.0635976583288984, + "grad_norm": 0.3274747133255005, + "learning_rate": 1.800232908216459e-07, + "loss": 0.2108, + "step": 3997 + }, + { + "epoch": 1.063863757317722, + "grad_norm": 0.253935307264328, + "learning_rate": 1.8001316184694765e-07, + "loss": 0.2031, + "step": 3998 + }, + { + "epoch": 1.064129856306546, + "grad_norm": 0.26983746886253357, + "learning_rate": 1.8000303059011142e-07, + "loss": 0.1781, + "step": 3999 + }, + { + "epoch": 1.06439595529537, + "grad_norm": 0.3370818793773651, + "learning_rate": 1.7999289705142616e-07, + "loss": 0.2092, + "step": 4000 + }, + { + "epoch": 1.0646620542841938, + "grad_norm": 0.31836870312690735, + "learning_rate": 1.7998276123118098e-07, + "loss": 0.1905, + "step": 4001 + }, + { + "epoch": 1.0649281532730175, + "grad_norm": 0.30651915073394775, + "learning_rate": 1.7997262312966486e-07, + "loss": 0.2094, + "step": 4002 + }, + { + "epoch": 1.0651942522618414, + "grad_norm": 0.28199219703674316, + "learning_rate": 1.7996248274716703e-07, + "loss": 0.2084, + "step": 4003 + }, + { + "epoch": 1.0654603512506653, + "grad_norm": 0.2973892092704773, + "learning_rate": 1.799523400839767e-07, + "loss": 0.2131, + "step": 4004 + }, + { + "epoch": 1.065726450239489, + "grad_norm": 0.2691825330257416, + "learning_rate": 1.7994219514038317e-07, + "loss": 0.1926, + "step": 4005 + }, + { + "epoch": 1.0659925492283129, + "grad_norm": 0.3435136079788208, + "learning_rate": 1.7993204791667577e-07, + "loss": 0.1953, + "step": 4006 + }, + { + "epoch": 1.0662586482171368, + "grad_norm": 0.43113675713539124, + "learning_rate": 1.7992189841314396e-07, + "loss": 0.2049, + "step": 4007 + }, + { + "epoch": 1.0665247472059607, + "grad_norm": 0.2557879090309143, + "learning_rate": 1.7991174663007717e-07, + "loss": 0.2015, + "step": 4008 + }, + { + "epoch": 1.0667908461947844, + "grad_norm": 0.2629900276660919, + "learning_rate": 1.79901592567765e-07, + "loss": 0.165, + "step": 4009 + }, + { + "epoch": 1.0670569451836083, + "grad_norm": 0.2633045017719269, + "learning_rate": 1.7989143622649703e-07, + "loss": 0.2038, + "step": 4010 + }, + { + "epoch": 1.0673230441724322, + "grad_norm": 0.2551839351654053, + "learning_rate": 1.7988127760656293e-07, + "loss": 0.2024, + "step": 4011 + }, + { + "epoch": 1.067589143161256, + "grad_norm": 0.29602494835853577, + "learning_rate": 1.798711167082525e-07, + "loss": 0.2103, + "step": 4012 + }, + { + "epoch": 1.0678552421500798, + "grad_norm": 0.35392075777053833, + "learning_rate": 1.7986095353185552e-07, + "loss": 0.2132, + "step": 4013 + }, + { + "epoch": 1.0681213411389037, + "grad_norm": 0.2834084630012512, + "learning_rate": 1.7985078807766182e-07, + "loss": 0.2154, + "step": 4014 + }, + { + "epoch": 1.0683874401277276, + "grad_norm": 0.374447762966156, + "learning_rate": 1.798406203459614e-07, + "loss": 0.2005, + "step": 4015 + }, + { + "epoch": 1.0686535391165513, + "grad_norm": 0.31976696848869324, + "learning_rate": 1.7983045033704424e-07, + "loss": 0.1932, + "step": 4016 + }, + { + "epoch": 1.0689196381053752, + "grad_norm": 0.3773699402809143, + "learning_rate": 1.7982027805120042e-07, + "loss": 0.2125, + "step": 4017 + }, + { + "epoch": 1.069185737094199, + "grad_norm": 0.2573809027671814, + "learning_rate": 1.798101034887201e-07, + "loss": 0.1925, + "step": 4018 + }, + { + "epoch": 1.069451836083023, + "grad_norm": 0.2867128252983093, + "learning_rate": 1.797999266498934e-07, + "loss": 0.203, + "step": 4019 + }, + { + "epoch": 1.0697179350718466, + "grad_norm": 0.39853164553642273, + "learning_rate": 1.7978974753501063e-07, + "loss": 0.1987, + "step": 4020 + }, + { + "epoch": 1.0699840340606706, + "grad_norm": 0.26036015152931213, + "learning_rate": 1.7977956614436213e-07, + "loss": 0.1869, + "step": 4021 + }, + { + "epoch": 1.0702501330494945, + "grad_norm": 0.36858275532722473, + "learning_rate": 1.797693824782383e-07, + "loss": 0.2063, + "step": 4022 + }, + { + "epoch": 1.0705162320383184, + "grad_norm": 0.26343727111816406, + "learning_rate": 1.7975919653692956e-07, + "loss": 0.1913, + "step": 4023 + }, + { + "epoch": 1.070782331027142, + "grad_norm": 0.27581432461738586, + "learning_rate": 1.797490083207265e-07, + "loss": 0.1858, + "step": 4024 + }, + { + "epoch": 1.071048430015966, + "grad_norm": 0.2884671688079834, + "learning_rate": 1.797388178299196e-07, + "loss": 0.2033, + "step": 4025 + }, + { + "epoch": 1.0713145290047899, + "grad_norm": 0.25526249408721924, + "learning_rate": 1.7972862506479963e-07, + "loss": 0.1832, + "step": 4026 + }, + { + "epoch": 1.0715806279936135, + "grad_norm": 0.3596367835998535, + "learning_rate": 1.7971843002565723e-07, + "loss": 0.1967, + "step": 4027 + }, + { + "epoch": 1.0718467269824374, + "grad_norm": 0.2815338373184204, + "learning_rate": 1.797082327127832e-07, + "loss": 0.1929, + "step": 4028 + }, + { + "epoch": 1.0721128259712613, + "grad_norm": 0.2746942341327667, + "learning_rate": 1.7969803312646844e-07, + "loss": 0.2284, + "step": 4029 + }, + { + "epoch": 1.0723789249600852, + "grad_norm": 0.8572117686271667, + "learning_rate": 1.7968783126700377e-07, + "loss": 0.1964, + "step": 4030 + }, + { + "epoch": 1.072645023948909, + "grad_norm": 0.3398043215274811, + "learning_rate": 1.7967762713468028e-07, + "loss": 0.2057, + "step": 4031 + }, + { + "epoch": 1.0729111229377328, + "grad_norm": 0.31588438153266907, + "learning_rate": 1.7966742072978888e-07, + "loss": 0.2079, + "step": 4032 + }, + { + "epoch": 1.0731772219265567, + "grad_norm": 0.43963751196861267, + "learning_rate": 1.7965721205262082e-07, + "loss": 0.2211, + "step": 4033 + }, + { + "epoch": 1.0734433209153806, + "grad_norm": 0.3631182909011841, + "learning_rate": 1.7964700110346716e-07, + "loss": 0.1906, + "step": 4034 + }, + { + "epoch": 1.0737094199042043, + "grad_norm": 0.31668820977211, + "learning_rate": 1.796367878826192e-07, + "loss": 0.1934, + "step": 4035 + }, + { + "epoch": 1.0739755188930282, + "grad_norm": 0.2952709496021271, + "learning_rate": 1.796265723903682e-07, + "loss": 0.2029, + "step": 4036 + }, + { + "epoch": 1.0742416178818521, + "grad_norm": 0.24903808534145355, + "learning_rate": 1.7961635462700552e-07, + "loss": 0.182, + "step": 4037 + }, + { + "epoch": 1.0745077168706758, + "grad_norm": 0.26925894618034363, + "learning_rate": 1.7960613459282267e-07, + "loss": 0.1943, + "step": 4038 + }, + { + "epoch": 1.0747738158594997, + "grad_norm": 0.2558692991733551, + "learning_rate": 1.7959591228811107e-07, + "loss": 0.1846, + "step": 4039 + }, + { + "epoch": 1.0750399148483236, + "grad_norm": 0.3574623167514801, + "learning_rate": 1.7958568771316228e-07, + "loss": 0.2078, + "step": 4040 + }, + { + "epoch": 1.0753060138371473, + "grad_norm": 0.3431940972805023, + "learning_rate": 1.79575460868268e-07, + "loss": 0.2042, + "step": 4041 + }, + { + "epoch": 1.0755721128259712, + "grad_norm": 0.2858409881591797, + "learning_rate": 1.7956523175371985e-07, + "loss": 0.2043, + "step": 4042 + }, + { + "epoch": 1.075838211814795, + "grad_norm": 0.2848481833934784, + "learning_rate": 1.795550003698096e-07, + "loss": 0.2042, + "step": 4043 + }, + { + "epoch": 1.076104310803619, + "grad_norm": 0.25539645552635193, + "learning_rate": 1.795447667168291e-07, + "loss": 0.195, + "step": 4044 + }, + { + "epoch": 1.0763704097924427, + "grad_norm": 0.2696840763092041, + "learning_rate": 1.795345307950702e-07, + "loss": 0.188, + "step": 4045 + }, + { + "epoch": 1.0766365087812666, + "grad_norm": 0.2759878933429718, + "learning_rate": 1.7952429260482482e-07, + "loss": 0.1755, + "step": 4046 + }, + { + "epoch": 1.0769026077700905, + "grad_norm": 0.25801369547843933, + "learning_rate": 1.7951405214638506e-07, + "loss": 0.1929, + "step": 4047 + }, + { + "epoch": 1.0771687067589144, + "grad_norm": 0.3472664952278137, + "learning_rate": 1.7950380942004296e-07, + "loss": 0.1953, + "step": 4048 + }, + { + "epoch": 1.077434805747738, + "grad_norm": 0.3040691614151001, + "learning_rate": 1.794935644260906e-07, + "loss": 0.2001, + "step": 4049 + }, + { + "epoch": 1.077700904736562, + "grad_norm": 0.2472582906484604, + "learning_rate": 1.7948331716482028e-07, + "loss": 0.1713, + "step": 4050 + }, + { + "epoch": 1.077967003725386, + "grad_norm": 0.27802690863609314, + "learning_rate": 1.7947306763652423e-07, + "loss": 0.1924, + "step": 4051 + }, + { + "epoch": 1.0782331027142096, + "grad_norm": 0.45094701647758484, + "learning_rate": 1.794628158414948e-07, + "loss": 0.1881, + "step": 4052 + }, + { + "epoch": 1.0784992017030335, + "grad_norm": 0.27862003445625305, + "learning_rate": 1.794525617800244e-07, + "loss": 0.2163, + "step": 4053 + }, + { + "epoch": 1.0787653006918574, + "grad_norm": 0.4438345432281494, + "learning_rate": 1.7944230545240545e-07, + "loss": 0.2144, + "step": 4054 + }, + { + "epoch": 1.0790313996806813, + "grad_norm": 0.3376157581806183, + "learning_rate": 1.7943204685893055e-07, + "loss": 0.2058, + "step": 4055 + }, + { + "epoch": 1.079297498669505, + "grad_norm": 0.27408015727996826, + "learning_rate": 1.7942178599989224e-07, + "loss": 0.1877, + "step": 4056 + }, + { + "epoch": 1.0795635976583289, + "grad_norm": 0.33618447184562683, + "learning_rate": 1.794115228755832e-07, + "loss": 0.2326, + "step": 4057 + }, + { + "epoch": 1.0798296966471528, + "grad_norm": 0.9148443937301636, + "learning_rate": 1.7940125748629616e-07, + "loss": 0.1805, + "step": 4058 + }, + { + "epoch": 1.0800957956359767, + "grad_norm": 0.6263395547866821, + "learning_rate": 1.793909898323239e-07, + "loss": 0.1821, + "step": 4059 + }, + { + "epoch": 1.0803618946248004, + "grad_norm": 0.3084578514099121, + "learning_rate": 1.793807199139593e-07, + "loss": 0.2152, + "step": 4060 + }, + { + "epoch": 1.0806279936136243, + "grad_norm": 0.2519201338291168, + "learning_rate": 1.7937044773149522e-07, + "loss": 0.1766, + "step": 4061 + }, + { + "epoch": 1.0808940926024482, + "grad_norm": 0.3601461946964264, + "learning_rate": 1.793601732852247e-07, + "loss": 0.2038, + "step": 4062 + }, + { + "epoch": 1.0811601915912719, + "grad_norm": 0.32908499240875244, + "learning_rate": 1.793498965754408e-07, + "loss": 0.205, + "step": 4063 + }, + { + "epoch": 1.0814262905800958, + "grad_norm": 0.4258566200733185, + "learning_rate": 1.7933961760243657e-07, + "loss": 0.2066, + "step": 4064 + }, + { + "epoch": 1.0816923895689197, + "grad_norm": 0.45581597089767456, + "learning_rate": 1.793293363665052e-07, + "loss": 0.2045, + "step": 4065 + }, + { + "epoch": 1.0819584885577436, + "grad_norm": 0.26394122838974, + "learning_rate": 1.7931905286793997e-07, + "loss": 0.1921, + "step": 4066 + }, + { + "epoch": 1.0822245875465673, + "grad_norm": 0.3415225148200989, + "learning_rate": 1.7930876710703422e-07, + "loss": 0.1974, + "step": 4067 + }, + { + "epoch": 1.0824906865353912, + "grad_norm": 0.26574185490608215, + "learning_rate": 1.792984790840812e-07, + "loss": 0.1639, + "step": 4068 + }, + { + "epoch": 1.082756785524215, + "grad_norm": 0.252924382686615, + "learning_rate": 1.7928818879937446e-07, + "loss": 0.1878, + "step": 4069 + }, + { + "epoch": 1.083022884513039, + "grad_norm": 0.25491565465927124, + "learning_rate": 1.7927789625320742e-07, + "loss": 0.1927, + "step": 4070 + }, + { + "epoch": 1.0832889835018626, + "grad_norm": 0.2541539967060089, + "learning_rate": 1.792676014458737e-07, + "loss": 0.2078, + "step": 4071 + }, + { + "epoch": 1.0835550824906865, + "grad_norm": 0.26844900846481323, + "learning_rate": 1.792573043776669e-07, + "loss": 0.1982, + "step": 4072 + }, + { + "epoch": 1.0838211814795105, + "grad_norm": 0.31112879514694214, + "learning_rate": 1.7924700504888072e-07, + "loss": 0.2005, + "step": 4073 + }, + { + "epoch": 1.0840872804683341, + "grad_norm": 0.39237263798713684, + "learning_rate": 1.7923670345980894e-07, + "loss": 0.2121, + "step": 4074 + }, + { + "epoch": 1.084353379457158, + "grad_norm": 0.323830783367157, + "learning_rate": 1.792263996107453e-07, + "loss": 0.1882, + "step": 4075 + }, + { + "epoch": 1.084619478445982, + "grad_norm": 0.3988247215747833, + "learning_rate": 1.792160935019838e-07, + "loss": 0.2218, + "step": 4076 + }, + { + "epoch": 1.0848855774348058, + "grad_norm": 0.2771044671535492, + "learning_rate": 1.7920578513381833e-07, + "loss": 0.2107, + "step": 4077 + }, + { + "epoch": 1.0851516764236295, + "grad_norm": 0.2852333188056946, + "learning_rate": 1.7919547450654292e-07, + "loss": 0.2154, + "step": 4078 + }, + { + "epoch": 1.0854177754124534, + "grad_norm": 0.27435559034347534, + "learning_rate": 1.791851616204516e-07, + "loss": 0.2068, + "step": 4079 + }, + { + "epoch": 1.0856838744012773, + "grad_norm": 0.256132572889328, + "learning_rate": 1.791748464758386e-07, + "loss": 0.1999, + "step": 4080 + }, + { + "epoch": 1.085949973390101, + "grad_norm": 0.34331852197647095, + "learning_rate": 1.7916452907299808e-07, + "loss": 0.1661, + "step": 4081 + }, + { + "epoch": 1.086216072378925, + "grad_norm": 0.2638603746891022, + "learning_rate": 1.7915420941222434e-07, + "loss": 0.1936, + "step": 4082 + }, + { + "epoch": 1.0864821713677488, + "grad_norm": 0.27704572677612305, + "learning_rate": 1.7914388749381168e-07, + "loss": 0.1994, + "step": 4083 + }, + { + "epoch": 1.0867482703565727, + "grad_norm": 0.24591459333896637, + "learning_rate": 1.7913356331805452e-07, + "loss": 0.1764, + "step": 4084 + }, + { + "epoch": 1.0870143693453964, + "grad_norm": 0.2896430492401123, + "learning_rate": 1.7912323688524733e-07, + "loss": 0.2087, + "step": 4085 + }, + { + "epoch": 1.0872804683342203, + "grad_norm": 0.2643834352493286, + "learning_rate": 1.7911290819568463e-07, + "loss": 0.1723, + "step": 4086 + }, + { + "epoch": 1.0875465673230442, + "grad_norm": 0.32830655574798584, + "learning_rate": 1.7910257724966102e-07, + "loss": 0.1887, + "step": 4087 + }, + { + "epoch": 1.087812666311868, + "grad_norm": 0.41068851947784424, + "learning_rate": 1.7909224404747118e-07, + "loss": 0.2137, + "step": 4088 + }, + { + "epoch": 1.0880787653006918, + "grad_norm": 0.37124302983283997, + "learning_rate": 1.790819085894098e-07, + "loss": 0.2004, + "step": 4089 + }, + { + "epoch": 1.0883448642895157, + "grad_norm": 0.26076215505599976, + "learning_rate": 1.790715708757717e-07, + "loss": 0.2049, + "step": 4090 + }, + { + "epoch": 1.0886109632783396, + "grad_norm": 0.2919061481952667, + "learning_rate": 1.7906123090685172e-07, + "loss": 0.2071, + "step": 4091 + }, + { + "epoch": 1.0888770622671633, + "grad_norm": 0.2719835340976715, + "learning_rate": 1.7905088868294477e-07, + "loss": 0.1942, + "step": 4092 + }, + { + "epoch": 1.0891431612559872, + "grad_norm": 0.2868877947330475, + "learning_rate": 1.7904054420434583e-07, + "loss": 0.1932, + "step": 4093 + }, + { + "epoch": 1.089409260244811, + "grad_norm": 0.3657400608062744, + "learning_rate": 1.7903019747134994e-07, + "loss": 0.2125, + "step": 4094 + }, + { + "epoch": 1.089675359233635, + "grad_norm": 0.2655043303966522, + "learning_rate": 1.7901984848425223e-07, + "loss": 0.1955, + "step": 4095 + }, + { + "epoch": 1.0899414582224587, + "grad_norm": 0.3053227365016937, + "learning_rate": 1.7900949724334787e-07, + "loss": 0.2047, + "step": 4096 + }, + { + "epoch": 1.0902075572112826, + "grad_norm": 0.2587873935699463, + "learning_rate": 1.7899914374893207e-07, + "loss": 0.1829, + "step": 4097 + }, + { + "epoch": 1.0904736562001065, + "grad_norm": 0.32666829228401184, + "learning_rate": 1.7898878800130018e-07, + "loss": 0.2233, + "step": 4098 + }, + { + "epoch": 1.0907397551889302, + "grad_norm": 0.3050656020641327, + "learning_rate": 1.7897843000074756e-07, + "loss": 0.1982, + "step": 4099 + }, + { + "epoch": 1.091005854177754, + "grad_norm": 0.25880905985832214, + "learning_rate": 1.789680697475696e-07, + "loss": 0.1826, + "step": 4100 + }, + { + "epoch": 1.091271953166578, + "grad_norm": 0.40463006496429443, + "learning_rate": 1.7895770724206182e-07, + "loss": 0.1914, + "step": 4101 + }, + { + "epoch": 1.091538052155402, + "grad_norm": 0.3789171874523163, + "learning_rate": 1.789473424845198e-07, + "loss": 0.2138, + "step": 4102 + }, + { + "epoch": 1.0918041511442256, + "grad_norm": 0.2558372914791107, + "learning_rate": 1.7893697547523914e-07, + "loss": 0.1861, + "step": 4103 + }, + { + "epoch": 1.0920702501330495, + "grad_norm": 0.3741857409477234, + "learning_rate": 1.789266062145155e-07, + "loss": 0.2055, + "step": 4104 + }, + { + "epoch": 1.0923363491218734, + "grad_norm": 0.2661583721637726, + "learning_rate": 1.7891623470264468e-07, + "loss": 0.1918, + "step": 4105 + }, + { + "epoch": 1.0926024481106973, + "grad_norm": 0.4232231676578522, + "learning_rate": 1.789058609399225e-07, + "loss": 0.197, + "step": 4106 + }, + { + "epoch": 1.092868547099521, + "grad_norm": 0.3106527626514435, + "learning_rate": 1.788954849266448e-07, + "loss": 0.1957, + "step": 4107 + }, + { + "epoch": 1.0931346460883449, + "grad_norm": 0.26636525988578796, + "learning_rate": 1.7888510666310754e-07, + "loss": 0.2, + "step": 4108 + }, + { + "epoch": 1.0934007450771688, + "grad_norm": 0.3042800724506378, + "learning_rate": 1.7887472614960677e-07, + "loss": 0.2081, + "step": 4109 + }, + { + "epoch": 1.0936668440659925, + "grad_norm": 0.4368587136268616, + "learning_rate": 1.788643433864385e-07, + "loss": 0.2123, + "step": 4110 + }, + { + "epoch": 1.0939329430548164, + "grad_norm": 0.3658977448940277, + "learning_rate": 1.7885395837389893e-07, + "loss": 0.2033, + "step": 4111 + }, + { + "epoch": 1.0941990420436403, + "grad_norm": 0.2665468156337738, + "learning_rate": 1.7884357111228419e-07, + "loss": 0.1794, + "step": 4112 + }, + { + "epoch": 1.0944651410324642, + "grad_norm": 0.2743661105632782, + "learning_rate": 1.788331816018906e-07, + "loss": 0.1964, + "step": 4113 + }, + { + "epoch": 1.0947312400212879, + "grad_norm": 0.28905895352363586, + "learning_rate": 1.7882278984301445e-07, + "loss": 0.2029, + "step": 4114 + }, + { + "epoch": 1.0949973390101118, + "grad_norm": 0.3764171898365021, + "learning_rate": 1.788123958359522e-07, + "loss": 0.1957, + "step": 4115 + }, + { + "epoch": 1.0952634379989357, + "grad_norm": 0.264236718416214, + "learning_rate": 1.7880199958100024e-07, + "loss": 0.193, + "step": 4116 + }, + { + "epoch": 1.0955295369877593, + "grad_norm": 0.3507288098335266, + "learning_rate": 1.7879160107845512e-07, + "loss": 0.1941, + "step": 4117 + }, + { + "epoch": 1.0957956359765832, + "grad_norm": 0.3603714108467102, + "learning_rate": 1.7878120032861342e-07, + "loss": 0.1999, + "step": 4118 + }, + { + "epoch": 1.0960617349654072, + "grad_norm": 0.35569992661476135, + "learning_rate": 1.7877079733177183e-07, + "loss": 0.2242, + "step": 4119 + }, + { + "epoch": 1.096327833954231, + "grad_norm": 0.35931330919265747, + "learning_rate": 1.7876039208822702e-07, + "loss": 0.2084, + "step": 4120 + }, + { + "epoch": 1.0965939329430547, + "grad_norm": 0.27686426043510437, + "learning_rate": 1.7874998459827576e-07, + "loss": 0.1741, + "step": 4121 + }, + { + "epoch": 1.0968600319318786, + "grad_norm": 0.24972212314605713, + "learning_rate": 1.7873957486221492e-07, + "loss": 0.1899, + "step": 4122 + }, + { + "epoch": 1.0971261309207025, + "grad_norm": 0.2619699239730835, + "learning_rate": 1.7872916288034137e-07, + "loss": 0.1949, + "step": 4123 + }, + { + "epoch": 1.0973922299095262, + "grad_norm": 0.3151375949382782, + "learning_rate": 1.7871874865295214e-07, + "loss": 0.2065, + "step": 4124 + }, + { + "epoch": 1.0976583288983501, + "grad_norm": 0.2974262237548828, + "learning_rate": 1.7870833218034422e-07, + "loss": 0.1973, + "step": 4125 + }, + { + "epoch": 1.097924427887174, + "grad_norm": 0.27657169103622437, + "learning_rate": 1.7869791346281473e-07, + "loss": 0.2071, + "step": 4126 + }, + { + "epoch": 1.098190526875998, + "grad_norm": 0.27141788601875305, + "learning_rate": 1.7868749250066085e-07, + "loss": 0.2119, + "step": 4127 + }, + { + "epoch": 1.0984566258648216, + "grad_norm": 0.2759656608104706, + "learning_rate": 1.7867706929417975e-07, + "loss": 0.1756, + "step": 4128 + }, + { + "epoch": 1.0987227248536455, + "grad_norm": 0.3465385437011719, + "learning_rate": 1.7866664384366877e-07, + "loss": 0.2156, + "step": 4129 + }, + { + "epoch": 1.0989888238424694, + "grad_norm": 0.29752689599990845, + "learning_rate": 1.7865621614942527e-07, + "loss": 0.2049, + "step": 4130 + }, + { + "epoch": 1.0992549228312933, + "grad_norm": 0.3407396376132965, + "learning_rate": 1.786457862117466e-07, + "loss": 0.1993, + "step": 4131 + }, + { + "epoch": 1.099521021820117, + "grad_norm": 0.8040041327476501, + "learning_rate": 1.786353540309303e-07, + "loss": 0.2003, + "step": 4132 + }, + { + "epoch": 1.099787120808941, + "grad_norm": 0.4391102194786072, + "learning_rate": 1.7862491960727392e-07, + "loss": 0.2185, + "step": 4133 + }, + { + "epoch": 1.1000532197977648, + "grad_norm": 0.34652289748191833, + "learning_rate": 1.7861448294107509e-07, + "loss": 0.1999, + "step": 4134 + }, + { + "epoch": 1.1003193187865885, + "grad_norm": 0.7309248447418213, + "learning_rate": 1.7860404403263142e-07, + "loss": 0.2273, + "step": 4135 + }, + { + "epoch": 1.1005854177754124, + "grad_norm": 0.286953330039978, + "learning_rate": 1.7859360288224067e-07, + "loss": 0.1989, + "step": 4136 + }, + { + "epoch": 1.1008515167642363, + "grad_norm": 0.3263785243034363, + "learning_rate": 1.7858315949020065e-07, + "loss": 0.2005, + "step": 4137 + }, + { + "epoch": 1.1011176157530602, + "grad_norm": 0.2708115577697754, + "learning_rate": 1.785727138568093e-07, + "loss": 0.2011, + "step": 4138 + }, + { + "epoch": 1.101383714741884, + "grad_norm": 0.26765066385269165, + "learning_rate": 1.7856226598236442e-07, + "loss": 0.2187, + "step": 4139 + }, + { + "epoch": 1.1016498137307078, + "grad_norm": 0.33958038687705994, + "learning_rate": 1.785518158671641e-07, + "loss": 0.1931, + "step": 4140 + }, + { + "epoch": 1.1019159127195317, + "grad_norm": 1.1575469970703125, + "learning_rate": 1.7854136351150634e-07, + "loss": 0.1911, + "step": 4141 + }, + { + "epoch": 1.1021820117083556, + "grad_norm": 0.26723769307136536, + "learning_rate": 1.7853090891568932e-07, + "loss": 0.1921, + "step": 4142 + }, + { + "epoch": 1.1024481106971793, + "grad_norm": 0.3474630117416382, + "learning_rate": 1.7852045208001117e-07, + "loss": 0.1971, + "step": 4143 + }, + { + "epoch": 1.1027142096860032, + "grad_norm": 0.2673819065093994, + "learning_rate": 1.7850999300477017e-07, + "loss": 0.1976, + "step": 4144 + }, + { + "epoch": 1.102980308674827, + "grad_norm": 0.4584439992904663, + "learning_rate": 1.7849953169026465e-07, + "loss": 0.2194, + "step": 4145 + }, + { + "epoch": 1.1032464076636508, + "grad_norm": 0.2610188126564026, + "learning_rate": 1.7848906813679295e-07, + "loss": 0.1932, + "step": 4146 + }, + { + "epoch": 1.1035125066524747, + "grad_norm": 0.28715071082115173, + "learning_rate": 1.7847860234465353e-07, + "loss": 0.2053, + "step": 4147 + }, + { + "epoch": 1.1037786056412986, + "grad_norm": 0.27626699209213257, + "learning_rate": 1.784681343141449e-07, + "loss": 0.188, + "step": 4148 + }, + { + "epoch": 1.1040447046301225, + "grad_norm": 0.2552007734775543, + "learning_rate": 1.7845766404556563e-07, + "loss": 0.1813, + "step": 4149 + }, + { + "epoch": 1.1043108036189462, + "grad_norm": 0.33749571442604065, + "learning_rate": 1.7844719153921433e-07, + "loss": 0.2281, + "step": 4150 + }, + { + "epoch": 1.10457690260777, + "grad_norm": 0.25386905670166016, + "learning_rate": 1.7843671679538975e-07, + "loss": 0.1852, + "step": 4151 + }, + { + "epoch": 1.104843001596594, + "grad_norm": 0.30036085844039917, + "learning_rate": 1.7842623981439058e-07, + "loss": 0.2017, + "step": 4152 + }, + { + "epoch": 1.105109100585418, + "grad_norm": 0.2773389518260956, + "learning_rate": 1.7841576059651568e-07, + "loss": 0.203, + "step": 4153 + }, + { + "epoch": 1.1053751995742416, + "grad_norm": 0.26383334398269653, + "learning_rate": 1.7840527914206396e-07, + "loss": 0.196, + "step": 4154 + }, + { + "epoch": 1.1056412985630655, + "grad_norm": 1.446102499961853, + "learning_rate": 1.7839479545133433e-07, + "loss": 0.2022, + "step": 4155 + }, + { + "epoch": 1.1059073975518894, + "grad_norm": 0.26939111948013306, + "learning_rate": 1.7838430952462584e-07, + "loss": 0.1904, + "step": 4156 + }, + { + "epoch": 1.106173496540713, + "grad_norm": 0.27932462096214294, + "learning_rate": 1.7837382136223758e-07, + "loss": 0.194, + "step": 4157 + }, + { + "epoch": 1.106439595529537, + "grad_norm": 0.33645597100257874, + "learning_rate": 1.7836333096446864e-07, + "loss": 0.179, + "step": 4158 + }, + { + "epoch": 1.1067056945183609, + "grad_norm": 0.32951444387435913, + "learning_rate": 1.7835283833161825e-07, + "loss": 0.2249, + "step": 4159 + }, + { + "epoch": 1.1069717935071848, + "grad_norm": 0.32786139845848083, + "learning_rate": 1.7834234346398575e-07, + "loss": 0.1868, + "step": 4160 + }, + { + "epoch": 1.1072378924960085, + "grad_norm": 0.2542473375797272, + "learning_rate": 1.7833184636187032e-07, + "loss": 0.1945, + "step": 4161 + }, + { + "epoch": 1.1075039914848324, + "grad_norm": 0.2694680690765381, + "learning_rate": 1.7832134702557154e-07, + "loss": 0.2168, + "step": 4162 + }, + { + "epoch": 1.1077700904736563, + "grad_norm": 0.2644069194793701, + "learning_rate": 1.7831084545538874e-07, + "loss": 0.2065, + "step": 4163 + }, + { + "epoch": 1.10803618946248, + "grad_norm": 0.29801592230796814, + "learning_rate": 1.7830034165162151e-07, + "loss": 0.2076, + "step": 4164 + }, + { + "epoch": 1.1083022884513039, + "grad_norm": 0.29869213700294495, + "learning_rate": 1.7828983561456941e-07, + "loss": 0.2125, + "step": 4165 + }, + { + "epoch": 1.1085683874401278, + "grad_norm": 0.3826741874217987, + "learning_rate": 1.7827932734453213e-07, + "loss": 0.2119, + "step": 4166 + }, + { + "epoch": 1.1088344864289517, + "grad_norm": 0.26465335488319397, + "learning_rate": 1.7826881684180934e-07, + "loss": 0.1934, + "step": 4167 + }, + { + "epoch": 1.1091005854177753, + "grad_norm": 0.25430572032928467, + "learning_rate": 1.7825830410670084e-07, + "loss": 0.172, + "step": 4168 + }, + { + "epoch": 1.1093666844065992, + "grad_norm": 0.28356829285621643, + "learning_rate": 1.7824778913950648e-07, + "loss": 0.2129, + "step": 4169 + }, + { + "epoch": 1.1096327833954232, + "grad_norm": 0.2924308776855469, + "learning_rate": 1.782372719405262e-07, + "loss": 0.1962, + "step": 4170 + }, + { + "epoch": 1.1098988823842468, + "grad_norm": 0.27177226543426514, + "learning_rate": 1.782267525100599e-07, + "loss": 0.2021, + "step": 4171 + }, + { + "epoch": 1.1101649813730707, + "grad_norm": 0.24904459714889526, + "learning_rate": 1.7821623084840768e-07, + "loss": 0.1787, + "step": 4172 + }, + { + "epoch": 1.1104310803618946, + "grad_norm": 0.2674484848976135, + "learning_rate": 1.782057069558696e-07, + "loss": 0.1976, + "step": 4173 + }, + { + "epoch": 1.1106971793507185, + "grad_norm": 0.3828698992729187, + "learning_rate": 1.7819518083274586e-07, + "loss": 0.1952, + "step": 4174 + }, + { + "epoch": 1.1109632783395422, + "grad_norm": 0.265987753868103, + "learning_rate": 1.7818465247933662e-07, + "loss": 0.1943, + "step": 4175 + }, + { + "epoch": 1.1112293773283661, + "grad_norm": 0.278767466545105, + "learning_rate": 1.7817412189594222e-07, + "loss": 0.1926, + "step": 4176 + }, + { + "epoch": 1.11149547631719, + "grad_norm": 0.2583598792552948, + "learning_rate": 1.7816358908286303e-07, + "loss": 0.1709, + "step": 4177 + }, + { + "epoch": 1.111761575306014, + "grad_norm": 0.32599395513534546, + "learning_rate": 1.7815305404039945e-07, + "loss": 0.1858, + "step": 4178 + }, + { + "epoch": 1.1120276742948376, + "grad_norm": 0.38698020577430725, + "learning_rate": 1.781425167688519e-07, + "loss": 0.2177, + "step": 4179 + }, + { + "epoch": 1.1122937732836615, + "grad_norm": 0.45895659923553467, + "learning_rate": 1.7813197726852103e-07, + "loss": 0.2053, + "step": 4180 + }, + { + "epoch": 1.1125598722724854, + "grad_norm": 0.31311455368995667, + "learning_rate": 1.7812143553970737e-07, + "loss": 0.1921, + "step": 4181 + }, + { + "epoch": 1.112825971261309, + "grad_norm": 0.25844135880470276, + "learning_rate": 1.7811089158271163e-07, + "loss": 0.1877, + "step": 4182 + }, + { + "epoch": 1.113092070250133, + "grad_norm": 0.355949342250824, + "learning_rate": 1.7810034539783453e-07, + "loss": 0.1966, + "step": 4183 + }, + { + "epoch": 1.113358169238957, + "grad_norm": 0.2535010576248169, + "learning_rate": 1.7808979698537688e-07, + "loss": 0.1977, + "step": 4184 + }, + { + "epoch": 1.1136242682277808, + "grad_norm": 0.2540786862373352, + "learning_rate": 1.7807924634563953e-07, + "loss": 0.1854, + "step": 4185 + }, + { + "epoch": 1.1138903672166045, + "grad_norm": 0.3015730381011963, + "learning_rate": 1.7806869347892342e-07, + "loss": 0.1726, + "step": 4186 + }, + { + "epoch": 1.1141564662054284, + "grad_norm": 0.33994343876838684, + "learning_rate": 1.7805813838552952e-07, + "loss": 0.1906, + "step": 4187 + }, + { + "epoch": 1.1144225651942523, + "grad_norm": 0.27823808789253235, + "learning_rate": 1.7804758106575887e-07, + "loss": 0.1891, + "step": 4188 + }, + { + "epoch": 1.1146886641830762, + "grad_norm": 0.3042999505996704, + "learning_rate": 1.7803702151991265e-07, + "loss": 0.2096, + "step": 4189 + }, + { + "epoch": 1.1149547631719, + "grad_norm": 0.2746676206588745, + "learning_rate": 1.7802645974829197e-07, + "loss": 0.1794, + "step": 4190 + }, + { + "epoch": 1.1152208621607238, + "grad_norm": 0.30616846680641174, + "learning_rate": 1.780158957511981e-07, + "loss": 0.2068, + "step": 4191 + }, + { + "epoch": 1.1154869611495477, + "grad_norm": 0.2727515995502472, + "learning_rate": 1.7800532952893236e-07, + "loss": 0.197, + "step": 4192 + }, + { + "epoch": 1.1157530601383714, + "grad_norm": 0.3270769715309143, + "learning_rate": 1.7799476108179607e-07, + "loss": 0.2068, + "step": 4193 + }, + { + "epoch": 1.1160191591271953, + "grad_norm": 0.2827956974506378, + "learning_rate": 1.7798419041009075e-07, + "loss": 0.2003, + "step": 4194 + }, + { + "epoch": 1.1162852581160192, + "grad_norm": 0.3727482557296753, + "learning_rate": 1.779736175141178e-07, + "loss": 0.2135, + "step": 4195 + }, + { + "epoch": 1.116551357104843, + "grad_norm": 0.34263983368873596, + "learning_rate": 1.7796304239417886e-07, + "loss": 0.2254, + "step": 4196 + }, + { + "epoch": 1.1168174560936668, + "grad_norm": 0.29894402623176575, + "learning_rate": 1.7795246505057554e-07, + "loss": 0.222, + "step": 4197 + }, + { + "epoch": 1.1170835550824907, + "grad_norm": 0.2876875102519989, + "learning_rate": 1.779418854836095e-07, + "loss": 0.1991, + "step": 4198 + }, + { + "epoch": 1.1173496540713146, + "grad_norm": 0.27414172887802124, + "learning_rate": 1.7793130369358247e-07, + "loss": 0.1929, + "step": 4199 + }, + { + "epoch": 1.1176157530601383, + "grad_norm": 0.2906729578971863, + "learning_rate": 1.779207196807963e-07, + "loss": 0.22, + "step": 4200 + }, + { + "epoch": 1.1178818520489622, + "grad_norm": 0.29417768120765686, + "learning_rate": 1.7791013344555287e-07, + "loss": 0.188, + "step": 4201 + }, + { + "epoch": 1.118147951037786, + "grad_norm": 0.34117141366004944, + "learning_rate": 1.7789954498815413e-07, + "loss": 0.2007, + "step": 4202 + }, + { + "epoch": 1.11841405002661, + "grad_norm": 0.2609763741493225, + "learning_rate": 1.7788895430890207e-07, + "loss": 0.1848, + "step": 4203 + }, + { + "epoch": 1.1186801490154337, + "grad_norm": 0.3414206802845001, + "learning_rate": 1.7787836140809874e-07, + "loss": 0.2122, + "step": 4204 + }, + { + "epoch": 1.1189462480042576, + "grad_norm": 0.3679276704788208, + "learning_rate": 1.778677662860463e-07, + "loss": 0.217, + "step": 4205 + }, + { + "epoch": 1.1192123469930815, + "grad_norm": 0.3582223057746887, + "learning_rate": 1.778571689430469e-07, + "loss": 0.2161, + "step": 4206 + }, + { + "epoch": 1.1194784459819052, + "grad_norm": 0.262408047914505, + "learning_rate": 1.7784656937940285e-07, + "loss": 0.1954, + "step": 4207 + }, + { + "epoch": 1.119744544970729, + "grad_norm": 0.25939619541168213, + "learning_rate": 1.7783596759541645e-07, + "loss": 0.1919, + "step": 4208 + }, + { + "epoch": 1.120010643959553, + "grad_norm": 0.31304726004600525, + "learning_rate": 1.7782536359139008e-07, + "loss": 0.2111, + "step": 4209 + }, + { + "epoch": 1.1202767429483769, + "grad_norm": 0.30406954884529114, + "learning_rate": 1.778147573676262e-07, + "loss": 0.2223, + "step": 4210 + }, + { + "epoch": 1.1205428419372006, + "grad_norm": 0.3263770043849945, + "learning_rate": 1.778041489244273e-07, + "loss": 0.2069, + "step": 4211 + }, + { + "epoch": 1.1208089409260245, + "grad_norm": 0.2521401047706604, + "learning_rate": 1.7779353826209595e-07, + "loss": 0.1901, + "step": 4212 + }, + { + "epoch": 1.1210750399148484, + "grad_norm": 0.2775024175643921, + "learning_rate": 1.7778292538093485e-07, + "loss": 0.202, + "step": 4213 + }, + { + "epoch": 1.1213411389036723, + "grad_norm": 0.6049270629882812, + "learning_rate": 1.7777231028124662e-07, + "loss": 0.2098, + "step": 4214 + }, + { + "epoch": 1.121607237892496, + "grad_norm": 0.2498023957014084, + "learning_rate": 1.7776169296333409e-07, + "loss": 0.1741, + "step": 4215 + }, + { + "epoch": 1.1218733368813198, + "grad_norm": 0.3179378807544708, + "learning_rate": 1.7775107342750003e-07, + "loss": 0.1934, + "step": 4216 + }, + { + "epoch": 1.1221394358701438, + "grad_norm": 0.25584831833839417, + "learning_rate": 1.7774045167404738e-07, + "loss": 0.1853, + "step": 4217 + }, + { + "epoch": 1.1224055348589674, + "grad_norm": 0.3845981955528259, + "learning_rate": 1.7772982770327904e-07, + "loss": 0.1979, + "step": 4218 + }, + { + "epoch": 1.1226716338477913, + "grad_norm": 0.2647477090358734, + "learning_rate": 1.7771920151549808e-07, + "loss": 0.1899, + "step": 4219 + }, + { + "epoch": 1.1229377328366152, + "grad_norm": 0.32968494296073914, + "learning_rate": 1.7770857311100756e-07, + "loss": 0.1968, + "step": 4220 + }, + { + "epoch": 1.1232038318254391, + "grad_norm": 0.27944880723953247, + "learning_rate": 1.776979424901106e-07, + "loss": 0.2043, + "step": 4221 + }, + { + "epoch": 1.1234699308142628, + "grad_norm": 0.2829529643058777, + "learning_rate": 1.7768730965311048e-07, + "loss": 0.1995, + "step": 4222 + }, + { + "epoch": 1.1237360298030867, + "grad_norm": 0.2985123097896576, + "learning_rate": 1.776766746003104e-07, + "loss": 0.1926, + "step": 4223 + }, + { + "epoch": 1.1240021287919106, + "grad_norm": 0.28873202204704285, + "learning_rate": 1.7766603733201367e-07, + "loss": 0.1965, + "step": 4224 + }, + { + "epoch": 1.1242682277807345, + "grad_norm": 0.28863009810447693, + "learning_rate": 1.776553978485238e-07, + "loss": 0.2066, + "step": 4225 + }, + { + "epoch": 1.1245343267695582, + "grad_norm": 0.3378449082374573, + "learning_rate": 1.7764475615014415e-07, + "loss": 0.2088, + "step": 4226 + }, + { + "epoch": 1.1248004257583821, + "grad_norm": 0.449275940656662, + "learning_rate": 1.7763411223717825e-07, + "loss": 0.2242, + "step": 4227 + }, + { + "epoch": 1.125066524747206, + "grad_norm": 0.34827920794487, + "learning_rate": 1.7762346610992976e-07, + "loss": 0.1892, + "step": 4228 + }, + { + "epoch": 1.1253326237360297, + "grad_norm": 0.49800199270248413, + "learning_rate": 1.7761281776870224e-07, + "loss": 0.2093, + "step": 4229 + }, + { + "epoch": 1.1255987227248536, + "grad_norm": 0.26650992035865784, + "learning_rate": 1.7760216721379943e-07, + "loss": 0.1996, + "step": 4230 + }, + { + "epoch": 1.1258648217136775, + "grad_norm": 0.2458305060863495, + "learning_rate": 1.7759151444552515e-07, + "loss": 0.1873, + "step": 4231 + }, + { + "epoch": 1.1261309207025014, + "grad_norm": 0.31223779916763306, + "learning_rate": 1.7758085946418319e-07, + "loss": 0.1952, + "step": 4232 + }, + { + "epoch": 1.126397019691325, + "grad_norm": 0.3234296441078186, + "learning_rate": 1.7757020227007745e-07, + "loss": 0.2024, + "step": 4233 + }, + { + "epoch": 1.126663118680149, + "grad_norm": 0.24920503795146942, + "learning_rate": 1.7755954286351194e-07, + "loss": 0.1806, + "step": 4234 + }, + { + "epoch": 1.126929217668973, + "grad_norm": 0.32872626185417175, + "learning_rate": 1.7754888124479063e-07, + "loss": 0.2058, + "step": 4235 + }, + { + "epoch": 1.1271953166577968, + "grad_norm": 0.2631506025791168, + "learning_rate": 1.7753821741421767e-07, + "loss": 0.1956, + "step": 4236 + }, + { + "epoch": 1.1274614156466205, + "grad_norm": 0.2713625729084015, + "learning_rate": 1.7752755137209717e-07, + "loss": 0.1979, + "step": 4237 + }, + { + "epoch": 1.1277275146354444, + "grad_norm": 0.36577102541923523, + "learning_rate": 1.7751688311873333e-07, + "loss": 0.2332, + "step": 4238 + }, + { + "epoch": 1.1279936136242683, + "grad_norm": 0.2742888033390045, + "learning_rate": 1.7750621265443052e-07, + "loss": 0.2032, + "step": 4239 + }, + { + "epoch": 1.128259712613092, + "grad_norm": 0.4146541953086853, + "learning_rate": 1.7749553997949298e-07, + "loss": 0.1983, + "step": 4240 + }, + { + "epoch": 1.128525811601916, + "grad_norm": 0.27598896622657776, + "learning_rate": 1.774848650942252e-07, + "loss": 0.1821, + "step": 4241 + }, + { + "epoch": 1.1287919105907398, + "grad_norm": 0.25460338592529297, + "learning_rate": 1.7747418799893156e-07, + "loss": 0.1803, + "step": 4242 + }, + { + "epoch": 1.1290580095795635, + "grad_norm": 0.35645541548728943, + "learning_rate": 1.7746350869391668e-07, + "loss": 0.2014, + "step": 4243 + }, + { + "epoch": 1.1293241085683874, + "grad_norm": 0.29589641094207764, + "learning_rate": 1.7745282717948512e-07, + "loss": 0.2119, + "step": 4244 + }, + { + "epoch": 1.1295902075572113, + "grad_norm": 0.28279584646224976, + "learning_rate": 1.7744214345594153e-07, + "loss": 0.2069, + "step": 4245 + }, + { + "epoch": 1.1298563065460352, + "grad_norm": 0.2711944282054901, + "learning_rate": 1.7743145752359062e-07, + "loss": 0.189, + "step": 4246 + }, + { + "epoch": 1.1301224055348589, + "grad_norm": 0.277529239654541, + "learning_rate": 1.7742076938273723e-07, + "loss": 0.2129, + "step": 4247 + }, + { + "epoch": 1.1303885045236828, + "grad_norm": 0.2621673047542572, + "learning_rate": 1.7741007903368615e-07, + "loss": 0.1772, + "step": 4248 + }, + { + "epoch": 1.1306546035125067, + "grad_norm": 0.3450784683227539, + "learning_rate": 1.773993864767423e-07, + "loss": 0.1865, + "step": 4249 + }, + { + "epoch": 1.1309207025013306, + "grad_norm": 0.7094898819923401, + "learning_rate": 1.7738869171221068e-07, + "loss": 0.1893, + "step": 4250 + }, + { + "epoch": 1.1311868014901543, + "grad_norm": 0.4437689781188965, + "learning_rate": 1.773779947403963e-07, + "loss": 0.1885, + "step": 4251 + }, + { + "epoch": 1.1314529004789782, + "grad_norm": 0.26005738973617554, + "learning_rate": 1.7736729556160427e-07, + "loss": 0.191, + "step": 4252 + }, + { + "epoch": 1.131718999467802, + "grad_norm": 0.3459281325340271, + "learning_rate": 1.7735659417613976e-07, + "loss": 0.1859, + "step": 4253 + }, + { + "epoch": 1.1319850984566258, + "grad_norm": 0.26993027329444885, + "learning_rate": 1.77345890584308e-07, + "loss": 0.1921, + "step": 4254 + }, + { + "epoch": 1.1322511974454497, + "grad_norm": 0.2923433482646942, + "learning_rate": 1.7733518478641424e-07, + "loss": 0.2089, + "step": 4255 + }, + { + "epoch": 1.1325172964342736, + "grad_norm": 0.2821837365627289, + "learning_rate": 1.7732447678276389e-07, + "loss": 0.1971, + "step": 4256 + }, + { + "epoch": 1.1327833954230975, + "grad_norm": 0.3201492130756378, + "learning_rate": 1.773137665736623e-07, + "loss": 0.1979, + "step": 4257 + }, + { + "epoch": 1.1330494944119212, + "grad_norm": 0.3425295054912567, + "learning_rate": 1.77303054159415e-07, + "loss": 0.1998, + "step": 4258 + }, + { + "epoch": 1.133315593400745, + "grad_norm": 0.335182785987854, + "learning_rate": 1.772923395403275e-07, + "loss": 0.1887, + "step": 4259 + }, + { + "epoch": 1.133581692389569, + "grad_norm": 0.2738115191459656, + "learning_rate": 1.772816227167054e-07, + "loss": 0.1852, + "step": 4260 + }, + { + "epoch": 1.1338477913783929, + "grad_norm": 0.26693153381347656, + "learning_rate": 1.772709036888544e-07, + "loss": 0.1808, + "step": 4261 + }, + { + "epoch": 1.1341138903672165, + "grad_norm": 0.36483070254325867, + "learning_rate": 1.7726018245708017e-07, + "loss": 0.2196, + "step": 4262 + }, + { + "epoch": 1.1343799893560405, + "grad_norm": 0.29173630475997925, + "learning_rate": 1.7724945902168857e-07, + "loss": 0.2044, + "step": 4263 + }, + { + "epoch": 1.1346460883448644, + "grad_norm": 0.3882601261138916, + "learning_rate": 1.7723873338298542e-07, + "loss": 0.2371, + "step": 4264 + }, + { + "epoch": 1.134912187333688, + "grad_norm": 0.3863663971424103, + "learning_rate": 1.7722800554127663e-07, + "loss": 0.2035, + "step": 4265 + }, + { + "epoch": 1.135178286322512, + "grad_norm": 0.34640154242515564, + "learning_rate": 1.7721727549686816e-07, + "loss": 0.1929, + "step": 4266 + }, + { + "epoch": 1.1354443853113358, + "grad_norm": 0.3354243338108063, + "learning_rate": 1.7720654325006609e-07, + "loss": 0.2131, + "step": 4267 + }, + { + "epoch": 1.1357104843001598, + "grad_norm": 0.26535603404045105, + "learning_rate": 1.7719580880117655e-07, + "loss": 0.2062, + "step": 4268 + }, + { + "epoch": 1.1359765832889834, + "grad_norm": 0.26894956827163696, + "learning_rate": 1.7718507215050564e-07, + "loss": 0.212, + "step": 4269 + }, + { + "epoch": 1.1362426822778073, + "grad_norm": 0.37313124537467957, + "learning_rate": 1.7717433329835964e-07, + "loss": 0.204, + "step": 4270 + }, + { + "epoch": 1.1365087812666312, + "grad_norm": 0.27039632201194763, + "learning_rate": 1.7716359224504482e-07, + "loss": 0.1851, + "step": 4271 + }, + { + "epoch": 1.1367748802554551, + "grad_norm": 0.4344452917575836, + "learning_rate": 1.7715284899086756e-07, + "loss": 0.1757, + "step": 4272 + }, + { + "epoch": 1.1370409792442788, + "grad_norm": 0.3139859139919281, + "learning_rate": 1.7714210353613423e-07, + "loss": 0.2102, + "step": 4273 + }, + { + "epoch": 1.1373070782331027, + "grad_norm": 0.29045864939689636, + "learning_rate": 1.771313558811514e-07, + "loss": 0.2187, + "step": 4274 + }, + { + "epoch": 1.1375731772219266, + "grad_norm": 0.2907758951187134, + "learning_rate": 1.771206060262255e-07, + "loss": 0.2218, + "step": 4275 + }, + { + "epoch": 1.1378392762107503, + "grad_norm": 0.2827487885951996, + "learning_rate": 1.7710985397166326e-07, + "loss": 0.1902, + "step": 4276 + }, + { + "epoch": 1.1381053751995742, + "grad_norm": 0.24712468683719635, + "learning_rate": 1.7709909971777125e-07, + "loss": 0.1816, + "step": 4277 + }, + { + "epoch": 1.1383714741883981, + "grad_norm": 0.3502947986125946, + "learning_rate": 1.7708834326485627e-07, + "loss": 0.1877, + "step": 4278 + }, + { + "epoch": 1.1386375731772218, + "grad_norm": 0.25235772132873535, + "learning_rate": 1.7707758461322506e-07, + "loss": 0.1969, + "step": 4279 + }, + { + "epoch": 1.1389036721660457, + "grad_norm": 0.2539938688278198, + "learning_rate": 1.7706682376318455e-07, + "loss": 0.1837, + "step": 4280 + }, + { + "epoch": 1.1391697711548696, + "grad_norm": 0.2695600092411041, + "learning_rate": 1.7705606071504158e-07, + "loss": 0.1897, + "step": 4281 + }, + { + "epoch": 1.1394358701436935, + "grad_norm": 0.3825121819972992, + "learning_rate": 1.7704529546910317e-07, + "loss": 0.1984, + "step": 4282 + }, + { + "epoch": 1.1397019691325172, + "grad_norm": 0.23035778105258942, + "learning_rate": 1.7703452802567638e-07, + "loss": 0.1747, + "step": 4283 + }, + { + "epoch": 1.139968068121341, + "grad_norm": 0.27384790778160095, + "learning_rate": 1.770237583850683e-07, + "loss": 0.1948, + "step": 4284 + }, + { + "epoch": 1.140234167110165, + "grad_norm": 0.25178298354148865, + "learning_rate": 1.7701298654758613e-07, + "loss": 0.1787, + "step": 4285 + }, + { + "epoch": 1.140500266098989, + "grad_norm": 0.25915905833244324, + "learning_rate": 1.7700221251353708e-07, + "loss": 0.1833, + "step": 4286 + }, + { + "epoch": 1.1407663650878126, + "grad_norm": 0.4705076217651367, + "learning_rate": 1.7699143628322846e-07, + "loss": 0.2138, + "step": 4287 + }, + { + "epoch": 1.1410324640766365, + "grad_norm": 0.278073251247406, + "learning_rate": 1.769806578569676e-07, + "loss": 0.2042, + "step": 4288 + }, + { + "epoch": 1.1412985630654604, + "grad_norm": 0.280000776052475, + "learning_rate": 1.7696987723506197e-07, + "loss": 0.2004, + "step": 4289 + }, + { + "epoch": 1.141564662054284, + "grad_norm": 0.3227968215942383, + "learning_rate": 1.7695909441781903e-07, + "loss": 0.194, + "step": 4290 + }, + { + "epoch": 1.141830761043108, + "grad_norm": 0.40289589762687683, + "learning_rate": 1.7694830940554634e-07, + "loss": 0.1929, + "step": 4291 + }, + { + "epoch": 1.142096860031932, + "grad_norm": 0.26388636231422424, + "learning_rate": 1.7693752219855146e-07, + "loss": 0.1939, + "step": 4292 + }, + { + "epoch": 1.1423629590207558, + "grad_norm": 0.2483675330877304, + "learning_rate": 1.769267327971421e-07, + "loss": 0.1919, + "step": 4293 + }, + { + "epoch": 1.1426290580095795, + "grad_norm": 0.2613779604434967, + "learning_rate": 1.7691594120162607e-07, + "loss": 0.1961, + "step": 4294 + }, + { + "epoch": 1.1428951569984034, + "grad_norm": 0.3418949246406555, + "learning_rate": 1.76905147412311e-07, + "loss": 0.2111, + "step": 4295 + }, + { + "epoch": 1.1431612559872273, + "grad_norm": 0.25856053829193115, + "learning_rate": 1.7689435142950494e-07, + "loss": 0.1944, + "step": 4296 + }, + { + "epoch": 1.1434273549760512, + "grad_norm": 0.32250329852104187, + "learning_rate": 1.7688355325351564e-07, + "loss": 0.2159, + "step": 4297 + }, + { + "epoch": 1.1436934539648749, + "grad_norm": 0.3963697552680969, + "learning_rate": 1.7687275288465121e-07, + "loss": 0.1981, + "step": 4298 + }, + { + "epoch": 1.1439595529536988, + "grad_norm": 0.2428608536720276, + "learning_rate": 1.7686195032321964e-07, + "loss": 0.1824, + "step": 4299 + }, + { + "epoch": 1.1442256519425227, + "grad_norm": 0.2512792646884918, + "learning_rate": 1.768511455695291e-07, + "loss": 0.1899, + "step": 4300 + }, + { + "epoch": 1.1444917509313464, + "grad_norm": 0.2474978268146515, + "learning_rate": 1.7684033862388765e-07, + "loss": 0.1862, + "step": 4301 + }, + { + "epoch": 1.1447578499201703, + "grad_norm": 0.42562711238861084, + "learning_rate": 1.7682952948660363e-07, + "loss": 0.2255, + "step": 4302 + }, + { + "epoch": 1.1450239489089942, + "grad_norm": 0.35120150446891785, + "learning_rate": 1.768187181579853e-07, + "loss": 0.1947, + "step": 4303 + }, + { + "epoch": 1.145290047897818, + "grad_norm": 0.3075923025608063, + "learning_rate": 1.7680790463834102e-07, + "loss": 0.2093, + "step": 4304 + }, + { + "epoch": 1.1455561468866418, + "grad_norm": 0.2606615722179413, + "learning_rate": 1.7679708892797925e-07, + "loss": 0.2058, + "step": 4305 + }, + { + "epoch": 1.1458222458754657, + "grad_norm": 0.29781776666641235, + "learning_rate": 1.7678627102720843e-07, + "loss": 0.2147, + "step": 4306 + }, + { + "epoch": 1.1460883448642896, + "grad_norm": 0.3252119719982147, + "learning_rate": 1.767754509363371e-07, + "loss": 0.1814, + "step": 4307 + }, + { + "epoch": 1.1463544438531135, + "grad_norm": 0.33583369851112366, + "learning_rate": 1.767646286556739e-07, + "loss": 0.1942, + "step": 4308 + }, + { + "epoch": 1.1466205428419372, + "grad_norm": 0.34720364212989807, + "learning_rate": 1.767538041855275e-07, + "loss": 0.2093, + "step": 4309 + }, + { + "epoch": 1.146886641830761, + "grad_norm": 0.3216802477836609, + "learning_rate": 1.7674297752620665e-07, + "loss": 0.1905, + "step": 4310 + }, + { + "epoch": 1.147152740819585, + "grad_norm": 0.3006526827812195, + "learning_rate": 1.767321486780201e-07, + "loss": 0.1994, + "step": 4311 + }, + { + "epoch": 1.1474188398084086, + "grad_norm": 0.33405759930610657, + "learning_rate": 1.7672131764127678e-07, + "loss": 0.2156, + "step": 4312 + }, + { + "epoch": 1.1476849387972325, + "grad_norm": 0.29448366165161133, + "learning_rate": 1.7671048441628552e-07, + "loss": 0.1984, + "step": 4313 + }, + { + "epoch": 1.1479510377860564, + "grad_norm": 0.28404489159584045, + "learning_rate": 1.7669964900335543e-07, + "loss": 0.1958, + "step": 4314 + }, + { + "epoch": 1.1482171367748801, + "grad_norm": 0.36466559767723083, + "learning_rate": 1.7668881140279543e-07, + "loss": 0.2106, + "step": 4315 + }, + { + "epoch": 1.148483235763704, + "grad_norm": 0.27884578704833984, + "learning_rate": 1.7667797161491475e-07, + "loss": 0.2188, + "step": 4316 + }, + { + "epoch": 1.148749334752528, + "grad_norm": 0.2894324064254761, + "learning_rate": 1.7666712964002246e-07, + "loss": 0.2026, + "step": 4317 + }, + { + "epoch": 1.1490154337413518, + "grad_norm": 0.499979704618454, + "learning_rate": 1.7665628547842784e-07, + "loss": 0.2255, + "step": 4318 + }, + { + "epoch": 1.1492815327301757, + "grad_norm": 0.3041483759880066, + "learning_rate": 1.766454391304402e-07, + "loss": 0.1967, + "step": 4319 + }, + { + "epoch": 1.1495476317189994, + "grad_norm": 0.25159522891044617, + "learning_rate": 1.7663459059636886e-07, + "loss": 0.1771, + "step": 4320 + }, + { + "epoch": 1.1498137307078233, + "grad_norm": 0.26639509201049805, + "learning_rate": 1.766237398765233e-07, + "loss": 0.208, + "step": 4321 + }, + { + "epoch": 1.1500798296966472, + "grad_norm": 0.25909990072250366, + "learning_rate": 1.7661288697121296e-07, + "loss": 0.1887, + "step": 4322 + }, + { + "epoch": 1.150345928685471, + "grad_norm": 0.2627456486225128, + "learning_rate": 1.7660203188074741e-07, + "loss": 0.1909, + "step": 4323 + }, + { + "epoch": 1.1506120276742948, + "grad_norm": 0.29254573583602905, + "learning_rate": 1.7659117460543622e-07, + "loss": 0.199, + "step": 4324 + }, + { + "epoch": 1.1508781266631187, + "grad_norm": 0.28054842352867126, + "learning_rate": 1.765803151455891e-07, + "loss": 0.1984, + "step": 4325 + }, + { + "epoch": 1.1511442256519424, + "grad_norm": 0.27507853507995605, + "learning_rate": 1.7656945350151577e-07, + "loss": 0.2135, + "step": 4326 + }, + { + "epoch": 1.1514103246407663, + "grad_norm": 0.2893020510673523, + "learning_rate": 1.7655858967352603e-07, + "loss": 0.1962, + "step": 4327 + }, + { + "epoch": 1.1516764236295902, + "grad_norm": 0.24550259113311768, + "learning_rate": 1.7654772366192975e-07, + "loss": 0.1818, + "step": 4328 + }, + { + "epoch": 1.1519425226184141, + "grad_norm": 0.2844108045101166, + "learning_rate": 1.7653685546703683e-07, + "loss": 0.1724, + "step": 4329 + }, + { + "epoch": 1.1522086216072378, + "grad_norm": 0.37730276584625244, + "learning_rate": 1.7652598508915729e-07, + "loss": 0.1952, + "step": 4330 + }, + { + "epoch": 1.1524747205960617, + "grad_norm": 0.3790351152420044, + "learning_rate": 1.7651511252860113e-07, + "loss": 0.2021, + "step": 4331 + }, + { + "epoch": 1.1527408195848856, + "grad_norm": 0.28278425335884094, + "learning_rate": 1.7650423778567845e-07, + "loss": 0.1831, + "step": 4332 + }, + { + "epoch": 1.1530069185737095, + "grad_norm": 0.27183911204338074, + "learning_rate": 1.7649336086069945e-07, + "loss": 0.2051, + "step": 4333 + }, + { + "epoch": 1.1532730175625332, + "grad_norm": 0.27417120337486267, + "learning_rate": 1.764824817539744e-07, + "loss": 0.224, + "step": 4334 + }, + { + "epoch": 1.153539116551357, + "grad_norm": 0.35815122723579407, + "learning_rate": 1.7647160046581349e-07, + "loss": 0.219, + "step": 4335 + }, + { + "epoch": 1.153805215540181, + "grad_norm": 0.2776469886302948, + "learning_rate": 1.7646071699652718e-07, + "loss": 0.2044, + "step": 4336 + }, + { + "epoch": 1.1540713145290047, + "grad_norm": 0.2566561996936798, + "learning_rate": 1.7644983134642583e-07, + "loss": 0.21, + "step": 4337 + }, + { + "epoch": 1.1543374135178286, + "grad_norm": 0.32335594296455383, + "learning_rate": 1.7643894351581992e-07, + "loss": 0.1855, + "step": 4338 + }, + { + "epoch": 1.1546035125066525, + "grad_norm": 0.270759254693985, + "learning_rate": 1.7642805350502003e-07, + "loss": 0.1768, + "step": 4339 + }, + { + "epoch": 1.1548696114954764, + "grad_norm": 0.38258764147758484, + "learning_rate": 1.7641716131433672e-07, + "loss": 0.2411, + "step": 4340 + }, + { + "epoch": 1.1551357104843, + "grad_norm": 0.29564905166625977, + "learning_rate": 1.7640626694408073e-07, + "loss": 0.2128, + "step": 4341 + }, + { + "epoch": 1.155401809473124, + "grad_norm": 0.35309919714927673, + "learning_rate": 1.763953703945627e-07, + "loss": 0.1957, + "step": 4342 + }, + { + "epoch": 1.155667908461948, + "grad_norm": 0.32427486777305603, + "learning_rate": 1.7638447166609346e-07, + "loss": 0.2163, + "step": 4343 + }, + { + "epoch": 1.1559340074507718, + "grad_norm": 0.28175294399261475, + "learning_rate": 1.7637357075898392e-07, + "loss": 0.2038, + "step": 4344 + }, + { + "epoch": 1.1562001064395955, + "grad_norm": 0.2649657726287842, + "learning_rate": 1.763626676735449e-07, + "loss": 0.1939, + "step": 4345 + }, + { + "epoch": 1.1564662054284194, + "grad_norm": 0.3754815459251404, + "learning_rate": 1.7635176241008743e-07, + "loss": 0.185, + "step": 4346 + }, + { + "epoch": 1.1567323044172433, + "grad_norm": 0.2507289946079254, + "learning_rate": 1.7634085496892255e-07, + "loss": 0.1876, + "step": 4347 + }, + { + "epoch": 1.156998403406067, + "grad_norm": 0.27600711584091187, + "learning_rate": 1.7632994535036134e-07, + "loss": 0.2043, + "step": 4348 + }, + { + "epoch": 1.1572645023948909, + "grad_norm": 0.3119935393333435, + "learning_rate": 1.76319033554715e-07, + "loss": 0.1852, + "step": 4349 + }, + { + "epoch": 1.1575306013837148, + "grad_norm": 0.3211592435836792, + "learning_rate": 1.7630811958229473e-07, + "loss": 0.1846, + "step": 4350 + }, + { + "epoch": 1.1577967003725387, + "grad_norm": 0.2775719165802002, + "learning_rate": 1.7629720343341178e-07, + "loss": 0.2008, + "step": 4351 + }, + { + "epoch": 1.1580627993613624, + "grad_norm": 0.26033058762550354, + "learning_rate": 1.762862851083776e-07, + "loss": 0.182, + "step": 4352 + }, + { + "epoch": 1.1583288983501863, + "grad_norm": 0.32786062359809875, + "learning_rate": 1.7627536460750352e-07, + "loss": 0.2013, + "step": 4353 + }, + { + "epoch": 1.1585949973390102, + "grad_norm": 0.28268343210220337, + "learning_rate": 1.7626444193110105e-07, + "loss": 0.1946, + "step": 4354 + }, + { + "epoch": 1.158861096327834, + "grad_norm": 0.3067837953567505, + "learning_rate": 1.762535170794817e-07, + "loss": 0.2097, + "step": 4355 + }, + { + "epoch": 1.1591271953166578, + "grad_norm": 0.27650219202041626, + "learning_rate": 1.7624259005295713e-07, + "loss": 0.1877, + "step": 4356 + }, + { + "epoch": 1.1593932943054817, + "grad_norm": 0.2883758246898651, + "learning_rate": 1.762316608518389e-07, + "loss": 0.2043, + "step": 4357 + }, + { + "epoch": 1.1596593932943056, + "grad_norm": 0.28544238209724426, + "learning_rate": 1.7622072947643885e-07, + "loss": 0.2128, + "step": 4358 + }, + { + "epoch": 1.1599254922831292, + "grad_norm": 0.2784377932548523, + "learning_rate": 1.762097959270687e-07, + "loss": 0.2022, + "step": 4359 + }, + { + "epoch": 1.1601915912719531, + "grad_norm": 0.3458016812801361, + "learning_rate": 1.7619886020404028e-07, + "loss": 0.1924, + "step": 4360 + }, + { + "epoch": 1.160457690260777, + "grad_norm": 0.2673640251159668, + "learning_rate": 1.7618792230766552e-07, + "loss": 0.2015, + "step": 4361 + }, + { + "epoch": 1.1607237892496007, + "grad_norm": 0.2501892149448395, + "learning_rate": 1.7617698223825644e-07, + "loss": 0.1917, + "step": 4362 + }, + { + "epoch": 1.1609898882384246, + "grad_norm": 0.2988698184490204, + "learning_rate": 1.7616603999612499e-07, + "loss": 0.2016, + "step": 4363 + }, + { + "epoch": 1.1612559872272485, + "grad_norm": 0.2553655505180359, + "learning_rate": 1.761550955815833e-07, + "loss": 0.1935, + "step": 4364 + }, + { + "epoch": 1.1615220862160724, + "grad_norm": 0.3506869077682495, + "learning_rate": 1.7614414899494353e-07, + "loss": 0.2025, + "step": 4365 + }, + { + "epoch": 1.1617881852048961, + "grad_norm": 0.3031727373600006, + "learning_rate": 1.7613320023651793e-07, + "loss": 0.1839, + "step": 4366 + }, + { + "epoch": 1.16205428419372, + "grad_norm": 0.31000667810440063, + "learning_rate": 1.761222493066187e-07, + "loss": 0.1814, + "step": 4367 + }, + { + "epoch": 1.162320383182544, + "grad_norm": 0.3456270694732666, + "learning_rate": 1.7611129620555827e-07, + "loss": 0.2168, + "step": 4368 + }, + { + "epoch": 1.1625864821713678, + "grad_norm": 0.2679484784603119, + "learning_rate": 1.76100340933649e-07, + "loss": 0.1921, + "step": 4369 + }, + { + "epoch": 1.1628525811601915, + "grad_norm": 0.41829413175582886, + "learning_rate": 1.7608938349120335e-07, + "loss": 0.1915, + "step": 4370 + }, + { + "epoch": 1.1631186801490154, + "grad_norm": 0.6594303846359253, + "learning_rate": 1.7607842387853386e-07, + "loss": 0.2182, + "step": 4371 + }, + { + "epoch": 1.1633847791378393, + "grad_norm": 0.3831595480442047, + "learning_rate": 1.7606746209595316e-07, + "loss": 0.2113, + "step": 4372 + }, + { + "epoch": 1.163650878126663, + "grad_norm": 0.2555779814720154, + "learning_rate": 1.7605649814377382e-07, + "loss": 0.2141, + "step": 4373 + }, + { + "epoch": 1.163916977115487, + "grad_norm": 0.2599748969078064, + "learning_rate": 1.760455320223086e-07, + "loss": 0.1849, + "step": 4374 + }, + { + "epoch": 1.1641830761043108, + "grad_norm": 0.33774489164352417, + "learning_rate": 1.760345637318703e-07, + "loss": 0.2028, + "step": 4375 + }, + { + "epoch": 1.1644491750931347, + "grad_norm": 0.2803836464881897, + "learning_rate": 1.7602359327277174e-07, + "loss": 0.1992, + "step": 4376 + }, + { + "epoch": 1.1647152740819584, + "grad_norm": 0.25178828835487366, + "learning_rate": 1.760126206453258e-07, + "loss": 0.1945, + "step": 4377 + }, + { + "epoch": 1.1649813730707823, + "grad_norm": 0.3869815170764923, + "learning_rate": 1.7600164584984544e-07, + "loss": 0.2108, + "step": 4378 + }, + { + "epoch": 1.1652474720596062, + "grad_norm": 0.3501225411891937, + "learning_rate": 1.7599066888664372e-07, + "loss": 0.1976, + "step": 4379 + }, + { + "epoch": 1.1655135710484301, + "grad_norm": 0.28599855303764343, + "learning_rate": 1.7597968975603368e-07, + "loss": 0.2167, + "step": 4380 + }, + { + "epoch": 1.1657796700372538, + "grad_norm": 0.29562583565711975, + "learning_rate": 1.7596870845832847e-07, + "loss": 0.213, + "step": 4381 + }, + { + "epoch": 1.1660457690260777, + "grad_norm": 0.48402687907218933, + "learning_rate": 1.7595772499384136e-07, + "loss": 0.2129, + "step": 4382 + }, + { + "epoch": 1.1663118680149016, + "grad_norm": 0.4063764810562134, + "learning_rate": 1.7594673936288558e-07, + "loss": 0.2279, + "step": 4383 + }, + { + "epoch": 1.1665779670037253, + "grad_norm": 0.3640184700489044, + "learning_rate": 1.7593575156577446e-07, + "loss": 0.2064, + "step": 4384 + }, + { + "epoch": 1.1668440659925492, + "grad_norm": 0.2834213674068451, + "learning_rate": 1.759247616028214e-07, + "loss": 0.2008, + "step": 4385 + }, + { + "epoch": 1.167110164981373, + "grad_norm": 0.2570473849773407, + "learning_rate": 1.7591376947433983e-07, + "loss": 0.202, + "step": 4386 + }, + { + "epoch": 1.167376263970197, + "grad_norm": 0.2633996903896332, + "learning_rate": 1.7590277518064326e-07, + "loss": 0.1889, + "step": 4387 + }, + { + "epoch": 1.1676423629590207, + "grad_norm": 0.3682323098182678, + "learning_rate": 1.7589177872204533e-07, + "loss": 0.2154, + "step": 4388 + }, + { + "epoch": 1.1679084619478446, + "grad_norm": 0.4306809604167938, + "learning_rate": 1.7588078009885968e-07, + "loss": 0.2075, + "step": 4389 + }, + { + "epoch": 1.1681745609366685, + "grad_norm": 0.25200238823890686, + "learning_rate": 1.7586977931139998e-07, + "loss": 0.1946, + "step": 4390 + }, + { + "epoch": 1.1684406599254924, + "grad_norm": 0.28653907775878906, + "learning_rate": 1.7585877635998e-07, + "loss": 0.2197, + "step": 4391 + }, + { + "epoch": 1.168706758914316, + "grad_norm": 0.3726713955402374, + "learning_rate": 1.7584777124491353e-07, + "loss": 0.211, + "step": 4392 + }, + { + "epoch": 1.16897285790314, + "grad_norm": 0.25389379262924194, + "learning_rate": 1.7583676396651452e-07, + "loss": 0.1721, + "step": 4393 + }, + { + "epoch": 1.1692389568919639, + "grad_norm": 0.3647293150424957, + "learning_rate": 1.758257545250969e-07, + "loss": 0.1999, + "step": 4394 + }, + { + "epoch": 1.1695050558807876, + "grad_norm": 0.3821409046649933, + "learning_rate": 1.7581474292097467e-07, + "loss": 0.1921, + "step": 4395 + }, + { + "epoch": 1.1697711548696115, + "grad_norm": 0.26780569553375244, + "learning_rate": 1.7580372915446188e-07, + "loss": 0.1977, + "step": 4396 + }, + { + "epoch": 1.1700372538584354, + "grad_norm": 0.3549160957336426, + "learning_rate": 1.7579271322587273e-07, + "loss": 0.1918, + "step": 4397 + }, + { + "epoch": 1.170303352847259, + "grad_norm": 0.37346503138542175, + "learning_rate": 1.7578169513552136e-07, + "loss": 0.1981, + "step": 4398 + }, + { + "epoch": 1.170569451836083, + "grad_norm": 0.36010101437568665, + "learning_rate": 1.7577067488372207e-07, + "loss": 0.2104, + "step": 4399 + }, + { + "epoch": 1.1708355508249069, + "grad_norm": 0.2608146667480469, + "learning_rate": 1.7575965247078913e-07, + "loss": 0.1911, + "step": 4400 + }, + { + "epoch": 1.1711016498137308, + "grad_norm": 0.24685895442962646, + "learning_rate": 1.75748627897037e-07, + "loss": 0.1819, + "step": 4401 + }, + { + "epoch": 1.1713677488025545, + "grad_norm": 0.46477264165878296, + "learning_rate": 1.7573760116278003e-07, + "loss": 0.1942, + "step": 4402 + }, + { + "epoch": 1.1716338477913784, + "grad_norm": 0.3110126256942749, + "learning_rate": 1.7572657226833276e-07, + "loss": 0.1932, + "step": 4403 + }, + { + "epoch": 1.1718999467802023, + "grad_norm": 0.2850252389907837, + "learning_rate": 1.757155412140098e-07, + "loss": 0.1907, + "step": 4404 + }, + { + "epoch": 1.1721660457690262, + "grad_norm": 0.26214319467544556, + "learning_rate": 1.7570450800012572e-07, + "loss": 0.1974, + "step": 4405 + }, + { + "epoch": 1.1724321447578498, + "grad_norm": 0.2674776613712311, + "learning_rate": 1.7569347262699523e-07, + "loss": 0.1682, + "step": 4406 + }, + { + "epoch": 1.1726982437466738, + "grad_norm": 0.34326356649398804, + "learning_rate": 1.756824350949331e-07, + "loss": 0.2189, + "step": 4407 + }, + { + "epoch": 1.1729643427354977, + "grad_norm": 0.2747967541217804, + "learning_rate": 1.7567139540425414e-07, + "loss": 0.168, + "step": 4408 + }, + { + "epoch": 1.1732304417243213, + "grad_norm": 0.25360211730003357, + "learning_rate": 1.7566035355527317e-07, + "loss": 0.1818, + "step": 4409 + }, + { + "epoch": 1.1734965407131452, + "grad_norm": 0.3518718183040619, + "learning_rate": 1.756493095483052e-07, + "loss": 0.2254, + "step": 4410 + }, + { + "epoch": 1.1737626397019691, + "grad_norm": 0.3583095073699951, + "learning_rate": 1.756382633836652e-07, + "loss": 0.2067, + "step": 4411 + }, + { + "epoch": 1.174028738690793, + "grad_norm": 0.2463936060667038, + "learning_rate": 1.7562721506166823e-07, + "loss": 0.1764, + "step": 4412 + }, + { + "epoch": 1.1742948376796167, + "grad_norm": 0.29680925607681274, + "learning_rate": 1.7561616458262935e-07, + "loss": 0.1911, + "step": 4413 + }, + { + "epoch": 1.1745609366684406, + "grad_norm": 0.2509850263595581, + "learning_rate": 1.7560511194686385e-07, + "loss": 0.2006, + "step": 4414 + }, + { + "epoch": 1.1748270356572645, + "grad_norm": 0.2610057592391968, + "learning_rate": 1.7559405715468688e-07, + "loss": 0.2013, + "step": 4415 + }, + { + "epoch": 1.1750931346460884, + "grad_norm": 0.3318222165107727, + "learning_rate": 1.7558300020641382e-07, + "loss": 0.2197, + "step": 4416 + }, + { + "epoch": 1.1753592336349121, + "grad_norm": 0.27718299627304077, + "learning_rate": 1.7557194110236e-07, + "loss": 0.2104, + "step": 4417 + }, + { + "epoch": 1.175625332623736, + "grad_norm": 0.30106043815612793, + "learning_rate": 1.7556087984284083e-07, + "loss": 0.2068, + "step": 4418 + }, + { + "epoch": 1.17589143161256, + "grad_norm": 0.2672925293445587, + "learning_rate": 1.7554981642817182e-07, + "loss": 0.1886, + "step": 4419 + }, + { + "epoch": 1.1761575306013836, + "grad_norm": 0.3471897542476654, + "learning_rate": 1.7553875085866855e-07, + "loss": 0.189, + "step": 4420 + }, + { + "epoch": 1.1764236295902075, + "grad_norm": 0.26506471633911133, + "learning_rate": 1.7552768313464658e-07, + "loss": 0.2022, + "step": 4421 + }, + { + "epoch": 1.1766897285790314, + "grad_norm": 0.26556503772735596, + "learning_rate": 1.755166132564216e-07, + "loss": 0.1728, + "step": 4422 + }, + { + "epoch": 1.1769558275678553, + "grad_norm": 0.3552241027355194, + "learning_rate": 1.7550554122430937e-07, + "loss": 0.1916, + "step": 4423 + }, + { + "epoch": 1.177221926556679, + "grad_norm": 0.28557583689689636, + "learning_rate": 1.7549446703862565e-07, + "loss": 0.1931, + "step": 4424 + }, + { + "epoch": 1.177488025545503, + "grad_norm": 0.3149477243423462, + "learning_rate": 1.7548339069968635e-07, + "loss": 0.1954, + "step": 4425 + }, + { + "epoch": 1.1777541245343268, + "grad_norm": 0.5177450180053711, + "learning_rate": 1.7547231220780732e-07, + "loss": 0.1887, + "step": 4426 + }, + { + "epoch": 1.1780202235231507, + "grad_norm": 0.3706255853176117, + "learning_rate": 1.754612315633046e-07, + "loss": 0.2017, + "step": 4427 + }, + { + "epoch": 1.1782863225119744, + "grad_norm": 0.30667123198509216, + "learning_rate": 1.754501487664942e-07, + "loss": 0.2129, + "step": 4428 + }, + { + "epoch": 1.1785524215007983, + "grad_norm": 0.333488404750824, + "learning_rate": 1.7543906381769226e-07, + "loss": 0.1972, + "step": 4429 + }, + { + "epoch": 1.1788185204896222, + "grad_norm": 0.2831836938858032, + "learning_rate": 1.754279767172149e-07, + "loss": 0.2198, + "step": 4430 + }, + { + "epoch": 1.179084619478446, + "grad_norm": 0.29213932156562805, + "learning_rate": 1.7541688746537838e-07, + "loss": 0.2018, + "step": 4431 + }, + { + "epoch": 1.1793507184672698, + "grad_norm": 0.2811020314693451, + "learning_rate": 1.7540579606249896e-07, + "loss": 0.1931, + "step": 4432 + }, + { + "epoch": 1.1796168174560937, + "grad_norm": 0.4045526087284088, + "learning_rate": 1.7539470250889302e-07, + "loss": 0.2174, + "step": 4433 + }, + { + "epoch": 1.1798829164449174, + "grad_norm": 0.34051668643951416, + "learning_rate": 1.7538360680487694e-07, + "loss": 0.2069, + "step": 4434 + }, + { + "epoch": 1.1801490154337413, + "grad_norm": 0.2517695426940918, + "learning_rate": 1.753725089507672e-07, + "loss": 0.1955, + "step": 4435 + }, + { + "epoch": 1.1804151144225652, + "grad_norm": 0.28683778643608093, + "learning_rate": 1.7536140894688037e-07, + "loss": 0.1911, + "step": 4436 + }, + { + "epoch": 1.180681213411389, + "grad_norm": 0.3783280551433563, + "learning_rate": 1.7535030679353299e-07, + "loss": 0.2014, + "step": 4437 + }, + { + "epoch": 1.180947312400213, + "grad_norm": 0.28665226697921753, + "learning_rate": 1.7533920249104175e-07, + "loss": 0.2009, + "step": 4438 + }, + { + "epoch": 1.1812134113890367, + "grad_norm": 0.2966429591178894, + "learning_rate": 1.7532809603972335e-07, + "loss": 0.1911, + "step": 4439 + }, + { + "epoch": 1.1814795103778606, + "grad_norm": 0.28948330879211426, + "learning_rate": 1.7531698743989458e-07, + "loss": 0.2134, + "step": 4440 + }, + { + "epoch": 1.1817456093666845, + "grad_norm": 0.3676631450653076, + "learning_rate": 1.7530587669187228e-07, + "loss": 0.2087, + "step": 4441 + }, + { + "epoch": 1.1820117083555082, + "grad_norm": 0.28661006689071655, + "learning_rate": 1.7529476379597337e-07, + "loss": 0.2108, + "step": 4442 + }, + { + "epoch": 1.182277807344332, + "grad_norm": 0.45445865392684937, + "learning_rate": 1.7528364875251475e-07, + "loss": 0.2018, + "step": 4443 + }, + { + "epoch": 1.182543906333156, + "grad_norm": 0.2669259309768677, + "learning_rate": 1.7527253156181353e-07, + "loss": 0.1903, + "step": 4444 + }, + { + "epoch": 1.1828100053219797, + "grad_norm": 0.2827967405319214, + "learning_rate": 1.752614122241867e-07, + "loss": 0.2125, + "step": 4445 + }, + { + "epoch": 1.1830761043108036, + "grad_norm": 0.2769511044025421, + "learning_rate": 1.7525029073995147e-07, + "loss": 0.2006, + "step": 4446 + }, + { + "epoch": 1.1833422032996275, + "grad_norm": 0.27335020899772644, + "learning_rate": 1.7523916710942506e-07, + "loss": 0.2066, + "step": 4447 + }, + { + "epoch": 1.1836083022884514, + "grad_norm": 0.2913345992565155, + "learning_rate": 1.7522804133292471e-07, + "loss": 0.1936, + "step": 4448 + }, + { + "epoch": 1.183874401277275, + "grad_norm": 0.2790926992893219, + "learning_rate": 1.7521691341076773e-07, + "loss": 0.2192, + "step": 4449 + }, + { + "epoch": 1.184140500266099, + "grad_norm": 0.2795494496822357, + "learning_rate": 1.7520578334327153e-07, + "loss": 0.2152, + "step": 4450 + }, + { + "epoch": 1.1844065992549229, + "grad_norm": 0.3797011077404022, + "learning_rate": 1.7519465113075354e-07, + "loss": 0.2023, + "step": 4451 + }, + { + "epoch": 1.1846726982437468, + "grad_norm": 0.27808770537376404, + "learning_rate": 1.7518351677353133e-07, + "loss": 0.1961, + "step": 4452 + }, + { + "epoch": 1.1849387972325705, + "grad_norm": 0.28257590532302856, + "learning_rate": 1.7517238027192244e-07, + "loss": 0.2003, + "step": 4453 + }, + { + "epoch": 1.1852048962213944, + "grad_norm": 0.4181922674179077, + "learning_rate": 1.751612416262445e-07, + "loss": 0.2015, + "step": 4454 + }, + { + "epoch": 1.1854709952102183, + "grad_norm": 0.30579736828804016, + "learning_rate": 1.751501008368152e-07, + "loss": 0.2048, + "step": 4455 + }, + { + "epoch": 1.185737094199042, + "grad_norm": 0.2587042450904846, + "learning_rate": 1.7513895790395234e-07, + "loss": 0.1776, + "step": 4456 + }, + { + "epoch": 1.1860031931878658, + "grad_norm": 0.35883715748786926, + "learning_rate": 1.751278128279737e-07, + "loss": 0.1909, + "step": 4457 + }, + { + "epoch": 1.1862692921766897, + "grad_norm": 0.4256885349750519, + "learning_rate": 1.7511666560919718e-07, + "loss": 0.2402, + "step": 4458 + }, + { + "epoch": 1.1865353911655137, + "grad_norm": 0.2713233232498169, + "learning_rate": 1.7510551624794068e-07, + "loss": 0.2027, + "step": 4459 + }, + { + "epoch": 1.1868014901543373, + "grad_norm": 0.26749858260154724, + "learning_rate": 1.7509436474452226e-07, + "loss": 0.178, + "step": 4460 + }, + { + "epoch": 1.1870675891431612, + "grad_norm": 0.2728916108608246, + "learning_rate": 1.7508321109925993e-07, + "loss": 0.1943, + "step": 4461 + }, + { + "epoch": 1.1873336881319851, + "grad_norm": 0.2373960167169571, + "learning_rate": 1.750720553124719e-07, + "loss": 0.1967, + "step": 4462 + }, + { + "epoch": 1.187599787120809, + "grad_norm": 0.5184907913208008, + "learning_rate": 1.7506089738447628e-07, + "loss": 0.1792, + "step": 4463 + }, + { + "epoch": 1.1878658861096327, + "grad_norm": 0.3836194574832916, + "learning_rate": 1.750497373155913e-07, + "loss": 0.1968, + "step": 4464 + }, + { + "epoch": 1.1881319850984566, + "grad_norm": 0.3436713218688965, + "learning_rate": 1.7503857510613532e-07, + "loss": 0.1993, + "step": 4465 + }, + { + "epoch": 1.1883980840872805, + "grad_norm": 0.2440103441476822, + "learning_rate": 1.750274107564267e-07, + "loss": 0.1725, + "step": 4466 + }, + { + "epoch": 1.1886641830761042, + "grad_norm": 0.27169185876846313, + "learning_rate": 1.7501624426678387e-07, + "loss": 0.1978, + "step": 4467 + }, + { + "epoch": 1.1889302820649281, + "grad_norm": 0.2637210190296173, + "learning_rate": 1.7500507563752528e-07, + "loss": 0.2072, + "step": 4468 + }, + { + "epoch": 1.189196381053752, + "grad_norm": 0.31678929924964905, + "learning_rate": 1.7499390486896957e-07, + "loss": 0.2003, + "step": 4469 + }, + { + "epoch": 1.189462480042576, + "grad_norm": 0.24973823130130768, + "learning_rate": 1.7498273196143526e-07, + "loss": 0.1914, + "step": 4470 + }, + { + "epoch": 1.1897285790313996, + "grad_norm": 0.36784815788269043, + "learning_rate": 1.7497155691524106e-07, + "loss": 0.2196, + "step": 4471 + }, + { + "epoch": 1.1899946780202235, + "grad_norm": 0.30989155173301697, + "learning_rate": 1.7496037973070573e-07, + "loss": 0.203, + "step": 4472 + }, + { + "epoch": 1.1902607770090474, + "grad_norm": 0.2653443515300751, + "learning_rate": 1.74949200408148e-07, + "loss": 0.1804, + "step": 4473 + }, + { + "epoch": 1.1905268759978713, + "grad_norm": 0.27981117367744446, + "learning_rate": 1.7493801894788682e-07, + "loss": 0.1905, + "step": 4474 + }, + { + "epoch": 1.190792974986695, + "grad_norm": 0.8701534867286682, + "learning_rate": 1.7492683535024106e-07, + "loss": 0.2016, + "step": 4475 + }, + { + "epoch": 1.191059073975519, + "grad_norm": 0.33657974004745483, + "learning_rate": 1.7491564961552966e-07, + "loss": 0.1948, + "step": 4476 + }, + { + "epoch": 1.1913251729643428, + "grad_norm": 0.3829604983329773, + "learning_rate": 1.7490446174407172e-07, + "loss": 0.1993, + "step": 4477 + }, + { + "epoch": 1.1915912719531665, + "grad_norm": 0.27994319796562195, + "learning_rate": 1.748932717361863e-07, + "loss": 0.1754, + "step": 4478 + }, + { + "epoch": 1.1918573709419904, + "grad_norm": 0.28496095538139343, + "learning_rate": 1.748820795921926e-07, + "loss": 0.1927, + "step": 4479 + }, + { + "epoch": 1.1921234699308143, + "grad_norm": 0.2720790505409241, + "learning_rate": 1.7487088531240982e-07, + "loss": 0.2008, + "step": 4480 + }, + { + "epoch": 1.192389568919638, + "grad_norm": 0.3103451132774353, + "learning_rate": 1.7485968889715723e-07, + "loss": 0.1815, + "step": 4481 + }, + { + "epoch": 1.192655667908462, + "grad_norm": 0.25638502836227417, + "learning_rate": 1.748484903467542e-07, + "loss": 0.1873, + "step": 4482 + }, + { + "epoch": 1.1929217668972858, + "grad_norm": 0.24921470880508423, + "learning_rate": 1.7483728966152011e-07, + "loss": 0.184, + "step": 4483 + }, + { + "epoch": 1.1931878658861097, + "grad_norm": 0.2499820441007614, + "learning_rate": 1.7482608684177444e-07, + "loss": 0.201, + "step": 4484 + }, + { + "epoch": 1.1934539648749334, + "grad_norm": 0.24301311373710632, + "learning_rate": 1.7481488188783676e-07, + "loss": 0.1687, + "step": 4485 + }, + { + "epoch": 1.1937200638637573, + "grad_norm": 0.2933514714241028, + "learning_rate": 1.748036748000266e-07, + "loss": 0.2179, + "step": 4486 + }, + { + "epoch": 1.1939861628525812, + "grad_norm": 0.3490231931209564, + "learning_rate": 1.7479246557866362e-07, + "loss": 0.1946, + "step": 4487 + }, + { + "epoch": 1.194252261841405, + "grad_norm": 0.3738548457622528, + "learning_rate": 1.747812542240675e-07, + "loss": 0.1874, + "step": 4488 + }, + { + "epoch": 1.1945183608302288, + "grad_norm": 0.2584708333015442, + "learning_rate": 1.7477004073655813e-07, + "loss": 0.1817, + "step": 4489 + }, + { + "epoch": 1.1947844598190527, + "grad_norm": 0.28954747319221497, + "learning_rate": 1.7475882511645518e-07, + "loss": 0.206, + "step": 4490 + }, + { + "epoch": 1.1950505588078766, + "grad_norm": 0.2597924470901489, + "learning_rate": 1.7474760736407866e-07, + "loss": 0.1917, + "step": 4491 + }, + { + "epoch": 1.1953166577967003, + "grad_norm": 0.395788311958313, + "learning_rate": 1.747363874797485e-07, + "loss": 0.2052, + "step": 4492 + }, + { + "epoch": 1.1955827567855242, + "grad_norm": 0.3735829293727875, + "learning_rate": 1.7472516546378465e-07, + "loss": 0.2153, + "step": 4493 + }, + { + "epoch": 1.195848855774348, + "grad_norm": 0.7558378577232361, + "learning_rate": 1.7471394131650726e-07, + "loss": 0.2443, + "step": 4494 + }, + { + "epoch": 1.196114954763172, + "grad_norm": 0.2642480134963989, + "learning_rate": 1.7470271503823644e-07, + "loss": 0.2008, + "step": 4495 + }, + { + "epoch": 1.1963810537519957, + "grad_norm": 0.2881333827972412, + "learning_rate": 1.7469148662929236e-07, + "loss": 0.2134, + "step": 4496 + }, + { + "epoch": 1.1966471527408196, + "grad_norm": 0.28677234053611755, + "learning_rate": 1.7468025608999532e-07, + "loss": 0.2125, + "step": 4497 + }, + { + "epoch": 1.1969132517296435, + "grad_norm": 0.2544083595275879, + "learning_rate": 1.7466902342066562e-07, + "loss": 0.1975, + "step": 4498 + }, + { + "epoch": 1.1971793507184674, + "grad_norm": 0.2676467001438141, + "learning_rate": 1.7465778862162364e-07, + "loss": 0.1829, + "step": 4499 + }, + { + "epoch": 1.197445449707291, + "grad_norm": 0.3365795910358429, + "learning_rate": 1.746465516931898e-07, + "loss": 0.1977, + "step": 4500 + }, + { + "epoch": 1.197711548696115, + "grad_norm": 0.49269160628318787, + "learning_rate": 1.7463531263568462e-07, + "loss": 0.2081, + "step": 4501 + }, + { + "epoch": 1.1979776476849389, + "grad_norm": 0.2669275999069214, + "learning_rate": 1.7462407144942868e-07, + "loss": 0.2051, + "step": 4502 + }, + { + "epoch": 1.1982437466737625, + "grad_norm": 0.28807389736175537, + "learning_rate": 1.7461282813474255e-07, + "loss": 0.2093, + "step": 4503 + }, + { + "epoch": 1.1985098456625864, + "grad_norm": 0.2686583995819092, + "learning_rate": 1.7460158269194697e-07, + "loss": 0.1878, + "step": 4504 + }, + { + "epoch": 1.1987759446514104, + "grad_norm": 0.27890509366989136, + "learning_rate": 1.7459033512136264e-07, + "loss": 0.2045, + "step": 4505 + }, + { + "epoch": 1.1990420436402343, + "grad_norm": 0.2818925976753235, + "learning_rate": 1.7457908542331036e-07, + "loss": 0.194, + "step": 4506 + }, + { + "epoch": 1.199308142629058, + "grad_norm": 0.2731863856315613, + "learning_rate": 1.7456783359811102e-07, + "loss": 0.197, + "step": 4507 + }, + { + "epoch": 1.1995742416178818, + "grad_norm": 0.33918601274490356, + "learning_rate": 1.7455657964608555e-07, + "loss": 0.2073, + "step": 4508 + }, + { + "epoch": 1.1998403406067057, + "grad_norm": 0.26687318086624146, + "learning_rate": 1.7454532356755492e-07, + "loss": 0.1965, + "step": 4509 + }, + { + "epoch": 1.2001064395955297, + "grad_norm": 0.25284141302108765, + "learning_rate": 1.7453406536284019e-07, + "loss": 0.1963, + "step": 4510 + }, + { + "epoch": 1.2003725385843533, + "grad_norm": 0.25348004698753357, + "learning_rate": 1.7452280503226241e-07, + "loss": 0.192, + "step": 4511 + }, + { + "epoch": 1.2006386375731772, + "grad_norm": 0.34957215189933777, + "learning_rate": 1.7451154257614284e-07, + "loss": 0.1967, + "step": 4512 + }, + { + "epoch": 1.2009047365620011, + "grad_norm": 0.9511183500289917, + "learning_rate": 1.7450027799480264e-07, + "loss": 0.1978, + "step": 4513 + }, + { + "epoch": 1.2011708355508248, + "grad_norm": 0.27859026193618774, + "learning_rate": 1.7448901128856312e-07, + "loss": 0.2, + "step": 4514 + }, + { + "epoch": 1.2014369345396487, + "grad_norm": 0.281772255897522, + "learning_rate": 1.7447774245774566e-07, + "loss": 0.2152, + "step": 4515 + }, + { + "epoch": 1.2017030335284726, + "grad_norm": 0.2693953514099121, + "learning_rate": 1.744664715026716e-07, + "loss": 0.2049, + "step": 4516 + }, + { + "epoch": 1.2019691325172963, + "grad_norm": 0.35333728790283203, + "learning_rate": 1.7445519842366244e-07, + "loss": 0.1975, + "step": 4517 + }, + { + "epoch": 1.2022352315061202, + "grad_norm": 0.27377739548683167, + "learning_rate": 1.7444392322103977e-07, + "loss": 0.22, + "step": 4518 + }, + { + "epoch": 1.2025013304949441, + "grad_norm": 0.27146872878074646, + "learning_rate": 1.744326458951251e-07, + "loss": 0.2052, + "step": 4519 + }, + { + "epoch": 1.202767429483768, + "grad_norm": 0.28487497568130493, + "learning_rate": 1.7442136644624012e-07, + "loss": 0.1962, + "step": 4520 + }, + { + "epoch": 1.203033528472592, + "grad_norm": 0.2638050615787506, + "learning_rate": 1.7441008487470654e-07, + "loss": 0.1805, + "step": 4521 + }, + { + "epoch": 1.2032996274614156, + "grad_norm": 0.26915597915649414, + "learning_rate": 1.743988011808461e-07, + "loss": 0.2056, + "step": 4522 + }, + { + "epoch": 1.2035657264502395, + "grad_norm": 0.31710439920425415, + "learning_rate": 1.743875153649807e-07, + "loss": 0.2074, + "step": 4523 + }, + { + "epoch": 1.2038318254390634, + "grad_norm": 0.27339741587638855, + "learning_rate": 1.743762274274322e-07, + "loss": 0.197, + "step": 4524 + }, + { + "epoch": 1.204097924427887, + "grad_norm": 0.2510736286640167, + "learning_rate": 1.7436493736852254e-07, + "loss": 0.1796, + "step": 4525 + }, + { + "epoch": 1.204364023416711, + "grad_norm": 0.25148990750312805, + "learning_rate": 1.743536451885738e-07, + "loss": 0.1672, + "step": 4526 + }, + { + "epoch": 1.204630122405535, + "grad_norm": 0.3694445788860321, + "learning_rate": 1.743423508879079e-07, + "loss": 0.1996, + "step": 4527 + }, + { + "epoch": 1.2048962213943586, + "grad_norm": 0.2830895483493805, + "learning_rate": 1.7433105446684717e-07, + "loss": 0.1896, + "step": 4528 + }, + { + "epoch": 1.2051623203831825, + "grad_norm": 0.38824397325515747, + "learning_rate": 1.7431975592571368e-07, + "loss": 0.1978, + "step": 4529 + }, + { + "epoch": 1.2054284193720064, + "grad_norm": 0.27666476368904114, + "learning_rate": 1.7430845526482976e-07, + "loss": 0.1995, + "step": 4530 + }, + { + "epoch": 1.2056945183608303, + "grad_norm": 0.2850768268108368, + "learning_rate": 1.7429715248451765e-07, + "loss": 0.1825, + "step": 4531 + }, + { + "epoch": 1.205960617349654, + "grad_norm": 0.3552098274230957, + "learning_rate": 1.7428584758509982e-07, + "loss": 0.211, + "step": 4532 + }, + { + "epoch": 1.2062267163384779, + "grad_norm": 0.30316001176834106, + "learning_rate": 1.7427454056689863e-07, + "loss": 0.2048, + "step": 4533 + }, + { + "epoch": 1.2064928153273018, + "grad_norm": 0.2739748954772949, + "learning_rate": 1.7426323143023662e-07, + "loss": 0.1965, + "step": 4534 + }, + { + "epoch": 1.2067589143161257, + "grad_norm": 0.2627122402191162, + "learning_rate": 1.7425192017543632e-07, + "loss": 0.1831, + "step": 4535 + }, + { + "epoch": 1.2070250133049494, + "grad_norm": 0.2408248782157898, + "learning_rate": 1.7424060680282037e-07, + "loss": 0.1658, + "step": 4536 + }, + { + "epoch": 1.2072911122937733, + "grad_norm": 0.2770346999168396, + "learning_rate": 1.7422929131271148e-07, + "loss": 0.199, + "step": 4537 + }, + { + "epoch": 1.2075572112825972, + "grad_norm": 0.33939647674560547, + "learning_rate": 1.7421797370543235e-07, + "loss": 0.1971, + "step": 4538 + }, + { + "epoch": 1.2078233102714209, + "grad_norm": 0.42364683747291565, + "learning_rate": 1.7420665398130578e-07, + "loss": 0.193, + "step": 4539 + }, + { + "epoch": 1.2080894092602448, + "grad_norm": 0.26491308212280273, + "learning_rate": 1.7419533214065466e-07, + "loss": 0.1956, + "step": 4540 + }, + { + "epoch": 1.2083555082490687, + "grad_norm": 0.2899610102176666, + "learning_rate": 1.7418400818380188e-07, + "loss": 0.1739, + "step": 4541 + }, + { + "epoch": 1.2086216072378926, + "grad_norm": 0.27789995074272156, + "learning_rate": 1.7417268211107044e-07, + "loss": 0.1945, + "step": 4542 + }, + { + "epoch": 1.2088877062267163, + "grad_norm": 0.3400980234146118, + "learning_rate": 1.7416135392278337e-07, + "loss": 0.1985, + "step": 4543 + }, + { + "epoch": 1.2091538052155402, + "grad_norm": 0.2858947813510895, + "learning_rate": 1.7415002361926381e-07, + "loss": 0.1904, + "step": 4544 + }, + { + "epoch": 1.209419904204364, + "grad_norm": 0.6115896105766296, + "learning_rate": 1.7413869120083486e-07, + "loss": 0.2458, + "step": 4545 + }, + { + "epoch": 1.209686003193188, + "grad_norm": 0.27227795124053955, + "learning_rate": 1.741273566678198e-07, + "loss": 0.1968, + "step": 4546 + }, + { + "epoch": 1.2099521021820117, + "grad_norm": 0.3040112555027008, + "learning_rate": 1.7411602002054191e-07, + "loss": 0.2085, + "step": 4547 + }, + { + "epoch": 1.2102182011708356, + "grad_norm": 0.2753799259662628, + "learning_rate": 1.741046812593245e-07, + "loss": 0.1782, + "step": 4548 + }, + { + "epoch": 1.2104843001596595, + "grad_norm": 0.2689739167690277, + "learning_rate": 1.74093340384491e-07, + "loss": 0.2028, + "step": 4549 + }, + { + "epoch": 1.2107503991484831, + "grad_norm": 0.3525231182575226, + "learning_rate": 1.7408199739636483e-07, + "loss": 0.2014, + "step": 4550 + }, + { + "epoch": 1.211016498137307, + "grad_norm": 0.3178929388523102, + "learning_rate": 1.740706522952696e-07, + "loss": 0.1888, + "step": 4551 + }, + { + "epoch": 1.211282597126131, + "grad_norm": 0.2881659269332886, + "learning_rate": 1.7405930508152883e-07, + "loss": 0.2058, + "step": 4552 + }, + { + "epoch": 1.2115486961149549, + "grad_norm": 0.30697473883628845, + "learning_rate": 1.740479557554662e-07, + "loss": 0.2012, + "step": 4553 + }, + { + "epoch": 1.2118147951037785, + "grad_norm": 0.3614782392978668, + "learning_rate": 1.7403660431740538e-07, + "loss": 0.2033, + "step": 4554 + }, + { + "epoch": 1.2120808940926024, + "grad_norm": 0.28463974595069885, + "learning_rate": 1.7402525076767017e-07, + "loss": 0.1941, + "step": 4555 + }, + { + "epoch": 1.2123469930814263, + "grad_norm": 0.2786436378955841, + "learning_rate": 1.7401389510658438e-07, + "loss": 0.2056, + "step": 4556 + }, + { + "epoch": 1.2126130920702503, + "grad_norm": 2.035756826400757, + "learning_rate": 1.7400253733447188e-07, + "loss": 0.2116, + "step": 4557 + }, + { + "epoch": 1.212879191059074, + "grad_norm": 0.3478240966796875, + "learning_rate": 1.7399117745165665e-07, + "loss": 0.1979, + "step": 4558 + }, + { + "epoch": 1.2131452900478978, + "grad_norm": 0.41415947675704956, + "learning_rate": 1.739798154584627e-07, + "loss": 0.1982, + "step": 4559 + }, + { + "epoch": 1.2134113890367217, + "grad_norm": 0.2542966604232788, + "learning_rate": 1.7396845135521402e-07, + "loss": 0.2037, + "step": 4560 + }, + { + "epoch": 1.2136774880255454, + "grad_norm": 0.2664873003959656, + "learning_rate": 1.7395708514223485e-07, + "loss": 0.1942, + "step": 4561 + }, + { + "epoch": 1.2139435870143693, + "grad_norm": 0.2586786448955536, + "learning_rate": 1.7394571681984932e-07, + "loss": 0.1919, + "step": 4562 + }, + { + "epoch": 1.2142096860031932, + "grad_norm": 0.275093674659729, + "learning_rate": 1.7393434638838164e-07, + "loss": 0.1869, + "step": 4563 + }, + { + "epoch": 1.214475784992017, + "grad_norm": 0.39333194494247437, + "learning_rate": 1.739229738481562e-07, + "loss": 0.1979, + "step": 4564 + }, + { + "epoch": 1.2147418839808408, + "grad_norm": 0.30341973900794983, + "learning_rate": 1.739115991994973e-07, + "loss": 0.1838, + "step": 4565 + }, + { + "epoch": 1.2150079829696647, + "grad_norm": 0.2955318093299866, + "learning_rate": 1.739002224427294e-07, + "loss": 0.1876, + "step": 4566 + }, + { + "epoch": 1.2152740819584886, + "grad_norm": 0.25919464230537415, + "learning_rate": 1.73888843578177e-07, + "loss": 0.1881, + "step": 4567 + }, + { + "epoch": 1.2155401809473123, + "grad_norm": 0.35974863171577454, + "learning_rate": 1.7387746260616462e-07, + "loss": 0.1998, + "step": 4568 + }, + { + "epoch": 1.2158062799361362, + "grad_norm": 0.43629732728004456, + "learning_rate": 1.738660795270169e-07, + "loss": 0.2077, + "step": 4569 + }, + { + "epoch": 1.2160723789249601, + "grad_norm": 0.3575083017349243, + "learning_rate": 1.7385469434105847e-07, + "loss": 0.1994, + "step": 4570 + }, + { + "epoch": 1.216338477913784, + "grad_norm": 0.3002742826938629, + "learning_rate": 1.7384330704861409e-07, + "loss": 0.1744, + "step": 4571 + }, + { + "epoch": 1.2166045769026077, + "grad_norm": 0.2900792062282562, + "learning_rate": 1.7383191765000854e-07, + "loss": 0.1956, + "step": 4572 + }, + { + "epoch": 1.2168706758914316, + "grad_norm": 0.3504222333431244, + "learning_rate": 1.7382052614556662e-07, + "loss": 0.2021, + "step": 4573 + }, + { + "epoch": 1.2171367748802555, + "grad_norm": 0.2796315848827362, + "learning_rate": 1.7380913253561334e-07, + "loss": 0.2141, + "step": 4574 + }, + { + "epoch": 1.2174028738690792, + "grad_norm": 0.2873952090740204, + "learning_rate": 1.737977368204736e-07, + "loss": 0.1935, + "step": 4575 + }, + { + "epoch": 1.217668972857903, + "grad_norm": 0.28148186206817627, + "learning_rate": 1.7378633900047238e-07, + "loss": 0.1975, + "step": 4576 + }, + { + "epoch": 1.217935071846727, + "grad_norm": 0.2741760015487671, + "learning_rate": 1.737749390759349e-07, + "loss": 0.1921, + "step": 4577 + }, + { + "epoch": 1.218201170835551, + "grad_norm": 0.3646749258041382, + "learning_rate": 1.7376353704718623e-07, + "loss": 0.2131, + "step": 4578 + }, + { + "epoch": 1.2184672698243746, + "grad_norm": 0.44691240787506104, + "learning_rate": 1.7375213291455157e-07, + "loss": 0.2169, + "step": 4579 + }, + { + "epoch": 1.2187333688131985, + "grad_norm": 0.3757668137550354, + "learning_rate": 1.7374072667835621e-07, + "loss": 0.1949, + "step": 4580 + }, + { + "epoch": 1.2189994678020224, + "grad_norm": 0.34043386578559875, + "learning_rate": 1.737293183389255e-07, + "loss": 0.2079, + "step": 4581 + }, + { + "epoch": 1.2192655667908463, + "grad_norm": 0.3191604018211365, + "learning_rate": 1.7371790789658477e-07, + "loss": 0.2103, + "step": 4582 + }, + { + "epoch": 1.21953166577967, + "grad_norm": 0.3720548748970032, + "learning_rate": 1.7370649535165955e-07, + "loss": 0.2101, + "step": 4583 + }, + { + "epoch": 1.2197977647684939, + "grad_norm": 0.2569558620452881, + "learning_rate": 1.7369508070447528e-07, + "loss": 0.1847, + "step": 4584 + }, + { + "epoch": 1.2200638637573178, + "grad_norm": 0.297958642244339, + "learning_rate": 1.7368366395535753e-07, + "loss": 0.1959, + "step": 4585 + }, + { + "epoch": 1.2203299627461415, + "grad_norm": 0.27579569816589355, + "learning_rate": 1.7367224510463198e-07, + "loss": 0.1976, + "step": 4586 + }, + { + "epoch": 1.2205960617349654, + "grad_norm": 0.26350951194763184, + "learning_rate": 1.7366082415262428e-07, + "loss": 0.184, + "step": 4587 + }, + { + "epoch": 1.2208621607237893, + "grad_norm": 0.2803489863872528, + "learning_rate": 1.7364940109966022e-07, + "loss": 0.2048, + "step": 4588 + }, + { + "epoch": 1.2211282597126132, + "grad_norm": 0.2897370159626007, + "learning_rate": 1.7363797594606552e-07, + "loss": 0.1908, + "step": 4589 + }, + { + "epoch": 1.2213943587014369, + "grad_norm": 0.3050122857093811, + "learning_rate": 1.7362654869216615e-07, + "loss": 0.1949, + "step": 4590 + }, + { + "epoch": 1.2216604576902608, + "grad_norm": 0.43992340564727783, + "learning_rate": 1.7361511933828798e-07, + "loss": 0.2383, + "step": 4591 + }, + { + "epoch": 1.2219265566790847, + "grad_norm": 0.2763534486293793, + "learning_rate": 1.7360368788475702e-07, + "loss": 0.183, + "step": 4592 + }, + { + "epoch": 1.2221926556679086, + "grad_norm": 0.27124640345573425, + "learning_rate": 1.7359225433189933e-07, + "loss": 0.2021, + "step": 4593 + }, + { + "epoch": 1.2224587546567323, + "grad_norm": 0.26006823778152466, + "learning_rate": 1.7358081868004096e-07, + "loss": 0.1856, + "step": 4594 + }, + { + "epoch": 1.2227248536455562, + "grad_norm": 0.3200024366378784, + "learning_rate": 1.7356938092950813e-07, + "loss": 0.1921, + "step": 4595 + }, + { + "epoch": 1.22299095263438, + "grad_norm": 0.29520097374916077, + "learning_rate": 1.7355794108062708e-07, + "loss": 0.2034, + "step": 4596 + }, + { + "epoch": 1.2232570516232038, + "grad_norm": 0.2736699879169464, + "learning_rate": 1.7354649913372405e-07, + "loss": 0.1931, + "step": 4597 + }, + { + "epoch": 1.2235231506120277, + "grad_norm": 0.5858630537986755, + "learning_rate": 1.7353505508912538e-07, + "loss": 0.1951, + "step": 4598 + }, + { + "epoch": 1.2237892496008516, + "grad_norm": 0.3476867079734802, + "learning_rate": 1.7352360894715756e-07, + "loss": 0.2179, + "step": 4599 + }, + { + "epoch": 1.2240553485896752, + "grad_norm": 0.2692701816558838, + "learning_rate": 1.73512160708147e-07, + "loss": 0.1946, + "step": 4600 + }, + { + "epoch": 1.2243214475784991, + "grad_norm": 0.2741781771183014, + "learning_rate": 1.735007103724202e-07, + "loss": 0.1845, + "step": 4601 + }, + { + "epoch": 1.224587546567323, + "grad_norm": 0.25234201550483704, + "learning_rate": 1.7348925794030383e-07, + "loss": 0.1862, + "step": 4602 + }, + { + "epoch": 1.224853645556147, + "grad_norm": 0.32778725028038025, + "learning_rate": 1.7347780341212442e-07, + "loss": 0.2105, + "step": 4603 + }, + { + "epoch": 1.2251197445449706, + "grad_norm": 0.3374875485897064, + "learning_rate": 1.734663467882088e-07, + "loss": 0.2156, + "step": 4604 + }, + { + "epoch": 1.2253858435337945, + "grad_norm": 0.27290478348731995, + "learning_rate": 1.7345488806888366e-07, + "loss": 0.2047, + "step": 4605 + }, + { + "epoch": 1.2256519425226184, + "grad_norm": 0.28455302119255066, + "learning_rate": 1.7344342725447584e-07, + "loss": 0.1902, + "step": 4606 + }, + { + "epoch": 1.2259180415114423, + "grad_norm": 0.3285924196243286, + "learning_rate": 1.7343196434531224e-07, + "loss": 0.1945, + "step": 4607 + }, + { + "epoch": 1.226184140500266, + "grad_norm": 0.414681613445282, + "learning_rate": 1.7342049934171978e-07, + "loss": 0.2028, + "step": 4608 + }, + { + "epoch": 1.22645023948909, + "grad_norm": 0.3249066174030304, + "learning_rate": 1.734090322440255e-07, + "loss": 0.215, + "step": 4609 + }, + { + "epoch": 1.2267163384779138, + "grad_norm": 0.32981568574905396, + "learning_rate": 1.7339756305255647e-07, + "loss": 0.1969, + "step": 4610 + }, + { + "epoch": 1.2269824374667375, + "grad_norm": 0.2960037887096405, + "learning_rate": 1.7338609176763973e-07, + "loss": 0.2148, + "step": 4611 + }, + { + "epoch": 1.2272485364555614, + "grad_norm": 0.35726848244667053, + "learning_rate": 1.7337461838960254e-07, + "loss": 0.1909, + "step": 4612 + }, + { + "epoch": 1.2275146354443853, + "grad_norm": 0.27086296677589417, + "learning_rate": 1.7336314291877217e-07, + "loss": 0.2171, + "step": 4613 + }, + { + "epoch": 1.2277807344332092, + "grad_norm": 0.39568865299224854, + "learning_rate": 1.7335166535547586e-07, + "loss": 0.2187, + "step": 4614 + }, + { + "epoch": 1.228046833422033, + "grad_norm": 0.27989548444747925, + "learning_rate": 1.73340185700041e-07, + "loss": 0.1869, + "step": 4615 + }, + { + "epoch": 1.2283129324108568, + "grad_norm": 0.353346586227417, + "learning_rate": 1.7332870395279497e-07, + "loss": 0.2097, + "step": 4616 + }, + { + "epoch": 1.2285790313996807, + "grad_norm": 0.2676091194152832, + "learning_rate": 1.7331722011406534e-07, + "loss": 0.1688, + "step": 4617 + }, + { + "epoch": 1.2288451303885046, + "grad_norm": 0.26701441407203674, + "learning_rate": 1.7330573418417958e-07, + "loss": 0.196, + "step": 4618 + }, + { + "epoch": 1.2291112293773283, + "grad_norm": 0.3989965617656708, + "learning_rate": 1.7329424616346532e-07, + "loss": 0.1862, + "step": 4619 + }, + { + "epoch": 1.2293773283661522, + "grad_norm": 0.33684906363487244, + "learning_rate": 1.7328275605225025e-07, + "loss": 0.2032, + "step": 4620 + }, + { + "epoch": 1.2296434273549761, + "grad_norm": 0.3222401738166809, + "learning_rate": 1.73271263850862e-07, + "loss": 0.184, + "step": 4621 + }, + { + "epoch": 1.2299095263437998, + "grad_norm": 0.2663836181163788, + "learning_rate": 1.7325976955962846e-07, + "loss": 0.2017, + "step": 4622 + }, + { + "epoch": 1.2301756253326237, + "grad_norm": 0.28040239214897156, + "learning_rate": 1.732482731788774e-07, + "loss": 0.2073, + "step": 4623 + }, + { + "epoch": 1.2304417243214476, + "grad_norm": 0.4161216914653778, + "learning_rate": 1.7323677470893675e-07, + "loss": 0.228, + "step": 4624 + }, + { + "epoch": 1.2307078233102715, + "grad_norm": 0.27564796805381775, + "learning_rate": 1.7322527415013445e-07, + "loss": 0.1865, + "step": 4625 + }, + { + "epoch": 1.2309739222990952, + "grad_norm": 0.29464560747146606, + "learning_rate": 1.7321377150279854e-07, + "loss": 0.1974, + "step": 4626 + }, + { + "epoch": 1.231240021287919, + "grad_norm": 0.26817017793655396, + "learning_rate": 1.732022667672571e-07, + "loss": 0.1922, + "step": 4627 + }, + { + "epoch": 1.231506120276743, + "grad_norm": 0.2686633765697479, + "learning_rate": 1.7319075994383826e-07, + "loss": 0.1981, + "step": 4628 + }, + { + "epoch": 1.231772219265567, + "grad_norm": 0.40527522563934326, + "learning_rate": 1.731792510328702e-07, + "loss": 0.2154, + "step": 4629 + }, + { + "epoch": 1.2320383182543906, + "grad_norm": 0.2569325864315033, + "learning_rate": 1.7316774003468117e-07, + "loss": 0.2046, + "step": 4630 + }, + { + "epoch": 1.2323044172432145, + "grad_norm": 0.3237255811691284, + "learning_rate": 1.7315622694959955e-07, + "loss": 0.1978, + "step": 4631 + }, + { + "epoch": 1.2325705162320384, + "grad_norm": 0.29248058795928955, + "learning_rate": 1.7314471177795364e-07, + "loss": 0.1913, + "step": 4632 + }, + { + "epoch": 1.232836615220862, + "grad_norm": 0.35978013277053833, + "learning_rate": 1.7313319452007194e-07, + "loss": 0.2068, + "step": 4633 + }, + { + "epoch": 1.233102714209686, + "grad_norm": 0.26545292139053345, + "learning_rate": 1.731216751762829e-07, + "loss": 0.1906, + "step": 4634 + }, + { + "epoch": 1.2333688131985099, + "grad_norm": 0.3664827346801758, + "learning_rate": 1.731101537469151e-07, + "loss": 0.1889, + "step": 4635 + }, + { + "epoch": 1.2336349121873336, + "grad_norm": 0.286532461643219, + "learning_rate": 1.7309863023229716e-07, + "loss": 0.1956, + "step": 4636 + }, + { + "epoch": 1.2339010111761575, + "grad_norm": 0.2683810591697693, + "learning_rate": 1.7308710463275775e-07, + "loss": 0.19, + "step": 4637 + }, + { + "epoch": 1.2341671101649814, + "grad_norm": 0.3452762961387634, + "learning_rate": 1.7307557694862558e-07, + "loss": 0.2054, + "step": 4638 + }, + { + "epoch": 1.2344332091538053, + "grad_norm": 0.25693172216415405, + "learning_rate": 1.7306404718022948e-07, + "loss": 0.1659, + "step": 4639 + }, + { + "epoch": 1.2346993081426292, + "grad_norm": 0.2640230655670166, + "learning_rate": 1.7305251532789823e-07, + "loss": 0.1917, + "step": 4640 + }, + { + "epoch": 1.2349654071314529, + "grad_norm": 0.28865060210227966, + "learning_rate": 1.7304098139196085e-07, + "loss": 0.1975, + "step": 4641 + }, + { + "epoch": 1.2352315061202768, + "grad_norm": 0.3486877381801605, + "learning_rate": 1.7302944537274625e-07, + "loss": 0.214, + "step": 4642 + }, + { + "epoch": 1.2354976051091007, + "grad_norm": 0.36282312870025635, + "learning_rate": 1.7301790727058343e-07, + "loss": 0.2125, + "step": 4643 + }, + { + "epoch": 1.2357637040979244, + "grad_norm": 0.25129738450050354, + "learning_rate": 1.7300636708580151e-07, + "loss": 0.1843, + "step": 4644 + }, + { + "epoch": 1.2360298030867483, + "grad_norm": 0.3674907088279724, + "learning_rate": 1.7299482481872968e-07, + "loss": 0.2047, + "step": 4645 + }, + { + "epoch": 1.2362959020755722, + "grad_norm": 0.4330374002456665, + "learning_rate": 1.7298328046969707e-07, + "loss": 0.2204, + "step": 4646 + }, + { + "epoch": 1.2365620010643958, + "grad_norm": 0.3683096170425415, + "learning_rate": 1.7297173403903304e-07, + "loss": 0.1966, + "step": 4647 + }, + { + "epoch": 1.2368281000532197, + "grad_norm": 0.3441014885902405, + "learning_rate": 1.7296018552706685e-07, + "loss": 0.189, + "step": 4648 + }, + { + "epoch": 1.2370941990420437, + "grad_norm": 0.25675371289253235, + "learning_rate": 1.729486349341279e-07, + "loss": 0.1854, + "step": 4649 + }, + { + "epoch": 1.2373602980308676, + "grad_norm": 0.3732759952545166, + "learning_rate": 1.7293708226054563e-07, + "loss": 0.1983, + "step": 4650 + }, + { + "epoch": 1.2376263970196912, + "grad_norm": 0.3738357424736023, + "learning_rate": 1.7292552750664958e-07, + "loss": 0.211, + "step": 4651 + }, + { + "epoch": 1.2378924960085151, + "grad_norm": 0.32749903202056885, + "learning_rate": 1.7291397067276932e-07, + "loss": 0.2041, + "step": 4652 + }, + { + "epoch": 1.238158594997339, + "grad_norm": 0.28865158557891846, + "learning_rate": 1.729024117592344e-07, + "loss": 0.1962, + "step": 4653 + }, + { + "epoch": 1.238424693986163, + "grad_norm": 0.32831355929374695, + "learning_rate": 1.7289085076637457e-07, + "loss": 0.212, + "step": 4654 + }, + { + "epoch": 1.2386907929749866, + "grad_norm": 0.25910836458206177, + "learning_rate": 1.728792876945196e-07, + "loss": 0.181, + "step": 4655 + }, + { + "epoch": 1.2389568919638105, + "grad_norm": 0.43815088272094727, + "learning_rate": 1.7286772254399916e-07, + "loss": 0.2267, + "step": 4656 + }, + { + "epoch": 1.2392229909526344, + "grad_norm": 0.25108930468559265, + "learning_rate": 1.7285615531514327e-07, + "loss": 0.1774, + "step": 4657 + }, + { + "epoch": 1.2394890899414581, + "grad_norm": 0.27617713809013367, + "learning_rate": 1.7284458600828172e-07, + "loss": 0.1868, + "step": 4658 + }, + { + "epoch": 1.239755188930282, + "grad_norm": 0.31674423813819885, + "learning_rate": 1.728330146237446e-07, + "loss": 0.2136, + "step": 4659 + }, + { + "epoch": 1.240021287919106, + "grad_norm": 0.2893883287906647, + "learning_rate": 1.7282144116186188e-07, + "loss": 0.203, + "step": 4660 + }, + { + "epoch": 1.2402873869079298, + "grad_norm": 0.35409238934516907, + "learning_rate": 1.7280986562296368e-07, + "loss": 0.2172, + "step": 4661 + }, + { + "epoch": 1.2405534858967535, + "grad_norm": 0.27895960211753845, + "learning_rate": 1.7279828800738017e-07, + "loss": 0.1912, + "step": 4662 + }, + { + "epoch": 1.2408195848855774, + "grad_norm": 0.3294803500175476, + "learning_rate": 1.727867083154415e-07, + "loss": 0.1979, + "step": 4663 + }, + { + "epoch": 1.2410856838744013, + "grad_norm": 0.42068102955818176, + "learning_rate": 1.7277512654747807e-07, + "loss": 0.2191, + "step": 4664 + }, + { + "epoch": 1.2413517828632252, + "grad_norm": 0.3682175278663635, + "learning_rate": 1.727635427038201e-07, + "loss": 0.1979, + "step": 4665 + }, + { + "epoch": 1.241617881852049, + "grad_norm": 0.32979482412338257, + "learning_rate": 1.7275195678479803e-07, + "loss": 0.2163, + "step": 4666 + }, + { + "epoch": 1.2418839808408728, + "grad_norm": 0.2659591734409332, + "learning_rate": 1.7274036879074232e-07, + "loss": 0.1899, + "step": 4667 + }, + { + "epoch": 1.2421500798296967, + "grad_norm": 0.27037763595581055, + "learning_rate": 1.7272877872198351e-07, + "loss": 0.2046, + "step": 4668 + }, + { + "epoch": 1.2424161788185204, + "grad_norm": 0.2598491311073303, + "learning_rate": 1.727171865788521e-07, + "loss": 0.1942, + "step": 4669 + }, + { + "epoch": 1.2426822778073443, + "grad_norm": 0.2748238146305084, + "learning_rate": 1.7270559236167877e-07, + "loss": 0.2011, + "step": 4670 + }, + { + "epoch": 1.2429483767961682, + "grad_norm": 0.3923819363117218, + "learning_rate": 1.7269399607079418e-07, + "loss": 0.1927, + "step": 4671 + }, + { + "epoch": 1.2432144757849921, + "grad_norm": 0.28938937187194824, + "learning_rate": 1.7268239770652913e-07, + "loss": 0.1898, + "step": 4672 + }, + { + "epoch": 1.2434805747738158, + "grad_norm": 0.24834474921226501, + "learning_rate": 1.726707972692144e-07, + "loss": 0.176, + "step": 4673 + }, + { + "epoch": 1.2437466737626397, + "grad_norm": 0.28535741567611694, + "learning_rate": 1.7265919475918086e-07, + "loss": 0.1904, + "step": 4674 + }, + { + "epoch": 1.2440127727514636, + "grad_norm": 0.284389466047287, + "learning_rate": 1.726475901767594e-07, + "loss": 0.1952, + "step": 4675 + }, + { + "epoch": 1.2442788717402875, + "grad_norm": 0.272819459438324, + "learning_rate": 1.7263598352228107e-07, + "loss": 0.1974, + "step": 4676 + }, + { + "epoch": 1.2445449707291112, + "grad_norm": 0.29208219051361084, + "learning_rate": 1.726243747960769e-07, + "loss": 0.2149, + "step": 4677 + }, + { + "epoch": 1.244811069717935, + "grad_norm": 0.2734481692314148, + "learning_rate": 1.7261276399847794e-07, + "loss": 0.2107, + "step": 4678 + }, + { + "epoch": 1.245077168706759, + "grad_norm": 0.2660684287548065, + "learning_rate": 1.726011511298154e-07, + "loss": 0.1938, + "step": 4679 + }, + { + "epoch": 1.2453432676955827, + "grad_norm": 0.4396940767765045, + "learning_rate": 1.7258953619042055e-07, + "loss": 0.2014, + "step": 4680 + }, + { + "epoch": 1.2456093666844066, + "grad_norm": 0.2412765473127365, + "learning_rate": 1.7257791918062456e-07, + "loss": 0.1589, + "step": 4681 + }, + { + "epoch": 1.2458754656732305, + "grad_norm": 0.3002229332923889, + "learning_rate": 1.7256630010075885e-07, + "loss": 0.2042, + "step": 4682 + }, + { + "epoch": 1.2461415646620542, + "grad_norm": 0.27879735827445984, + "learning_rate": 1.725546789511548e-07, + "loss": 0.1738, + "step": 4683 + }, + { + "epoch": 1.246407663650878, + "grad_norm": 0.31714755296707153, + "learning_rate": 1.7254305573214389e-07, + "loss": 0.1967, + "step": 4684 + }, + { + "epoch": 1.246673762639702, + "grad_norm": 0.33017000555992126, + "learning_rate": 1.7253143044405757e-07, + "loss": 0.1862, + "step": 4685 + }, + { + "epoch": 1.2469398616285259, + "grad_norm": 0.26267799735069275, + "learning_rate": 1.7251980308722752e-07, + "loss": 0.1685, + "step": 4686 + }, + { + "epoch": 1.2472059606173496, + "grad_norm": 0.3213536739349365, + "learning_rate": 1.725081736619853e-07, + "loss": 0.1972, + "step": 4687 + }, + { + "epoch": 1.2474720596061735, + "grad_norm": 0.2681410014629364, + "learning_rate": 1.724965421686626e-07, + "loss": 0.2013, + "step": 4688 + }, + { + "epoch": 1.2477381585949974, + "grad_norm": 0.35904964804649353, + "learning_rate": 1.724849086075912e-07, + "loss": 0.1881, + "step": 4689 + }, + { + "epoch": 1.2480042575838213, + "grad_norm": 0.35871702432632446, + "learning_rate": 1.7247327297910293e-07, + "loss": 0.2171, + "step": 4690 + }, + { + "epoch": 1.248270356572645, + "grad_norm": 0.34307196736335754, + "learning_rate": 1.7246163528352961e-07, + "loss": 0.1856, + "step": 4691 + }, + { + "epoch": 1.2485364555614689, + "grad_norm": 0.26677587628364563, + "learning_rate": 1.7244999552120322e-07, + "loss": 0.2038, + "step": 4692 + }, + { + "epoch": 1.2488025545502928, + "grad_norm": 0.3183039724826813, + "learning_rate": 1.7243835369245577e-07, + "loss": 0.1924, + "step": 4693 + }, + { + "epoch": 1.2490686535391164, + "grad_norm": 0.32955458760261536, + "learning_rate": 1.7242670979761923e-07, + "loss": 0.1927, + "step": 4694 + }, + { + "epoch": 1.2493347525279404, + "grad_norm": 0.5067923665046692, + "learning_rate": 1.7241506383702572e-07, + "loss": 0.1867, + "step": 4695 + }, + { + "epoch": 1.2496008515167643, + "grad_norm": 0.26268333196640015, + "learning_rate": 1.7240341581100748e-07, + "loss": 0.1803, + "step": 4696 + }, + { + "epoch": 1.2498669505055882, + "grad_norm": 0.2704629600048065, + "learning_rate": 1.7239176571989668e-07, + "loss": 0.1909, + "step": 4697 + }, + { + "epoch": 1.2501330494944118, + "grad_norm": 0.3324767053127289, + "learning_rate": 1.7238011356402563e-07, + "loss": 0.2067, + "step": 4698 + }, + { + "epoch": 1.2503991484832357, + "grad_norm": 0.25056275725364685, + "learning_rate": 1.7236845934372663e-07, + "loss": 0.1913, + "step": 4699 + }, + { + "epoch": 1.2506652474720596, + "grad_norm": 0.3266981840133667, + "learning_rate": 1.7235680305933213e-07, + "loss": 0.1995, + "step": 4700 + }, + { + "epoch": 1.2509313464608836, + "grad_norm": 0.26588764786720276, + "learning_rate": 1.7234514471117457e-07, + "loss": 0.2017, + "step": 4701 + }, + { + "epoch": 1.2511974454497072, + "grad_norm": 0.26175814867019653, + "learning_rate": 1.7233348429958644e-07, + "loss": 0.2157, + "step": 4702 + }, + { + "epoch": 1.2514635444385311, + "grad_norm": 0.2613925039768219, + "learning_rate": 1.7232182182490039e-07, + "loss": 0.1861, + "step": 4703 + }, + { + "epoch": 1.251729643427355, + "grad_norm": 0.32963302731513977, + "learning_rate": 1.72310157287449e-07, + "loss": 0.1796, + "step": 4704 + }, + { + "epoch": 1.2519957424161787, + "grad_norm": 0.3444957733154297, + "learning_rate": 1.72298490687565e-07, + "loss": 0.186, + "step": 4705 + }, + { + "epoch": 1.2522618414050026, + "grad_norm": 0.33151406049728394, + "learning_rate": 1.7228682202558109e-07, + "loss": 0.1905, + "step": 4706 + }, + { + "epoch": 1.2525279403938265, + "grad_norm": 0.2710644602775574, + "learning_rate": 1.7227515130183015e-07, + "loss": 0.1897, + "step": 4707 + }, + { + "epoch": 1.2527940393826502, + "grad_norm": 0.28254640102386475, + "learning_rate": 1.7226347851664502e-07, + "loss": 0.1656, + "step": 4708 + }, + { + "epoch": 1.2530601383714741, + "grad_norm": 0.3197149634361267, + "learning_rate": 1.7225180367035863e-07, + "loss": 0.2109, + "step": 4709 + }, + { + "epoch": 1.253326237360298, + "grad_norm": 0.2946774959564209, + "learning_rate": 1.7224012676330397e-07, + "loss": 0.2028, + "step": 4710 + }, + { + "epoch": 1.253592336349122, + "grad_norm": 0.2872462570667267, + "learning_rate": 1.7222844779581411e-07, + "loss": 0.2001, + "step": 4711 + }, + { + "epoch": 1.2538584353379458, + "grad_norm": 0.39819571375846863, + "learning_rate": 1.7221676676822213e-07, + "loss": 0.224, + "step": 4712 + }, + { + "epoch": 1.2541245343267695, + "grad_norm": 0.2614293396472931, + "learning_rate": 1.7220508368086124e-07, + "loss": 0.2047, + "step": 4713 + }, + { + "epoch": 1.2543906333155934, + "grad_norm": 0.33155396580696106, + "learning_rate": 1.721933985340646e-07, + "loss": 0.1845, + "step": 4714 + }, + { + "epoch": 1.2546567323044173, + "grad_norm": 0.39900216460227966, + "learning_rate": 1.7218171132816553e-07, + "loss": 0.2002, + "step": 4715 + }, + { + "epoch": 1.254922831293241, + "grad_norm": 0.4404732584953308, + "learning_rate": 1.721700220634974e-07, + "loss": 0.1877, + "step": 4716 + }, + { + "epoch": 1.255188930282065, + "grad_norm": 0.328723281621933, + "learning_rate": 1.721583307403936e-07, + "loss": 0.2032, + "step": 4717 + }, + { + "epoch": 1.2554550292708888, + "grad_norm": 0.2853264808654785, + "learning_rate": 1.7214663735918752e-07, + "loss": 0.1712, + "step": 4718 + }, + { + "epoch": 1.2557211282597125, + "grad_norm": 0.319513201713562, + "learning_rate": 1.7213494192021275e-07, + "loss": 0.1962, + "step": 4719 + }, + { + "epoch": 1.2559872272485364, + "grad_norm": 0.3298017382621765, + "learning_rate": 1.7212324442380285e-07, + "loss": 0.1957, + "step": 4720 + }, + { + "epoch": 1.2562533262373603, + "grad_norm": 0.447463721036911, + "learning_rate": 1.7211154487029148e-07, + "loss": 0.2241, + "step": 4721 + }, + { + "epoch": 1.2565194252261842, + "grad_norm": 0.33973363041877747, + "learning_rate": 1.720998432600123e-07, + "loss": 0.1978, + "step": 4722 + }, + { + "epoch": 1.256785524215008, + "grad_norm": 0.3074515163898468, + "learning_rate": 1.720881395932991e-07, + "loss": 0.1904, + "step": 4723 + }, + { + "epoch": 1.2570516232038318, + "grad_norm": 0.4059058427810669, + "learning_rate": 1.7207643387048564e-07, + "loss": 0.2047, + "step": 4724 + }, + { + "epoch": 1.2573177221926557, + "grad_norm": 0.35773766040802, + "learning_rate": 1.7206472609190584e-07, + "loss": 0.22, + "step": 4725 + }, + { + "epoch": 1.2575838211814796, + "grad_norm": 0.7546595931053162, + "learning_rate": 1.7205301625789357e-07, + "loss": 0.2058, + "step": 4726 + }, + { + "epoch": 1.2578499201703033, + "grad_norm": 0.2706814408302307, + "learning_rate": 1.7204130436878291e-07, + "loss": 0.2007, + "step": 4727 + }, + { + "epoch": 1.2581160191591272, + "grad_norm": 0.35128042101860046, + "learning_rate": 1.720295904249078e-07, + "loss": 0.2038, + "step": 4728 + }, + { + "epoch": 1.258382118147951, + "grad_norm": 0.3030253052711487, + "learning_rate": 1.7201787442660244e-07, + "loss": 0.1925, + "step": 4729 + }, + { + "epoch": 1.2586482171367748, + "grad_norm": 0.29520195722579956, + "learning_rate": 1.7200615637420094e-07, + "loss": 0.1933, + "step": 4730 + }, + { + "epoch": 1.2589143161255987, + "grad_norm": 0.3605925142765045, + "learning_rate": 1.7199443626803753e-07, + "loss": 0.2176, + "step": 4731 + }, + { + "epoch": 1.2591804151144226, + "grad_norm": 0.33408084511756897, + "learning_rate": 1.7198271410844648e-07, + "loss": 0.1977, + "step": 4732 + }, + { + "epoch": 1.2594465141032465, + "grad_norm": 0.3335241973400116, + "learning_rate": 1.719709898957622e-07, + "loss": 0.1935, + "step": 4733 + }, + { + "epoch": 1.2597126130920704, + "grad_norm": 0.324561208486557, + "learning_rate": 1.71959263630319e-07, + "loss": 0.2046, + "step": 4734 + }, + { + "epoch": 1.259978712080894, + "grad_norm": 0.40318918228149414, + "learning_rate": 1.719475353124514e-07, + "loss": 0.2031, + "step": 4735 + }, + { + "epoch": 1.260244811069718, + "grad_norm": 0.37982454895973206, + "learning_rate": 1.7193580494249387e-07, + "loss": 0.1976, + "step": 4736 + }, + { + "epoch": 1.2605109100585419, + "grad_norm": 0.3375946879386902, + "learning_rate": 1.71924072520781e-07, + "loss": 0.1984, + "step": 4737 + }, + { + "epoch": 1.2607770090473656, + "grad_norm": 0.686668336391449, + "learning_rate": 1.7191233804764745e-07, + "loss": 0.1988, + "step": 4738 + }, + { + "epoch": 1.2610431080361895, + "grad_norm": 0.26938754320144653, + "learning_rate": 1.7190060152342785e-07, + "loss": 0.191, + "step": 4739 + }, + { + "epoch": 1.2613092070250134, + "grad_norm": 0.31448179483413696, + "learning_rate": 1.71888862948457e-07, + "loss": 0.1893, + "step": 4740 + }, + { + "epoch": 1.261575306013837, + "grad_norm": 0.2956158220767975, + "learning_rate": 1.7187712232306969e-07, + "loss": 0.2024, + "step": 4741 + }, + { + "epoch": 1.261841405002661, + "grad_norm": 0.2542344629764557, + "learning_rate": 1.718653796476008e-07, + "loss": 0.1882, + "step": 4742 + }, + { + "epoch": 1.2621075039914849, + "grad_norm": 0.36488577723503113, + "learning_rate": 1.7185363492238523e-07, + "loss": 0.2209, + "step": 4743 + }, + { + "epoch": 1.2623736029803085, + "grad_norm": 0.23775090277194977, + "learning_rate": 1.71841888147758e-07, + "loss": 0.1656, + "step": 4744 + }, + { + "epoch": 1.2626397019691324, + "grad_norm": 0.29581519961357117, + "learning_rate": 1.718301393240541e-07, + "loss": 0.2135, + "step": 4745 + }, + { + "epoch": 1.2629058009579563, + "grad_norm": 0.335602343082428, + "learning_rate": 1.7181838845160868e-07, + "loss": 0.196, + "step": 4746 + }, + { + "epoch": 1.2631718999467803, + "grad_norm": 0.3601905405521393, + "learning_rate": 1.718066355307569e-07, + "loss": 0.2075, + "step": 4747 + }, + { + "epoch": 1.2634379989356042, + "grad_norm": 0.3322058320045471, + "learning_rate": 1.717948805618339e-07, + "loss": 0.1944, + "step": 4748 + }, + { + "epoch": 1.2637040979244278, + "grad_norm": 0.32237154245376587, + "learning_rate": 1.7178312354517507e-07, + "loss": 0.1965, + "step": 4749 + }, + { + "epoch": 1.2639701969132517, + "grad_norm": 0.27279791235923767, + "learning_rate": 1.7177136448111567e-07, + "loss": 0.1776, + "step": 4750 + }, + { + "epoch": 1.2642362959020756, + "grad_norm": 0.2918681502342224, + "learning_rate": 1.717596033699911e-07, + "loss": 0.2094, + "step": 4751 + }, + { + "epoch": 1.2645023948908993, + "grad_norm": 0.3273528814315796, + "learning_rate": 1.717478402121368e-07, + "loss": 0.1863, + "step": 4752 + }, + { + "epoch": 1.2647684938797232, + "grad_norm": 0.34131279587745667, + "learning_rate": 1.7173607500788832e-07, + "loss": 0.1817, + "step": 4753 + }, + { + "epoch": 1.2650345928685471, + "grad_norm": 0.32170045375823975, + "learning_rate": 1.7172430775758122e-07, + "loss": 0.2036, + "step": 4754 + }, + { + "epoch": 1.2653006918573708, + "grad_norm": 0.2571232318878174, + "learning_rate": 1.717125384615511e-07, + "loss": 0.1878, + "step": 4755 + }, + { + "epoch": 1.2655667908461947, + "grad_norm": 0.32784557342529297, + "learning_rate": 1.7170076712013364e-07, + "loss": 0.199, + "step": 4756 + }, + { + "epoch": 1.2658328898350186, + "grad_norm": 0.25731077790260315, + "learning_rate": 1.716889937336646e-07, + "loss": 0.1862, + "step": 4757 + }, + { + "epoch": 1.2660989888238425, + "grad_norm": 0.28408822417259216, + "learning_rate": 1.716772183024798e-07, + "loss": 0.1936, + "step": 4758 + }, + { + "epoch": 1.2663650878126664, + "grad_norm": 0.327414870262146, + "learning_rate": 1.7166544082691507e-07, + "loss": 0.1957, + "step": 4759 + }, + { + "epoch": 1.2666311868014901, + "grad_norm": 0.2584860920906067, + "learning_rate": 1.7165366130730633e-07, + "loss": 0.1838, + "step": 4760 + }, + { + "epoch": 1.266897285790314, + "grad_norm": 0.3008761703968048, + "learning_rate": 1.7164187974398957e-07, + "loss": 0.2132, + "step": 4761 + }, + { + "epoch": 1.267163384779138, + "grad_norm": 0.327202171087265, + "learning_rate": 1.7163009613730082e-07, + "loss": 0.1951, + "step": 4762 + }, + { + "epoch": 1.2674294837679616, + "grad_norm": 0.2482883632183075, + "learning_rate": 1.7161831048757618e-07, + "loss": 0.1919, + "step": 4763 + }, + { + "epoch": 1.2676955827567855, + "grad_norm": 0.36884617805480957, + "learning_rate": 1.7160652279515178e-07, + "loss": 0.2282, + "step": 4764 + }, + { + "epoch": 1.2679616817456094, + "grad_norm": 0.25854942202568054, + "learning_rate": 1.7159473306036384e-07, + "loss": 0.1837, + "step": 4765 + }, + { + "epoch": 1.268227780734433, + "grad_norm": 0.33825406432151794, + "learning_rate": 1.715829412835486e-07, + "loss": 0.2183, + "step": 4766 + }, + { + "epoch": 1.268493879723257, + "grad_norm": 0.40041297674179077, + "learning_rate": 1.7157114746504245e-07, + "loss": 0.196, + "step": 4767 + }, + { + "epoch": 1.268759978712081, + "grad_norm": 0.2809421420097351, + "learning_rate": 1.7155935160518168e-07, + "loss": 0.1961, + "step": 4768 + }, + { + "epoch": 1.2690260777009048, + "grad_norm": 0.2935275733470917, + "learning_rate": 1.7154755370430285e-07, + "loss": 0.2064, + "step": 4769 + }, + { + "epoch": 1.2692921766897287, + "grad_norm": 0.2821356952190399, + "learning_rate": 1.7153575376274236e-07, + "loss": 0.1932, + "step": 4770 + }, + { + "epoch": 1.2695582756785524, + "grad_norm": 0.29932886362075806, + "learning_rate": 1.715239517808368e-07, + "loss": 0.2296, + "step": 4771 + }, + { + "epoch": 1.2698243746673763, + "grad_norm": 0.2695576846599579, + "learning_rate": 1.7151214775892282e-07, + "loss": 0.1968, + "step": 4772 + }, + { + "epoch": 1.2700904736562002, + "grad_norm": 0.3856523036956787, + "learning_rate": 1.7150034169733704e-07, + "loss": 0.1875, + "step": 4773 + }, + { + "epoch": 1.2703565726450239, + "grad_norm": 0.2853524386882782, + "learning_rate": 1.7148853359641623e-07, + "loss": 0.1921, + "step": 4774 + }, + { + "epoch": 1.2706226716338478, + "grad_norm": 0.28011682629585266, + "learning_rate": 1.714767234564972e-07, + "loss": 0.193, + "step": 4775 + }, + { + "epoch": 1.2708887706226717, + "grad_norm": 0.27650943398475647, + "learning_rate": 1.7146491127791673e-07, + "loss": 0.2134, + "step": 4776 + }, + { + "epoch": 1.2711548696114954, + "grad_norm": 0.40806689858436584, + "learning_rate": 1.7145309706101176e-07, + "loss": 0.2054, + "step": 4777 + }, + { + "epoch": 1.2714209686003193, + "grad_norm": 0.32174810767173767, + "learning_rate": 1.714412808061193e-07, + "loss": 0.2045, + "step": 4778 + }, + { + "epoch": 1.2716870675891432, + "grad_norm": 0.2636895477771759, + "learning_rate": 1.714294625135763e-07, + "loss": 0.1788, + "step": 4779 + }, + { + "epoch": 1.2719531665779669, + "grad_norm": 0.24986398220062256, + "learning_rate": 1.714176421837199e-07, + "loss": 0.1781, + "step": 4780 + }, + { + "epoch": 1.2722192655667908, + "grad_norm": 0.28838348388671875, + "learning_rate": 1.714058198168872e-07, + "loss": 0.2088, + "step": 4781 + }, + { + "epoch": 1.2724853645556147, + "grad_norm": 0.3029356002807617, + "learning_rate": 1.713939954134154e-07, + "loss": 0.1666, + "step": 4782 + }, + { + "epoch": 1.2727514635444386, + "grad_norm": 0.384539395570755, + "learning_rate": 1.7138216897364181e-07, + "loss": 0.1817, + "step": 4783 + }, + { + "epoch": 1.2730175625332625, + "grad_norm": 0.2810957431793213, + "learning_rate": 1.7137034049790372e-07, + "loss": 0.1861, + "step": 4784 + }, + { + "epoch": 1.2732836615220862, + "grad_norm": 0.2633908689022064, + "learning_rate": 1.7135850998653844e-07, + "loss": 0.1961, + "step": 4785 + }, + { + "epoch": 1.27354976051091, + "grad_norm": 0.34910818934440613, + "learning_rate": 1.7134667743988348e-07, + "loss": 0.218, + "step": 4786 + }, + { + "epoch": 1.273815859499734, + "grad_norm": 0.26924970746040344, + "learning_rate": 1.7133484285827628e-07, + "loss": 0.1988, + "step": 4787 + }, + { + "epoch": 1.2740819584885577, + "grad_norm": 0.3670582175254822, + "learning_rate": 1.713230062420544e-07, + "loss": 0.2103, + "step": 4788 + }, + { + "epoch": 1.2743480574773816, + "grad_norm": 0.2717945873737335, + "learning_rate": 1.7131116759155546e-07, + "loss": 0.1915, + "step": 4789 + }, + { + "epoch": 1.2746141564662055, + "grad_norm": 0.2648572325706482, + "learning_rate": 1.712993269071171e-07, + "loss": 0.2044, + "step": 4790 + }, + { + "epoch": 1.2748802554550291, + "grad_norm": 0.2755540609359741, + "learning_rate": 1.7128748418907702e-07, + "loss": 0.2045, + "step": 4791 + }, + { + "epoch": 1.275146354443853, + "grad_norm": 0.26459941267967224, + "learning_rate": 1.712756394377731e-07, + "loss": 0.194, + "step": 4792 + }, + { + "epoch": 1.275412453432677, + "grad_norm": 0.4075152575969696, + "learning_rate": 1.7126379265354304e-07, + "loss": 0.2148, + "step": 4793 + }, + { + "epoch": 1.2756785524215009, + "grad_norm": 0.26972058415412903, + "learning_rate": 1.712519438367248e-07, + "loss": 0.1774, + "step": 4794 + }, + { + "epoch": 1.2759446514103248, + "grad_norm": 0.31496238708496094, + "learning_rate": 1.7124009298765638e-07, + "loss": 0.1993, + "step": 4795 + }, + { + "epoch": 1.2762107503991484, + "grad_norm": 0.35669827461242676, + "learning_rate": 1.7122824010667573e-07, + "loss": 0.1979, + "step": 4796 + }, + { + "epoch": 1.2764768493879723, + "grad_norm": 0.38979271054267883, + "learning_rate": 1.712163851941209e-07, + "loss": 0.2125, + "step": 4797 + }, + { + "epoch": 1.2767429483767962, + "grad_norm": 0.34344246983528137, + "learning_rate": 1.7120452825033004e-07, + "loss": 0.2174, + "step": 4798 + }, + { + "epoch": 1.27700904736562, + "grad_norm": 0.24070796370506287, + "learning_rate": 1.7119266927564136e-07, + "loss": 0.1859, + "step": 4799 + }, + { + "epoch": 1.2772751463544438, + "grad_norm": 0.328001469373703, + "learning_rate": 1.7118080827039308e-07, + "loss": 0.2101, + "step": 4800 + }, + { + "epoch": 1.2775412453432677, + "grad_norm": 0.23886315524578094, + "learning_rate": 1.711689452349235e-07, + "loss": 0.1698, + "step": 4801 + }, + { + "epoch": 1.2778073443320914, + "grad_norm": 0.3375195264816284, + "learning_rate": 1.7115708016957097e-07, + "loss": 0.2204, + "step": 4802 + }, + { + "epoch": 1.2780734433209153, + "grad_norm": 0.25857001543045044, + "learning_rate": 1.7114521307467397e-07, + "loss": 0.1865, + "step": 4803 + }, + { + "epoch": 1.2783395423097392, + "grad_norm": 0.41098928451538086, + "learning_rate": 1.7113334395057084e-07, + "loss": 0.1958, + "step": 4804 + }, + { + "epoch": 1.2786056412985631, + "grad_norm": 0.2926747798919678, + "learning_rate": 1.7112147279760024e-07, + "loss": 0.1951, + "step": 4805 + }, + { + "epoch": 1.278871740287387, + "grad_norm": 0.2837531566619873, + "learning_rate": 1.7110959961610067e-07, + "loss": 0.1995, + "step": 4806 + }, + { + "epoch": 1.2791378392762107, + "grad_norm": 0.4281994700431824, + "learning_rate": 1.7109772440641085e-07, + "loss": 0.2027, + "step": 4807 + }, + { + "epoch": 1.2794039382650346, + "grad_norm": 0.27004462480545044, + "learning_rate": 1.7108584716886947e-07, + "loss": 0.1847, + "step": 4808 + }, + { + "epoch": 1.2796700372538585, + "grad_norm": 0.2865297496318817, + "learning_rate": 1.7107396790381526e-07, + "loss": 0.1909, + "step": 4809 + }, + { + "epoch": 1.2799361362426822, + "grad_norm": 0.2897714078426361, + "learning_rate": 1.71062086611587e-07, + "loss": 0.2034, + "step": 4810 + }, + { + "epoch": 1.2802022352315061, + "grad_norm": 0.34067612886428833, + "learning_rate": 1.710502032925237e-07, + "loss": 0.1942, + "step": 4811 + }, + { + "epoch": 1.28046833422033, + "grad_norm": 0.2576823830604553, + "learning_rate": 1.7103831794696418e-07, + "loss": 0.179, + "step": 4812 + }, + { + "epoch": 1.2807344332091537, + "grad_norm": 0.32400837540626526, + "learning_rate": 1.710264305752475e-07, + "loss": 0.1758, + "step": 4813 + }, + { + "epoch": 1.2810005321979776, + "grad_norm": 0.30534428358078003, + "learning_rate": 1.7101454117771268e-07, + "loss": 0.2088, + "step": 4814 + }, + { + "epoch": 1.2812666311868015, + "grad_norm": 0.27816951274871826, + "learning_rate": 1.7100264975469885e-07, + "loss": 0.1932, + "step": 4815 + }, + { + "epoch": 1.2815327301756254, + "grad_norm": 0.33873140811920166, + "learning_rate": 1.7099075630654513e-07, + "loss": 0.1913, + "step": 4816 + }, + { + "epoch": 1.281798829164449, + "grad_norm": 0.2722085118293762, + "learning_rate": 1.7097886083359078e-07, + "loss": 0.1841, + "step": 4817 + }, + { + "epoch": 1.282064928153273, + "grad_norm": 0.3571869432926178, + "learning_rate": 1.7096696333617512e-07, + "loss": 0.1922, + "step": 4818 + }, + { + "epoch": 1.282331027142097, + "grad_norm": 0.34577062726020813, + "learning_rate": 1.7095506381463746e-07, + "loss": 0.1942, + "step": 4819 + }, + { + "epoch": 1.2825971261309208, + "grad_norm": 0.27142685651779175, + "learning_rate": 1.7094316226931715e-07, + "loss": 0.1974, + "step": 4820 + }, + { + "epoch": 1.2828632251197445, + "grad_norm": 0.30213817954063416, + "learning_rate": 1.7093125870055368e-07, + "loss": 0.1896, + "step": 4821 + }, + { + "epoch": 1.2831293241085684, + "grad_norm": 0.2646564543247223, + "learning_rate": 1.7091935310868664e-07, + "loss": 0.1803, + "step": 4822 + }, + { + "epoch": 1.2833954230973923, + "grad_norm": 0.27617743611335754, + "learning_rate": 1.7090744549405548e-07, + "loss": 0.1993, + "step": 4823 + }, + { + "epoch": 1.283661522086216, + "grad_norm": 0.26707300543785095, + "learning_rate": 1.7089553585699988e-07, + "loss": 0.179, + "step": 4824 + }, + { + "epoch": 1.2839276210750399, + "grad_norm": 0.2786442041397095, + "learning_rate": 1.7088362419785956e-07, + "loss": 0.2045, + "step": 4825 + }, + { + "epoch": 1.2841937200638638, + "grad_norm": 0.3241187632083893, + "learning_rate": 1.7087171051697423e-07, + "loss": 0.1992, + "step": 4826 + }, + { + "epoch": 1.2844598190526875, + "grad_norm": 0.2679177522659302, + "learning_rate": 1.7085979481468368e-07, + "loss": 0.1699, + "step": 4827 + }, + { + "epoch": 1.2847259180415114, + "grad_norm": 0.2669777274131775, + "learning_rate": 1.708478770913278e-07, + "loss": 0.1814, + "step": 4828 + }, + { + "epoch": 1.2849920170303353, + "grad_norm": 0.33494284749031067, + "learning_rate": 1.708359573472465e-07, + "loss": 0.1918, + "step": 4829 + }, + { + "epoch": 1.2852581160191592, + "grad_norm": 0.24893009662628174, + "learning_rate": 1.7082403558277973e-07, + "loss": 0.1842, + "step": 4830 + }, + { + "epoch": 1.285524215007983, + "grad_norm": 0.3471750020980835, + "learning_rate": 1.7081211179826758e-07, + "loss": 0.1934, + "step": 4831 + }, + { + "epoch": 1.2857903139968068, + "grad_norm": 0.2843485474586487, + "learning_rate": 1.7080018599405008e-07, + "loss": 0.1923, + "step": 4832 + }, + { + "epoch": 1.2860564129856307, + "grad_norm": 0.277513325214386, + "learning_rate": 1.707882581704674e-07, + "loss": 0.1994, + "step": 4833 + }, + { + "epoch": 1.2863225119744546, + "grad_norm": 0.29675063490867615, + "learning_rate": 1.7077632832785976e-07, + "loss": 0.2092, + "step": 4834 + }, + { + "epoch": 1.2865886109632783, + "grad_norm": 0.524325966835022, + "learning_rate": 1.7076439646656739e-07, + "loss": 0.204, + "step": 4835 + }, + { + "epoch": 1.2868547099521022, + "grad_norm": 0.2630719542503357, + "learning_rate": 1.7075246258693065e-07, + "loss": 0.187, + "step": 4836 + }, + { + "epoch": 1.287120808940926, + "grad_norm": 0.2807718813419342, + "learning_rate": 1.7074052668928988e-07, + "loss": 0.2098, + "step": 4837 + }, + { + "epoch": 1.2873869079297497, + "grad_norm": 0.2692658603191376, + "learning_rate": 1.707285887739856e-07, + "loss": 0.1996, + "step": 4838 + }, + { + "epoch": 1.2876530069185737, + "grad_norm": 0.5677692890167236, + "learning_rate": 1.707166488413582e-07, + "loss": 0.1857, + "step": 4839 + }, + { + "epoch": 1.2879191059073976, + "grad_norm": 0.32517513632774353, + "learning_rate": 1.7070470689174825e-07, + "loss": 0.1874, + "step": 4840 + }, + { + "epoch": 1.2881852048962215, + "grad_norm": 0.2788250148296356, + "learning_rate": 1.706927629254964e-07, + "loss": 0.202, + "step": 4841 + }, + { + "epoch": 1.2884513038850454, + "grad_norm": 0.2677358388900757, + "learning_rate": 1.706808169429433e-07, + "loss": 0.194, + "step": 4842 + }, + { + "epoch": 1.288717402873869, + "grad_norm": 0.2777240574359894, + "learning_rate": 1.7066886894442967e-07, + "loss": 0.1925, + "step": 4843 + }, + { + "epoch": 1.288983501862693, + "grad_norm": 0.2724658250808716, + "learning_rate": 1.706569189302963e-07, + "loss": 0.2099, + "step": 4844 + }, + { + "epoch": 1.2892496008515169, + "grad_norm": 0.262777179479599, + "learning_rate": 1.7064496690088398e-07, + "loss": 0.1765, + "step": 4845 + }, + { + "epoch": 1.2895156998403405, + "grad_norm": 0.2800410985946655, + "learning_rate": 1.706330128565337e-07, + "loss": 0.1942, + "step": 4846 + }, + { + "epoch": 1.2897817988291644, + "grad_norm": 0.30761006474494934, + "learning_rate": 1.706210567975863e-07, + "loss": 0.2047, + "step": 4847 + }, + { + "epoch": 1.2900478978179883, + "grad_norm": 0.44353392720222473, + "learning_rate": 1.7060909872438292e-07, + "loss": 0.2363, + "step": 4848 + }, + { + "epoch": 1.290313996806812, + "grad_norm": 0.3104381859302521, + "learning_rate": 1.7059713863726452e-07, + "loss": 0.2075, + "step": 4849 + }, + { + "epoch": 1.290580095795636, + "grad_norm": 0.25530850887298584, + "learning_rate": 1.7058517653657228e-07, + "loss": 0.1667, + "step": 4850 + }, + { + "epoch": 1.2908461947844598, + "grad_norm": 0.30483558773994446, + "learning_rate": 1.7057321242264737e-07, + "loss": 0.2259, + "step": 4851 + }, + { + "epoch": 1.2911122937732837, + "grad_norm": 0.35742443799972534, + "learning_rate": 1.7056124629583103e-07, + "loss": 0.1854, + "step": 4852 + }, + { + "epoch": 1.2913783927621076, + "grad_norm": 0.27920299768447876, + "learning_rate": 1.7054927815646454e-07, + "loss": 0.1852, + "step": 4853 + }, + { + "epoch": 1.2916444917509313, + "grad_norm": 0.25677502155303955, + "learning_rate": 1.705373080048893e-07, + "loss": 0.1803, + "step": 4854 + }, + { + "epoch": 1.2919105907397552, + "grad_norm": 0.4229871928691864, + "learning_rate": 1.7052533584144668e-07, + "loss": 0.1972, + "step": 4855 + }, + { + "epoch": 1.2921766897285791, + "grad_norm": 0.31956547498703003, + "learning_rate": 1.7051336166647816e-07, + "loss": 0.1964, + "step": 4856 + }, + { + "epoch": 1.2924427887174028, + "grad_norm": 0.4129461646080017, + "learning_rate": 1.705013854803253e-07, + "loss": 0.2001, + "step": 4857 + }, + { + "epoch": 1.2927088877062267, + "grad_norm": 0.2602297067642212, + "learning_rate": 1.7048940728332962e-07, + "loss": 0.1902, + "step": 4858 + }, + { + "epoch": 1.2929749866950506, + "grad_norm": 0.32168012857437134, + "learning_rate": 1.704774270758328e-07, + "loss": 0.2136, + "step": 4859 + }, + { + "epoch": 1.2932410856838743, + "grad_norm": 0.31181225180625916, + "learning_rate": 1.704654448581766e-07, + "loss": 0.1853, + "step": 4860 + }, + { + "epoch": 1.2935071846726982, + "grad_norm": 0.28490981459617615, + "learning_rate": 1.704534606307027e-07, + "loss": 0.1834, + "step": 4861 + }, + { + "epoch": 1.2937732836615221, + "grad_norm": 0.26168015599250793, + "learning_rate": 1.704414743937529e-07, + "loss": 0.1822, + "step": 4862 + }, + { + "epoch": 1.2940393826503458, + "grad_norm": 0.3345613181591034, + "learning_rate": 1.7042948614766917e-07, + "loss": 0.1798, + "step": 4863 + }, + { + "epoch": 1.2943054816391697, + "grad_norm": 0.293201744556427, + "learning_rate": 1.7041749589279333e-07, + "loss": 0.1843, + "step": 4864 + }, + { + "epoch": 1.2945715806279936, + "grad_norm": 0.2994779944419861, + "learning_rate": 1.7040550362946742e-07, + "loss": 0.1915, + "step": 4865 + }, + { + "epoch": 1.2948376796168175, + "grad_norm": 0.28619736433029175, + "learning_rate": 1.7039350935803344e-07, + "loss": 0.2006, + "step": 4866 + }, + { + "epoch": 1.2951037786056414, + "grad_norm": 0.501125156879425, + "learning_rate": 1.703815130788336e-07, + "loss": 0.2112, + "step": 4867 + }, + { + "epoch": 1.295369877594465, + "grad_norm": 0.2721174359321594, + "learning_rate": 1.7036951479220992e-07, + "loss": 0.1928, + "step": 4868 + }, + { + "epoch": 1.295635976583289, + "grad_norm": 0.4003330171108246, + "learning_rate": 1.7035751449850473e-07, + "loss": 0.2115, + "step": 4869 + }, + { + "epoch": 1.295902075572113, + "grad_norm": 0.34192270040512085, + "learning_rate": 1.7034551219806024e-07, + "loss": 0.1849, + "step": 4870 + }, + { + "epoch": 1.2961681745609366, + "grad_norm": 0.26448309421539307, + "learning_rate": 1.703335078912188e-07, + "loss": 0.1792, + "step": 4871 + }, + { + "epoch": 1.2964342735497605, + "grad_norm": 0.3149232566356659, + "learning_rate": 1.7032150157832275e-07, + "loss": 0.2232, + "step": 4872 + }, + { + "epoch": 1.2967003725385844, + "grad_norm": 0.27277079224586487, + "learning_rate": 1.7030949325971458e-07, + "loss": 0.1893, + "step": 4873 + }, + { + "epoch": 1.296966471527408, + "grad_norm": 0.2874739468097687, + "learning_rate": 1.702974829357368e-07, + "loss": 0.1996, + "step": 4874 + }, + { + "epoch": 1.297232570516232, + "grad_norm": 0.36319002509117126, + "learning_rate": 1.7028547060673198e-07, + "loss": 0.1875, + "step": 4875 + }, + { + "epoch": 1.2974986695050559, + "grad_norm": 0.3791465759277344, + "learning_rate": 1.702734562730427e-07, + "loss": 0.2106, + "step": 4876 + }, + { + "epoch": 1.2977647684938798, + "grad_norm": 0.29044514894485474, + "learning_rate": 1.702614399350116e-07, + "loss": 0.1869, + "step": 4877 + }, + { + "epoch": 1.2980308674827037, + "grad_norm": 0.27887943387031555, + "learning_rate": 1.702494215929815e-07, + "loss": 0.2003, + "step": 4878 + }, + { + "epoch": 1.2982969664715274, + "grad_norm": 0.3121079206466675, + "learning_rate": 1.7023740124729512e-07, + "loss": 0.1969, + "step": 4879 + }, + { + "epoch": 1.2985630654603513, + "grad_norm": 0.2559910714626312, + "learning_rate": 1.7022537889829538e-07, + "loss": 0.1742, + "step": 4880 + }, + { + "epoch": 1.2988291644491752, + "grad_norm": 0.29559826850891113, + "learning_rate": 1.7021335454632508e-07, + "loss": 0.1923, + "step": 4881 + }, + { + "epoch": 1.2990952634379989, + "grad_norm": 0.2471504807472229, + "learning_rate": 1.7020132819172726e-07, + "loss": 0.1852, + "step": 4882 + }, + { + "epoch": 1.2993613624268228, + "grad_norm": 0.31838029623031616, + "learning_rate": 1.7018929983484487e-07, + "loss": 0.1953, + "step": 4883 + }, + { + "epoch": 1.2996274614156467, + "grad_norm": 0.3540472388267517, + "learning_rate": 1.7017726947602103e-07, + "loss": 0.209, + "step": 4884 + }, + { + "epoch": 1.2998935604044703, + "grad_norm": 0.2526331841945648, + "learning_rate": 1.7016523711559885e-07, + "loss": 0.1896, + "step": 4885 + }, + { + "epoch": 1.3001596593932943, + "grad_norm": 0.2692146897315979, + "learning_rate": 1.7015320275392152e-07, + "loss": 0.1837, + "step": 4886 + }, + { + "epoch": 1.3004257583821182, + "grad_norm": 0.33686015009880066, + "learning_rate": 1.7014116639133233e-07, + "loss": 0.1925, + "step": 4887 + }, + { + "epoch": 1.300691857370942, + "grad_norm": 0.26608896255493164, + "learning_rate": 1.701291280281745e-07, + "loss": 0.1756, + "step": 4888 + }, + { + "epoch": 1.300957956359766, + "grad_norm": 0.37117600440979004, + "learning_rate": 1.7011708766479144e-07, + "loss": 0.1964, + "step": 4889 + }, + { + "epoch": 1.3012240553485896, + "grad_norm": 0.26166433095932007, + "learning_rate": 1.7010504530152655e-07, + "loss": 0.1934, + "step": 4890 + }, + { + "epoch": 1.3014901543374136, + "grad_norm": 0.3566124439239502, + "learning_rate": 1.7009300093872332e-07, + "loss": 0.2054, + "step": 4891 + }, + { + "epoch": 1.3017562533262375, + "grad_norm": 0.3137739300727844, + "learning_rate": 1.7008095457672527e-07, + "loss": 0.1991, + "step": 4892 + }, + { + "epoch": 1.3020223523150611, + "grad_norm": 0.28113406896591187, + "learning_rate": 1.7006890621587597e-07, + "loss": 0.187, + "step": 4893 + }, + { + "epoch": 1.302288451303885, + "grad_norm": 0.2700991630554199, + "learning_rate": 1.700568558565191e-07, + "loss": 0.2088, + "step": 4894 + }, + { + "epoch": 1.302554550292709, + "grad_norm": 0.3692110478878021, + "learning_rate": 1.700448034989983e-07, + "loss": 0.1902, + "step": 4895 + }, + { + "epoch": 1.3028206492815326, + "grad_norm": 0.2934630513191223, + "learning_rate": 1.7003274914365738e-07, + "loss": 0.1828, + "step": 4896 + }, + { + "epoch": 1.3030867482703565, + "grad_norm": 0.3301782011985779, + "learning_rate": 1.7002069279084015e-07, + "loss": 0.1909, + "step": 4897 + }, + { + "epoch": 1.3033528472591804, + "grad_norm": 0.2800828516483307, + "learning_rate": 1.700086344408905e-07, + "loss": 0.1949, + "step": 4898 + }, + { + "epoch": 1.3036189462480043, + "grad_norm": 0.253656804561615, + "learning_rate": 1.699965740941523e-07, + "loss": 0.176, + "step": 4899 + }, + { + "epoch": 1.303885045236828, + "grad_norm": 0.24801690876483917, + "learning_rate": 1.6998451175096956e-07, + "loss": 0.1838, + "step": 4900 + }, + { + "epoch": 1.304151144225652, + "grad_norm": 0.28824788331985474, + "learning_rate": 1.6997244741168632e-07, + "loss": 0.1976, + "step": 4901 + }, + { + "epoch": 1.3044172432144758, + "grad_norm": 0.3612249195575714, + "learning_rate": 1.6996038107664674e-07, + "loss": 0.186, + "step": 4902 + }, + { + "epoch": 1.3046833422032997, + "grad_norm": 0.26279523968696594, + "learning_rate": 1.6994831274619487e-07, + "loss": 0.1984, + "step": 4903 + }, + { + "epoch": 1.3049494411921234, + "grad_norm": 0.36049503087997437, + "learning_rate": 1.6993624242067502e-07, + "loss": 0.1955, + "step": 4904 + }, + { + "epoch": 1.3052155401809473, + "grad_norm": 0.2847253680229187, + "learning_rate": 1.6992417010043141e-07, + "loss": 0.1976, + "step": 4905 + }, + { + "epoch": 1.3054816391697712, + "grad_norm": 0.2557484805583954, + "learning_rate": 1.6991209578580837e-07, + "loss": 0.177, + "step": 4906 + }, + { + "epoch": 1.305747738158595, + "grad_norm": 0.26285165548324585, + "learning_rate": 1.6990001947715028e-07, + "loss": 0.189, + "step": 4907 + }, + { + "epoch": 1.3060138371474188, + "grad_norm": 0.25078821182250977, + "learning_rate": 1.698879411748016e-07, + "loss": 0.184, + "step": 4908 + }, + { + "epoch": 1.3062799361362427, + "grad_norm": 0.2504613399505615, + "learning_rate": 1.6987586087910683e-07, + "loss": 0.1686, + "step": 4909 + }, + { + "epoch": 1.3065460351250664, + "grad_norm": 0.4096110165119171, + "learning_rate": 1.698637785904105e-07, + "loss": 0.1892, + "step": 4910 + }, + { + "epoch": 1.3068121341138903, + "grad_norm": 0.3619256615638733, + "learning_rate": 1.6985169430905724e-07, + "loss": 0.1914, + "step": 4911 + }, + { + "epoch": 1.3070782331027142, + "grad_norm": 0.2821843922138214, + "learning_rate": 1.698396080353917e-07, + "loss": 0.2023, + "step": 4912 + }, + { + "epoch": 1.307344332091538, + "grad_norm": 0.24985119700431824, + "learning_rate": 1.6982751976975861e-07, + "loss": 0.1802, + "step": 4913 + }, + { + "epoch": 1.307610431080362, + "grad_norm": 0.33226650953292847, + "learning_rate": 1.6981542951250277e-07, + "loss": 0.2042, + "step": 4914 + }, + { + "epoch": 1.3078765300691857, + "grad_norm": 0.3053646385669708, + "learning_rate": 1.6980333726396904e-07, + "loss": 0.1819, + "step": 4915 + }, + { + "epoch": 1.3081426290580096, + "grad_norm": 0.4318743348121643, + "learning_rate": 1.6979124302450225e-07, + "loss": 0.216, + "step": 4916 + }, + { + "epoch": 1.3084087280468335, + "grad_norm": 0.2815534174442291, + "learning_rate": 1.6977914679444737e-07, + "loss": 0.1926, + "step": 4917 + }, + { + "epoch": 1.3086748270356572, + "grad_norm": 0.33881521224975586, + "learning_rate": 1.6976704857414948e-07, + "loss": 0.1979, + "step": 4918 + }, + { + "epoch": 1.308940926024481, + "grad_norm": 0.2915138900279999, + "learning_rate": 1.6975494836395354e-07, + "loss": 0.1922, + "step": 4919 + }, + { + "epoch": 1.309207025013305, + "grad_norm": 0.2780732214450836, + "learning_rate": 1.6974284616420476e-07, + "loss": 0.1942, + "step": 4920 + }, + { + "epoch": 1.3094731240021287, + "grad_norm": 0.29031530022621155, + "learning_rate": 1.6973074197524828e-07, + "loss": 0.1999, + "step": 4921 + }, + { + "epoch": 1.3097392229909526, + "grad_norm": 0.3185518980026245, + "learning_rate": 1.6971863579742933e-07, + "loss": 0.2026, + "step": 4922 + }, + { + "epoch": 1.3100053219797765, + "grad_norm": 0.29415783286094666, + "learning_rate": 1.697065276310932e-07, + "loss": 0.2089, + "step": 4923 + }, + { + "epoch": 1.3102714209686004, + "grad_norm": 0.26924601197242737, + "learning_rate": 1.6969441747658527e-07, + "loss": 0.215, + "step": 4924 + }, + { + "epoch": 1.3105375199574243, + "grad_norm": 0.3877262771129608, + "learning_rate": 1.6968230533425092e-07, + "loss": 0.2103, + "step": 4925 + }, + { + "epoch": 1.310803618946248, + "grad_norm": 0.37313157320022583, + "learning_rate": 1.6967019120443563e-07, + "loss": 0.1951, + "step": 4926 + }, + { + "epoch": 1.3110697179350719, + "grad_norm": 0.3236340284347534, + "learning_rate": 1.6965807508748488e-07, + "loss": 0.1671, + "step": 4927 + }, + { + "epoch": 1.3113358169238958, + "grad_norm": 0.28587427735328674, + "learning_rate": 1.696459569837443e-07, + "loss": 0.1889, + "step": 4928 + }, + { + "epoch": 1.3116019159127195, + "grad_norm": 0.2725149989128113, + "learning_rate": 1.696338368935595e-07, + "loss": 0.199, + "step": 4929 + }, + { + "epoch": 1.3118680149015434, + "grad_norm": 0.27804046869277954, + "learning_rate": 1.6962171481727618e-07, + "loss": 0.1811, + "step": 4930 + }, + { + "epoch": 1.3121341138903673, + "grad_norm": 0.3475317358970642, + "learning_rate": 1.6960959075524003e-07, + "loss": 0.1878, + "step": 4931 + }, + { + "epoch": 1.312400212879191, + "grad_norm": 0.2668650150299072, + "learning_rate": 1.6959746470779695e-07, + "loss": 0.1927, + "step": 4932 + }, + { + "epoch": 1.3126663118680149, + "grad_norm": 0.2994067668914795, + "learning_rate": 1.6958533667529272e-07, + "loss": 0.1798, + "step": 4933 + }, + { + "epoch": 1.3129324108568388, + "grad_norm": 0.24210596084594727, + "learning_rate": 1.6957320665807328e-07, + "loss": 0.1809, + "step": 4934 + }, + { + "epoch": 1.3131985098456627, + "grad_norm": 0.31282609701156616, + "learning_rate": 1.6956107465648462e-07, + "loss": 0.2142, + "step": 4935 + }, + { + "epoch": 1.3134646088344863, + "grad_norm": 0.2850237786769867, + "learning_rate": 1.6954894067087278e-07, + "loss": 0.1885, + "step": 4936 + }, + { + "epoch": 1.3137307078233103, + "grad_norm": 0.3692125976085663, + "learning_rate": 1.6953680470158378e-07, + "loss": 0.2153, + "step": 4937 + }, + { + "epoch": 1.3139968068121342, + "grad_norm": 0.32071906328201294, + "learning_rate": 1.6952466674896383e-07, + "loss": 0.2024, + "step": 4938 + }, + { + "epoch": 1.314262905800958, + "grad_norm": 0.32148537039756775, + "learning_rate": 1.695125268133591e-07, + "loss": 0.1917, + "step": 4939 + }, + { + "epoch": 1.3145290047897817, + "grad_norm": 0.3474298119544983, + "learning_rate": 1.6950038489511585e-07, + "loss": 0.178, + "step": 4940 + }, + { + "epoch": 1.3147951037786056, + "grad_norm": 0.2659369707107544, + "learning_rate": 1.6948824099458036e-07, + "loss": 0.1934, + "step": 4941 + }, + { + "epoch": 1.3150612027674295, + "grad_norm": 0.2946436107158661, + "learning_rate": 1.6947609511209904e-07, + "loss": 0.1968, + "step": 4942 + }, + { + "epoch": 1.3153273017562532, + "grad_norm": 0.31869062781333923, + "learning_rate": 1.6946394724801835e-07, + "loss": 0.1918, + "step": 4943 + }, + { + "epoch": 1.3155934007450771, + "grad_norm": 0.2749162018299103, + "learning_rate": 1.6945179740268468e-07, + "loss": 0.194, + "step": 4944 + }, + { + "epoch": 1.315859499733901, + "grad_norm": 0.2866474986076355, + "learning_rate": 1.6943964557644464e-07, + "loss": 0.1862, + "step": 4945 + }, + { + "epoch": 1.3161255987227247, + "grad_norm": 0.28725549578666687, + "learning_rate": 1.6942749176964477e-07, + "loss": 0.2003, + "step": 4946 + }, + { + "epoch": 1.3163916977115486, + "grad_norm": 0.28794920444488525, + "learning_rate": 1.6941533598263181e-07, + "loss": 0.1923, + "step": 4947 + }, + { + "epoch": 1.3166577967003725, + "grad_norm": 0.37511909008026123, + "learning_rate": 1.6940317821575235e-07, + "loss": 0.2192, + "step": 4948 + }, + { + "epoch": 1.3169238956891964, + "grad_norm": 0.2541418969631195, + "learning_rate": 1.693910184693532e-07, + "loss": 0.1749, + "step": 4949 + }, + { + "epoch": 1.3171899946780203, + "grad_norm": 0.33020317554473877, + "learning_rate": 1.6937885674378125e-07, + "loss": 0.1963, + "step": 4950 + }, + { + "epoch": 1.317456093666844, + "grad_norm": 0.28583085536956787, + "learning_rate": 1.6936669303938328e-07, + "loss": 0.1916, + "step": 4951 + }, + { + "epoch": 1.317722192655668, + "grad_norm": 0.26600414514541626, + "learning_rate": 1.693545273565063e-07, + "loss": 0.1827, + "step": 4952 + }, + { + "epoch": 1.3179882916444918, + "grad_norm": 0.41796234250068665, + "learning_rate": 1.6934235969549726e-07, + "loss": 0.2018, + "step": 4953 + }, + { + "epoch": 1.3182543906333155, + "grad_norm": 0.27242934703826904, + "learning_rate": 1.693301900567032e-07, + "loss": 0.1983, + "step": 4954 + }, + { + "epoch": 1.3185204896221394, + "grad_norm": 0.33755895495414734, + "learning_rate": 1.6931801844047123e-07, + "loss": 0.1911, + "step": 4955 + }, + { + "epoch": 1.3187865886109633, + "grad_norm": 0.553023636341095, + "learning_rate": 1.6930584484714852e-07, + "loss": 0.1999, + "step": 4956 + }, + { + "epoch": 1.319052687599787, + "grad_norm": 0.3427606225013733, + "learning_rate": 1.692936692770823e-07, + "loss": 0.1952, + "step": 4957 + }, + { + "epoch": 1.319318786588611, + "grad_norm": 0.2931293547153473, + "learning_rate": 1.6928149173061978e-07, + "loss": 0.1851, + "step": 4958 + }, + { + "epoch": 1.3195848855774348, + "grad_norm": 0.3709084987640381, + "learning_rate": 1.6926931220810834e-07, + "loss": 0.206, + "step": 4959 + }, + { + "epoch": 1.3198509845662587, + "grad_norm": 0.25102999806404114, + "learning_rate": 1.6925713070989536e-07, + "loss": 0.1846, + "step": 4960 + }, + { + "epoch": 1.3201170835550826, + "grad_norm": 0.3160124123096466, + "learning_rate": 1.6924494723632828e-07, + "loss": 0.1908, + "step": 4961 + }, + { + "epoch": 1.3203831825439063, + "grad_norm": 0.32464399933815, + "learning_rate": 1.6923276178775456e-07, + "loss": 0.2009, + "step": 4962 + }, + { + "epoch": 1.3206492815327302, + "grad_norm": 0.3868546783924103, + "learning_rate": 1.6922057436452181e-07, + "loss": 0.2074, + "step": 4963 + }, + { + "epoch": 1.320915380521554, + "grad_norm": 0.33194538950920105, + "learning_rate": 1.6920838496697764e-07, + "loss": 0.1875, + "step": 4964 + }, + { + "epoch": 1.3211814795103778, + "grad_norm": 0.301389217376709, + "learning_rate": 1.6919619359546968e-07, + "loss": 0.2139, + "step": 4965 + }, + { + "epoch": 1.3214475784992017, + "grad_norm": 0.29077664017677307, + "learning_rate": 1.6918400025034566e-07, + "loss": 0.2118, + "step": 4966 + }, + { + "epoch": 1.3217136774880256, + "grad_norm": 0.2744576334953308, + "learning_rate": 1.6917180493195335e-07, + "loss": 0.2142, + "step": 4967 + }, + { + "epoch": 1.3219797764768493, + "grad_norm": 0.3688510060310364, + "learning_rate": 1.691596076406406e-07, + "loss": 0.212, + "step": 4968 + }, + { + "epoch": 1.3222458754656732, + "grad_norm": 0.3661993741989136, + "learning_rate": 1.6914740837675533e-07, + "loss": 0.2001, + "step": 4969 + }, + { + "epoch": 1.322511974454497, + "grad_norm": 0.2887817323207855, + "learning_rate": 1.6913520714064542e-07, + "loss": 0.1946, + "step": 4970 + }, + { + "epoch": 1.322778073443321, + "grad_norm": 0.2647855281829834, + "learning_rate": 1.6912300393265893e-07, + "loss": 0.2035, + "step": 4971 + }, + { + "epoch": 1.323044172432145, + "grad_norm": 0.5385107398033142, + "learning_rate": 1.6911079875314392e-07, + "loss": 0.1868, + "step": 4972 + }, + { + "epoch": 1.3233102714209686, + "grad_norm": 0.41586485505104065, + "learning_rate": 1.6909859160244845e-07, + "loss": 0.2042, + "step": 4973 + }, + { + "epoch": 1.3235763704097925, + "grad_norm": 0.39513733983039856, + "learning_rate": 1.6908638248092076e-07, + "loss": 0.1887, + "step": 4974 + }, + { + "epoch": 1.3238424693986164, + "grad_norm": 0.35990238189697266, + "learning_rate": 1.6907417138890907e-07, + "loss": 0.196, + "step": 4975 + }, + { + "epoch": 1.32410856838744, + "grad_norm": 0.3261045813560486, + "learning_rate": 1.690619583267616e-07, + "loss": 0.203, + "step": 4976 + }, + { + "epoch": 1.324374667376264, + "grad_norm": 0.3294385075569153, + "learning_rate": 1.690497432948268e-07, + "loss": 0.2005, + "step": 4977 + }, + { + "epoch": 1.3246407663650879, + "grad_norm": 0.28221389651298523, + "learning_rate": 1.6903752629345296e-07, + "loss": 0.2035, + "step": 4978 + }, + { + "epoch": 1.3249068653539116, + "grad_norm": 0.3009498715400696, + "learning_rate": 1.690253073229886e-07, + "loss": 0.1897, + "step": 4979 + }, + { + "epoch": 1.3251729643427355, + "grad_norm": 0.4638829529285431, + "learning_rate": 1.690130863837822e-07, + "loss": 0.1965, + "step": 4980 + }, + { + "epoch": 1.3254390633315594, + "grad_norm": 0.33599236607551575, + "learning_rate": 1.690008634761823e-07, + "loss": 0.2079, + "step": 4981 + }, + { + "epoch": 1.325705162320383, + "grad_norm": 0.304842472076416, + "learning_rate": 1.6898863860053761e-07, + "loss": 0.2072, + "step": 4982 + }, + { + "epoch": 1.325971261309207, + "grad_norm": 0.3534255921840668, + "learning_rate": 1.6897641175719674e-07, + "loss": 0.2175, + "step": 4983 + }, + { + "epoch": 1.3262373602980309, + "grad_norm": 0.3060854971408844, + "learning_rate": 1.6896418294650845e-07, + "loss": 0.1958, + "step": 4984 + }, + { + "epoch": 1.3265034592868548, + "grad_norm": 0.3354863226413727, + "learning_rate": 1.689519521688215e-07, + "loss": 0.1934, + "step": 4985 + }, + { + "epoch": 1.3267695582756787, + "grad_norm": 0.284915030002594, + "learning_rate": 1.6893971942448475e-07, + "loss": 0.1958, + "step": 4986 + }, + { + "epoch": 1.3270356572645023, + "grad_norm": 0.47653257846832275, + "learning_rate": 1.6892748471384712e-07, + "loss": 0.2175, + "step": 4987 + }, + { + "epoch": 1.3273017562533262, + "grad_norm": 0.2692255675792694, + "learning_rate": 1.6891524803725757e-07, + "loss": 0.1741, + "step": 4988 + }, + { + "epoch": 1.3275678552421502, + "grad_norm": 0.24917586147785187, + "learning_rate": 1.689030093950651e-07, + "loss": 0.1655, + "step": 4989 + }, + { + "epoch": 1.3278339542309738, + "grad_norm": 0.2620799243450165, + "learning_rate": 1.6889076878761878e-07, + "loss": 0.1877, + "step": 4990 + }, + { + "epoch": 1.3281000532197977, + "grad_norm": 0.2839523255825043, + "learning_rate": 1.6887852621526775e-07, + "loss": 0.1958, + "step": 4991 + }, + { + "epoch": 1.3283661522086216, + "grad_norm": 0.27923524379730225, + "learning_rate": 1.6886628167836117e-07, + "loss": 0.2013, + "step": 4992 + }, + { + "epoch": 1.3286322511974453, + "grad_norm": 0.2898057699203491, + "learning_rate": 1.6885403517724827e-07, + "loss": 0.2005, + "step": 4993 + }, + { + "epoch": 1.3288983501862692, + "grad_norm": 0.3570701777935028, + "learning_rate": 1.688417867122784e-07, + "loss": 0.1795, + "step": 4994 + }, + { + "epoch": 1.3291644491750931, + "grad_norm": 0.2756751477718353, + "learning_rate": 1.688295362838009e-07, + "loss": 0.1844, + "step": 4995 + }, + { + "epoch": 1.329430548163917, + "grad_norm": 0.42971375584602356, + "learning_rate": 1.6881728389216513e-07, + "loss": 0.206, + "step": 4996 + }, + { + "epoch": 1.329696647152741, + "grad_norm": 0.28907209634780884, + "learning_rate": 1.6880502953772058e-07, + "loss": 0.2084, + "step": 4997 + }, + { + "epoch": 1.3299627461415646, + "grad_norm": 0.26783788204193115, + "learning_rate": 1.687927732208168e-07, + "loss": 0.1847, + "step": 4998 + }, + { + "epoch": 1.3302288451303885, + "grad_norm": 0.25632432103157043, + "learning_rate": 1.687805149418033e-07, + "loss": 0.1687, + "step": 4999 + }, + { + "epoch": 1.3304949441192124, + "grad_norm": 0.2771508991718292, + "learning_rate": 1.6876825470102976e-07, + "loss": 0.184, + "step": 5000 + }, + { + "epoch": 1.3307610431080361, + "grad_norm": 0.26233232021331787, + "learning_rate": 1.6875599249884584e-07, + "loss": 0.189, + "step": 5001 + }, + { + "epoch": 1.33102714209686, + "grad_norm": 0.316836953163147, + "learning_rate": 1.687437283356013e-07, + "loss": 0.1866, + "step": 5002 + }, + { + "epoch": 1.331293241085684, + "grad_norm": 0.24297310411930084, + "learning_rate": 1.6873146221164594e-07, + "loss": 0.1843, + "step": 5003 + }, + { + "epoch": 1.3315593400745076, + "grad_norm": 0.253386527299881, + "learning_rate": 1.6871919412732957e-07, + "loss": 0.1855, + "step": 5004 + }, + { + "epoch": 1.3318254390633315, + "grad_norm": 0.26497459411621094, + "learning_rate": 1.6870692408300218e-07, + "loss": 0.1794, + "step": 5005 + }, + { + "epoch": 1.3320915380521554, + "grad_norm": 0.29578372836112976, + "learning_rate": 1.686946520790137e-07, + "loss": 0.2085, + "step": 5006 + }, + { + "epoch": 1.3323576370409793, + "grad_norm": 0.2739415764808655, + "learning_rate": 1.6868237811571412e-07, + "loss": 0.2021, + "step": 5007 + }, + { + "epoch": 1.3326237360298032, + "grad_norm": 0.2464197278022766, + "learning_rate": 1.6867010219345358e-07, + "loss": 0.1764, + "step": 5008 + }, + { + "epoch": 1.332889835018627, + "grad_norm": 0.2707882821559906, + "learning_rate": 1.6865782431258215e-07, + "loss": 0.1984, + "step": 5009 + }, + { + "epoch": 1.3331559340074508, + "grad_norm": 0.320624977350235, + "learning_rate": 1.6864554447345007e-07, + "loss": 0.1957, + "step": 5010 + }, + { + "epoch": 1.3334220329962747, + "grad_norm": 0.29463258385658264, + "learning_rate": 1.6863326267640755e-07, + "loss": 0.1929, + "step": 5011 + }, + { + "epoch": 1.3336881319850984, + "grad_norm": 0.30197152495384216, + "learning_rate": 1.6862097892180492e-07, + "loss": 0.2141, + "step": 5012 + }, + { + "epoch": 1.3339542309739223, + "grad_norm": 0.4684069752693176, + "learning_rate": 1.6860869320999257e-07, + "loss": 0.2056, + "step": 5013 + }, + { + "epoch": 1.3342203299627462, + "grad_norm": 0.2591577470302582, + "learning_rate": 1.685964055413208e-07, + "loss": 0.1637, + "step": 5014 + }, + { + "epoch": 1.3344864289515699, + "grad_norm": 0.28931617736816406, + "learning_rate": 1.6858411591614016e-07, + "loss": 0.1855, + "step": 5015 + }, + { + "epoch": 1.3347525279403938, + "grad_norm": 0.26501137018203735, + "learning_rate": 1.6857182433480119e-07, + "loss": 0.1776, + "step": 5016 + }, + { + "epoch": 1.3350186269292177, + "grad_norm": 0.2650046944618225, + "learning_rate": 1.6855953079765446e-07, + "loss": 0.1938, + "step": 5017 + }, + { + "epoch": 1.3352847259180416, + "grad_norm": 0.29255911707878113, + "learning_rate": 1.6854723530505054e-07, + "loss": 0.2026, + "step": 5018 + }, + { + "epoch": 1.3355508249068653, + "grad_norm": 0.3650743365287781, + "learning_rate": 1.6853493785734023e-07, + "loss": 0.1906, + "step": 5019 + }, + { + "epoch": 1.3358169238956892, + "grad_norm": 0.28926870226860046, + "learning_rate": 1.685226384548742e-07, + "loss": 0.2092, + "step": 5020 + }, + { + "epoch": 1.336083022884513, + "grad_norm": 0.2664541006088257, + "learning_rate": 1.6851033709800327e-07, + "loss": 0.2011, + "step": 5021 + }, + { + "epoch": 1.336349121873337, + "grad_norm": 0.2544054090976715, + "learning_rate": 1.6849803378707833e-07, + "loss": 0.1794, + "step": 5022 + }, + { + "epoch": 1.3366152208621607, + "grad_norm": 0.3429318964481354, + "learning_rate": 1.6848572852245022e-07, + "loss": 0.194, + "step": 5023 + }, + { + "epoch": 1.3368813198509846, + "grad_norm": 0.35348764061927795, + "learning_rate": 1.6847342130447e-07, + "loss": 0.211, + "step": 5024 + }, + { + "epoch": 1.3371474188398085, + "grad_norm": 0.2715548872947693, + "learning_rate": 1.6846111213348866e-07, + "loss": 0.2017, + "step": 5025 + }, + { + "epoch": 1.3374135178286322, + "grad_norm": 0.23738743364810944, + "learning_rate": 1.6844880100985726e-07, + "loss": 0.176, + "step": 5026 + }, + { + "epoch": 1.337679616817456, + "grad_norm": 0.25180235505104065, + "learning_rate": 1.68436487933927e-07, + "loss": 0.1855, + "step": 5027 + }, + { + "epoch": 1.33794571580628, + "grad_norm": 0.3016161322593689, + "learning_rate": 1.68424172906049e-07, + "loss": 0.202, + "step": 5028 + }, + { + "epoch": 1.3382118147951036, + "grad_norm": 0.3436412513256073, + "learning_rate": 1.6841185592657457e-07, + "loss": 0.1911, + "step": 5029 + }, + { + "epoch": 1.3384779137839276, + "grad_norm": 0.3644704222679138, + "learning_rate": 1.6839953699585495e-07, + "loss": 0.2133, + "step": 5030 + }, + { + "epoch": 1.3387440127727515, + "grad_norm": 0.28583091497421265, + "learning_rate": 1.6838721611424158e-07, + "loss": 0.1882, + "step": 5031 + }, + { + "epoch": 1.3390101117615754, + "grad_norm": 0.37265172600746155, + "learning_rate": 1.6837489328208583e-07, + "loss": 0.2154, + "step": 5032 + }, + { + "epoch": 1.3392762107503993, + "grad_norm": 0.2615170478820801, + "learning_rate": 1.6836256849973917e-07, + "loss": 0.1849, + "step": 5033 + }, + { + "epoch": 1.339542309739223, + "grad_norm": 0.40061062574386597, + "learning_rate": 1.6835024176755312e-07, + "loss": 0.198, + "step": 5034 + }, + { + "epoch": 1.3398084087280469, + "grad_norm": 0.26325732469558716, + "learning_rate": 1.683379130858793e-07, + "loss": 0.1833, + "step": 5035 + }, + { + "epoch": 1.3400745077168708, + "grad_norm": 0.28018054366111755, + "learning_rate": 1.6832558245506934e-07, + "loss": 0.2025, + "step": 5036 + }, + { + "epoch": 1.3403406067056944, + "grad_norm": 0.4017902910709381, + "learning_rate": 1.683132498754749e-07, + "loss": 0.2068, + "step": 5037 + }, + { + "epoch": 1.3406067056945183, + "grad_norm": 0.38808730244636536, + "learning_rate": 1.6830091534744778e-07, + "loss": 0.1901, + "step": 5038 + }, + { + "epoch": 1.3408728046833422, + "grad_norm": 0.35957473516464233, + "learning_rate": 1.6828857887133976e-07, + "loss": 0.202, + "step": 5039 + }, + { + "epoch": 1.341138903672166, + "grad_norm": 0.42700979113578796, + "learning_rate": 1.682762404475027e-07, + "loss": 0.1787, + "step": 5040 + }, + { + "epoch": 1.3414050026609898, + "grad_norm": 0.3930888772010803, + "learning_rate": 1.6826390007628848e-07, + "loss": 0.1912, + "step": 5041 + }, + { + "epoch": 1.3416711016498137, + "grad_norm": 0.29633256793022156, + "learning_rate": 1.6825155775804914e-07, + "loss": 0.1994, + "step": 5042 + }, + { + "epoch": 1.3419372006386376, + "grad_norm": 0.29336151480674744, + "learning_rate": 1.6823921349313669e-07, + "loss": 0.194, + "step": 5043 + }, + { + "epoch": 1.3422032996274615, + "grad_norm": 0.2711072564125061, + "learning_rate": 1.6822686728190316e-07, + "loss": 0.1932, + "step": 5044 + }, + { + "epoch": 1.3424693986162852, + "grad_norm": 0.35358718037605286, + "learning_rate": 1.6821451912470078e-07, + "loss": 0.1994, + "step": 5045 + }, + { + "epoch": 1.3427354976051091, + "grad_norm": 0.39131179451942444, + "learning_rate": 1.6820216902188168e-07, + "loss": 0.213, + "step": 5046 + }, + { + "epoch": 1.343001596593933, + "grad_norm": 0.3407585322856903, + "learning_rate": 1.6818981697379812e-07, + "loss": 0.1946, + "step": 5047 + }, + { + "epoch": 1.3432676955827567, + "grad_norm": 0.3115473985671997, + "learning_rate": 1.6817746298080242e-07, + "loss": 0.2061, + "step": 5048 + }, + { + "epoch": 1.3435337945715806, + "grad_norm": 0.37860599160194397, + "learning_rate": 1.6816510704324692e-07, + "loss": 0.2041, + "step": 5049 + }, + { + "epoch": 1.3437998935604045, + "grad_norm": 0.28976795077323914, + "learning_rate": 1.6815274916148408e-07, + "loss": 0.1981, + "step": 5050 + }, + { + "epoch": 1.3440659925492282, + "grad_norm": 0.2679380476474762, + "learning_rate": 1.681403893358663e-07, + "loss": 0.1872, + "step": 5051 + }, + { + "epoch": 1.344332091538052, + "grad_norm": 0.2702246308326721, + "learning_rate": 1.6812802756674617e-07, + "loss": 0.198, + "step": 5052 + }, + { + "epoch": 1.344598190526876, + "grad_norm": 0.29143571853637695, + "learning_rate": 1.6811566385447624e-07, + "loss": 0.2002, + "step": 5053 + }, + { + "epoch": 1.3448642895157, + "grad_norm": 0.4117394983768463, + "learning_rate": 1.681032981994092e-07, + "loss": 0.1938, + "step": 5054 + }, + { + "epoch": 1.3451303885045236, + "grad_norm": 0.3123292326927185, + "learning_rate": 1.6809093060189765e-07, + "loss": 0.2017, + "step": 5055 + }, + { + "epoch": 1.3453964874933475, + "grad_norm": 0.33895137906074524, + "learning_rate": 1.6807856106229438e-07, + "loss": 0.1897, + "step": 5056 + }, + { + "epoch": 1.3456625864821714, + "grad_norm": 0.28095299005508423, + "learning_rate": 1.6806618958095228e-07, + "loss": 0.1811, + "step": 5057 + }, + { + "epoch": 1.3459286854709953, + "grad_norm": 0.25623080134391785, + "learning_rate": 1.6805381615822406e-07, + "loss": 0.1886, + "step": 5058 + }, + { + "epoch": 1.346194784459819, + "grad_norm": 0.38838791847229004, + "learning_rate": 1.6804144079446274e-07, + "loss": 0.2099, + "step": 5059 + }, + { + "epoch": 1.346460883448643, + "grad_norm": 0.28948289155960083, + "learning_rate": 1.6802906349002126e-07, + "loss": 0.1861, + "step": 5060 + }, + { + "epoch": 1.3467269824374668, + "grad_norm": 0.3230811059474945, + "learning_rate": 1.6801668424525265e-07, + "loss": 0.2054, + "step": 5061 + }, + { + "epoch": 1.3469930814262905, + "grad_norm": 0.25741997361183167, + "learning_rate": 1.6800430306051e-07, + "loss": 0.1879, + "step": 5062 + }, + { + "epoch": 1.3472591804151144, + "grad_norm": 0.28569790720939636, + "learning_rate": 1.6799191993614646e-07, + "loss": 0.2085, + "step": 5063 + }, + { + "epoch": 1.3475252794039383, + "grad_norm": 0.37271785736083984, + "learning_rate": 1.6797953487251512e-07, + "loss": 0.1936, + "step": 5064 + }, + { + "epoch": 1.347791378392762, + "grad_norm": 0.41509711742401123, + "learning_rate": 1.6796714786996937e-07, + "loss": 0.2107, + "step": 5065 + }, + { + "epoch": 1.3480574773815859, + "grad_norm": 0.25460416078567505, + "learning_rate": 1.6795475892886244e-07, + "loss": 0.1752, + "step": 5066 + }, + { + "epoch": 1.3483235763704098, + "grad_norm": 0.297002375125885, + "learning_rate": 1.6794236804954766e-07, + "loss": 0.1726, + "step": 5067 + }, + { + "epoch": 1.3485896753592337, + "grad_norm": 0.3506621718406677, + "learning_rate": 1.679299752323785e-07, + "loss": 0.1963, + "step": 5068 + }, + { + "epoch": 1.3488557743480576, + "grad_norm": 0.3444822132587433, + "learning_rate": 1.6791758047770842e-07, + "loss": 0.1969, + "step": 5069 + }, + { + "epoch": 1.3491218733368813, + "grad_norm": 0.28881609439849854, + "learning_rate": 1.679051837858909e-07, + "loss": 0.2058, + "step": 5070 + }, + { + "epoch": 1.3493879723257052, + "grad_norm": 0.27637171745300293, + "learning_rate": 1.6789278515727956e-07, + "loss": 0.199, + "step": 5071 + }, + { + "epoch": 1.349654071314529, + "grad_norm": 0.40108171105384827, + "learning_rate": 1.6788038459222805e-07, + "loss": 0.2102, + "step": 5072 + }, + { + "epoch": 1.3499201703033528, + "grad_norm": 0.27443307638168335, + "learning_rate": 1.6786798209109e-07, + "loss": 0.1929, + "step": 5073 + }, + { + "epoch": 1.3501862692921767, + "grad_norm": 0.3280278742313385, + "learning_rate": 1.678555776542192e-07, + "loss": 0.1944, + "step": 5074 + }, + { + "epoch": 1.3504523682810006, + "grad_norm": 0.3081546425819397, + "learning_rate": 1.6784317128196942e-07, + "loss": 0.1962, + "step": 5075 + }, + { + "epoch": 1.3507184672698243, + "grad_norm": 0.39832285046577454, + "learning_rate": 1.6783076297469455e-07, + "loss": 0.2109, + "step": 5076 + }, + { + "epoch": 1.3509845662586482, + "grad_norm": 0.24649451673030853, + "learning_rate": 1.6781835273274846e-07, + "loss": 0.1598, + "step": 5077 + }, + { + "epoch": 1.351250665247472, + "grad_norm": 0.26552310585975647, + "learning_rate": 1.6780594055648517e-07, + "loss": 0.1764, + "step": 5078 + }, + { + "epoch": 1.351516764236296, + "grad_norm": 0.3469849228858948, + "learning_rate": 1.6779352644625862e-07, + "loss": 0.1929, + "step": 5079 + }, + { + "epoch": 1.3517828632251199, + "grad_norm": 0.25492095947265625, + "learning_rate": 1.6778111040242298e-07, + "loss": 0.1927, + "step": 5080 + }, + { + "epoch": 1.3520489622139436, + "grad_norm": 0.34136858582496643, + "learning_rate": 1.677686924253323e-07, + "loss": 0.1948, + "step": 5081 + }, + { + "epoch": 1.3523150612027675, + "grad_norm": 0.3528984785079956, + "learning_rate": 1.677562725153408e-07, + "loss": 0.1863, + "step": 5082 + }, + { + "epoch": 1.3525811601915914, + "grad_norm": 0.2832793593406677, + "learning_rate": 1.6774385067280275e-07, + "loss": 0.191, + "step": 5083 + }, + { + "epoch": 1.352847259180415, + "grad_norm": 0.35438182950019836, + "learning_rate": 1.677314268980724e-07, + "loss": 0.1817, + "step": 5084 + }, + { + "epoch": 1.353113358169239, + "grad_norm": 0.27677348256111145, + "learning_rate": 1.6771900119150412e-07, + "loss": 0.1978, + "step": 5085 + }, + { + "epoch": 1.3533794571580628, + "grad_norm": 0.273336797952652, + "learning_rate": 1.677065735534523e-07, + "loss": 0.2118, + "step": 5086 + }, + { + "epoch": 1.3536455561468865, + "grad_norm": 0.259688138961792, + "learning_rate": 1.6769414398427143e-07, + "loss": 0.1867, + "step": 5087 + }, + { + "epoch": 1.3539116551357104, + "grad_norm": 0.2767431437969208, + "learning_rate": 1.67681712484316e-07, + "loss": 0.2065, + "step": 5088 + }, + { + "epoch": 1.3541777541245343, + "grad_norm": 0.33372247219085693, + "learning_rate": 1.676692790539406e-07, + "loss": 0.1957, + "step": 5089 + }, + { + "epoch": 1.3544438531133582, + "grad_norm": 0.4155483543872833, + "learning_rate": 1.6765684369349988e-07, + "loss": 0.1754, + "step": 5090 + }, + { + "epoch": 1.3547099521021821, + "grad_norm": 0.44397321343421936, + "learning_rate": 1.6764440640334844e-07, + "loss": 0.2056, + "step": 5091 + }, + { + "epoch": 1.3549760510910058, + "grad_norm": 0.3636660873889923, + "learning_rate": 1.676319671838411e-07, + "loss": 0.2014, + "step": 5092 + }, + { + "epoch": 1.3552421500798297, + "grad_norm": 0.3323994576931, + "learning_rate": 1.6761952603533262e-07, + "loss": 0.1927, + "step": 5093 + }, + { + "epoch": 1.3555082490686536, + "grad_norm": 0.31368154287338257, + "learning_rate": 1.6760708295817784e-07, + "loss": 0.203, + "step": 5094 + }, + { + "epoch": 1.3557743480574773, + "grad_norm": 0.266703724861145, + "learning_rate": 1.6759463795273167e-07, + "loss": 0.183, + "step": 5095 + }, + { + "epoch": 1.3560404470463012, + "grad_norm": 0.30031898617744446, + "learning_rate": 1.6758219101934907e-07, + "loss": 0.1955, + "step": 5096 + }, + { + "epoch": 1.3563065460351251, + "grad_norm": 0.313236802816391, + "learning_rate": 1.6756974215838505e-07, + "loss": 0.1742, + "step": 5097 + }, + { + "epoch": 1.3565726450239488, + "grad_norm": 0.2867322862148285, + "learning_rate": 1.675572913701947e-07, + "loss": 0.1939, + "step": 5098 + }, + { + "epoch": 1.3568387440127727, + "grad_norm": 0.2782001495361328, + "learning_rate": 1.6754483865513306e-07, + "loss": 0.1722, + "step": 5099 + }, + { + "epoch": 1.3571048430015966, + "grad_norm": 0.3765548765659332, + "learning_rate": 1.675323840135554e-07, + "loss": 0.1939, + "step": 5100 + }, + { + "epoch": 1.3573709419904203, + "grad_norm": 0.3852504789829254, + "learning_rate": 1.6751992744581693e-07, + "loss": 0.2106, + "step": 5101 + }, + { + "epoch": 1.3576370409792442, + "grad_norm": 0.2573069632053375, + "learning_rate": 1.675074689522729e-07, + "loss": 0.1962, + "step": 5102 + }, + { + "epoch": 1.357903139968068, + "grad_norm": 0.2632007300853729, + "learning_rate": 1.674950085332787e-07, + "loss": 0.1789, + "step": 5103 + }, + { + "epoch": 1.358169238956892, + "grad_norm": 0.3097168505191803, + "learning_rate": 1.6748254618918967e-07, + "loss": 0.2089, + "step": 5104 + }, + { + "epoch": 1.358435337945716, + "grad_norm": 0.26602837443351746, + "learning_rate": 1.6747008192036128e-07, + "loss": 0.1821, + "step": 5105 + }, + { + "epoch": 1.3587014369345396, + "grad_norm": 0.28558263182640076, + "learning_rate": 1.674576157271491e-07, + "loss": 0.1938, + "step": 5106 + }, + { + "epoch": 1.3589675359233635, + "grad_norm": 0.2618442177772522, + "learning_rate": 1.674451476099086e-07, + "loss": 0.187, + "step": 5107 + }, + { + "epoch": 1.3592336349121874, + "grad_norm": 0.3573010265827179, + "learning_rate": 1.6743267756899545e-07, + "loss": 0.2068, + "step": 5108 + }, + { + "epoch": 1.359499733901011, + "grad_norm": 0.3531862795352936, + "learning_rate": 1.6742020560476527e-07, + "loss": 0.2128, + "step": 5109 + }, + { + "epoch": 1.359765832889835, + "grad_norm": 0.39006417989730835, + "learning_rate": 1.6740773171757388e-07, + "loss": 0.2101, + "step": 5110 + }, + { + "epoch": 1.360031931878659, + "grad_norm": 0.24795092642307281, + "learning_rate": 1.6739525590777696e-07, + "loss": 0.1785, + "step": 5111 + }, + { + "epoch": 1.3602980308674826, + "grad_norm": 0.2977757453918457, + "learning_rate": 1.673827781757304e-07, + "loss": 0.1907, + "step": 5112 + }, + { + "epoch": 1.3605641298563065, + "grad_norm": 0.3482801914215088, + "learning_rate": 1.673702985217901e-07, + "loss": 0.1995, + "step": 5113 + }, + { + "epoch": 1.3608302288451304, + "grad_norm": 0.25931429862976074, + "learning_rate": 1.6735781694631194e-07, + "loss": 0.1982, + "step": 5114 + }, + { + "epoch": 1.3610963278339543, + "grad_norm": 0.3630266785621643, + "learning_rate": 1.6734533344965198e-07, + "loss": 0.2221, + "step": 5115 + }, + { + "epoch": 1.3613624268227782, + "grad_norm": 0.2598949074745178, + "learning_rate": 1.6733284803216625e-07, + "loss": 0.1775, + "step": 5116 + }, + { + "epoch": 1.3616285258116019, + "grad_norm": 0.36576414108276367, + "learning_rate": 1.673203606942109e-07, + "loss": 0.2095, + "step": 5117 + }, + { + "epoch": 1.3618946248004258, + "grad_norm": 0.2481842339038849, + "learning_rate": 1.6730787143614204e-07, + "loss": 0.1681, + "step": 5118 + }, + { + "epoch": 1.3621607237892497, + "grad_norm": 0.2752997577190399, + "learning_rate": 1.672953802583159e-07, + "loss": 0.203, + "step": 5119 + }, + { + "epoch": 1.3624268227780734, + "grad_norm": 0.2888612747192383, + "learning_rate": 1.6728288716108877e-07, + "loss": 0.2062, + "step": 5120 + }, + { + "epoch": 1.3626929217668973, + "grad_norm": 0.30848199129104614, + "learning_rate": 1.67270392144817e-07, + "loss": 0.1865, + "step": 5121 + }, + { + "epoch": 1.3629590207557212, + "grad_norm": 0.2627262473106384, + "learning_rate": 1.672578952098569e-07, + "loss": 0.1953, + "step": 5122 + }, + { + "epoch": 1.3632251197445449, + "grad_norm": 0.6919583678245544, + "learning_rate": 1.67245396356565e-07, + "loss": 0.2047, + "step": 5123 + }, + { + "epoch": 1.3634912187333688, + "grad_norm": 0.36024102568626404, + "learning_rate": 1.6723289558529774e-07, + "loss": 0.2163, + "step": 5124 + }, + { + "epoch": 1.3637573177221927, + "grad_norm": 0.28053775429725647, + "learning_rate": 1.6722039289641164e-07, + "loss": 0.2006, + "step": 5125 + }, + { + "epoch": 1.3640234167110166, + "grad_norm": 0.27475351095199585, + "learning_rate": 1.6720788829026336e-07, + "loss": 0.19, + "step": 5126 + }, + { + "epoch": 1.3642895156998405, + "grad_norm": 0.3262397050857544, + "learning_rate": 1.6719538176720953e-07, + "loss": 0.2103, + "step": 5127 + }, + { + "epoch": 1.3645556146886642, + "grad_norm": 0.40049880743026733, + "learning_rate": 1.6718287332760686e-07, + "loss": 0.2086, + "step": 5128 + }, + { + "epoch": 1.364821713677488, + "grad_norm": 0.2722965478897095, + "learning_rate": 1.6717036297181212e-07, + "loss": 0.205, + "step": 5129 + }, + { + "epoch": 1.365087812666312, + "grad_norm": 0.29008179903030396, + "learning_rate": 1.6715785070018216e-07, + "loss": 0.1832, + "step": 5130 + }, + { + "epoch": 1.3653539116551356, + "grad_norm": 0.2706787884235382, + "learning_rate": 1.6714533651307378e-07, + "loss": 0.1909, + "step": 5131 + }, + { + "epoch": 1.3656200106439595, + "grad_norm": 0.2956831157207489, + "learning_rate": 1.67132820410844e-07, + "loss": 0.205, + "step": 5132 + }, + { + "epoch": 1.3658861096327835, + "grad_norm": 0.2768319547176361, + "learning_rate": 1.6712030239384971e-07, + "loss": 0.1937, + "step": 5133 + }, + { + "epoch": 1.3661522086216071, + "grad_norm": 0.35916754603385925, + "learning_rate": 1.6710778246244806e-07, + "loss": 0.2054, + "step": 5134 + }, + { + "epoch": 1.366418307610431, + "grad_norm": 0.2601218819618225, + "learning_rate": 1.6709526061699602e-07, + "loss": 0.1855, + "step": 5135 + }, + { + "epoch": 1.366684406599255, + "grad_norm": 0.24426454305648804, + "learning_rate": 1.6708273685785086e-07, + "loss": 0.1726, + "step": 5136 + }, + { + "epoch": 1.3669505055880788, + "grad_norm": 0.2564459443092346, + "learning_rate": 1.670702111853697e-07, + "loss": 0.1658, + "step": 5137 + }, + { + "epoch": 1.3672166045769025, + "grad_norm": 0.3768395185470581, + "learning_rate": 1.6705768359990984e-07, + "loss": 0.2106, + "step": 5138 + }, + { + "epoch": 1.3674827035657264, + "grad_norm": 0.2621629238128662, + "learning_rate": 1.6704515410182855e-07, + "loss": 0.1844, + "step": 5139 + }, + { + "epoch": 1.3677488025545503, + "grad_norm": 0.2831583023071289, + "learning_rate": 1.6703262269148325e-07, + "loss": 0.1996, + "step": 5140 + }, + { + "epoch": 1.3680149015433742, + "grad_norm": 0.2751041650772095, + "learning_rate": 1.670200893692313e-07, + "loss": 0.173, + "step": 5141 + }, + { + "epoch": 1.368281000532198, + "grad_norm": 0.26284560561180115, + "learning_rate": 1.6700755413543025e-07, + "loss": 0.1946, + "step": 5142 + }, + { + "epoch": 1.3685470995210218, + "grad_norm": 0.27300435304641724, + "learning_rate": 1.6699501699043756e-07, + "loss": 0.1849, + "step": 5143 + }, + { + "epoch": 1.3688131985098457, + "grad_norm": 0.26362183690071106, + "learning_rate": 1.6698247793461088e-07, + "loss": 0.1889, + "step": 5144 + }, + { + "epoch": 1.3690792974986694, + "grad_norm": 0.28602707386016846, + "learning_rate": 1.6696993696830776e-07, + "loss": 0.2073, + "step": 5145 + }, + { + "epoch": 1.3693453964874933, + "grad_norm": 0.2599249482154846, + "learning_rate": 1.6695739409188598e-07, + "loss": 0.1921, + "step": 5146 + }, + { + "epoch": 1.3696114954763172, + "grad_norm": 0.2546827793121338, + "learning_rate": 1.6694484930570327e-07, + "loss": 0.1933, + "step": 5147 + }, + { + "epoch": 1.369877594465141, + "grad_norm": 0.3764136731624603, + "learning_rate": 1.6693230261011742e-07, + "loss": 0.1828, + "step": 5148 + }, + { + "epoch": 1.3701436934539648, + "grad_norm": 0.2783491611480713, + "learning_rate": 1.6691975400548626e-07, + "loss": 0.2008, + "step": 5149 + }, + { + "epoch": 1.3704097924427887, + "grad_norm": 0.2692537009716034, + "learning_rate": 1.6690720349216775e-07, + "loss": 0.188, + "step": 5150 + }, + { + "epoch": 1.3706758914316126, + "grad_norm": 0.2683783769607544, + "learning_rate": 1.668946510705198e-07, + "loss": 0.1877, + "step": 5151 + }, + { + "epoch": 1.3709419904204365, + "grad_norm": 0.2914840877056122, + "learning_rate": 1.668820967409005e-07, + "loss": 0.1959, + "step": 5152 + }, + { + "epoch": 1.3712080894092602, + "grad_norm": 0.2979084253311157, + "learning_rate": 1.6686954050366788e-07, + "loss": 0.217, + "step": 5153 + }, + { + "epoch": 1.371474188398084, + "grad_norm": 0.2575119137763977, + "learning_rate": 1.6685698235918007e-07, + "loss": 0.1827, + "step": 5154 + }, + { + "epoch": 1.371740287386908, + "grad_norm": 0.2877916693687439, + "learning_rate": 1.6684442230779529e-07, + "loss": 0.1984, + "step": 5155 + }, + { + "epoch": 1.3720063863757317, + "grad_norm": 0.27592897415161133, + "learning_rate": 1.6683186034987174e-07, + "loss": 0.1924, + "step": 5156 + }, + { + "epoch": 1.3722724853645556, + "grad_norm": 0.4492034614086151, + "learning_rate": 1.6681929648576773e-07, + "loss": 0.2249, + "step": 5157 + }, + { + "epoch": 1.3725385843533795, + "grad_norm": 0.2642395496368408, + "learning_rate": 1.6680673071584157e-07, + "loss": 0.2029, + "step": 5158 + }, + { + "epoch": 1.3728046833422032, + "grad_norm": 0.2756316661834717, + "learning_rate": 1.6679416304045169e-07, + "loss": 0.1867, + "step": 5159 + }, + { + "epoch": 1.373070782331027, + "grad_norm": 0.37921538949012756, + "learning_rate": 1.6678159345995657e-07, + "loss": 0.1971, + "step": 5160 + }, + { + "epoch": 1.373336881319851, + "grad_norm": 0.27300357818603516, + "learning_rate": 1.667690219747147e-07, + "loss": 0.1993, + "step": 5161 + }, + { + "epoch": 1.373602980308675, + "grad_norm": 0.2593526244163513, + "learning_rate": 1.6675644858508464e-07, + "loss": 0.1905, + "step": 5162 + }, + { + "epoch": 1.3738690792974988, + "grad_norm": 0.3758106529712677, + "learning_rate": 1.6674387329142504e-07, + "loss": 0.2012, + "step": 5163 + }, + { + "epoch": 1.3741351782863225, + "grad_norm": 0.2820485532283783, + "learning_rate": 1.6673129609409448e-07, + "loss": 0.1763, + "step": 5164 + }, + { + "epoch": 1.3744012772751464, + "grad_norm": 0.36989104747772217, + "learning_rate": 1.6671871699345178e-07, + "loss": 0.2177, + "step": 5165 + }, + { + "epoch": 1.3746673762639703, + "grad_norm": 0.34989070892333984, + "learning_rate": 1.667061359898557e-07, + "loss": 0.186, + "step": 5166 + }, + { + "epoch": 1.374933475252794, + "grad_norm": 0.3763914704322815, + "learning_rate": 1.6669355308366506e-07, + "loss": 0.2152, + "step": 5167 + }, + { + "epoch": 1.3751995742416179, + "grad_norm": 0.26363450288772583, + "learning_rate": 1.6668096827523878e-07, + "loss": 0.1829, + "step": 5168 + }, + { + "epoch": 1.3754656732304418, + "grad_norm": 0.27471479773521423, + "learning_rate": 1.6666838156493577e-07, + "loss": 0.1883, + "step": 5169 + }, + { + "epoch": 1.3757317722192655, + "grad_norm": 0.24911315739154816, + "learning_rate": 1.6665579295311503e-07, + "loss": 0.1819, + "step": 5170 + }, + { + "epoch": 1.3759978712080894, + "grad_norm": 0.28417861461639404, + "learning_rate": 1.6664320244013564e-07, + "loss": 0.2149, + "step": 5171 + }, + { + "epoch": 1.3762639701969133, + "grad_norm": 0.3486214578151703, + "learning_rate": 1.6663061002635667e-07, + "loss": 0.1966, + "step": 5172 + }, + { + "epoch": 1.3765300691857372, + "grad_norm": 0.30009138584136963, + "learning_rate": 1.666180157121373e-07, + "loss": 0.1883, + "step": 5173 + }, + { + "epoch": 1.376796168174561, + "grad_norm": 0.29058435559272766, + "learning_rate": 1.6660541949783677e-07, + "loss": 0.1979, + "step": 5174 + }, + { + "epoch": 1.3770622671633848, + "grad_norm": 0.3104119300842285, + "learning_rate": 1.6659282138381432e-07, + "loss": 0.2021, + "step": 5175 + }, + { + "epoch": 1.3773283661522087, + "grad_norm": 0.2525884509086609, + "learning_rate": 1.6658022137042927e-07, + "loss": 0.1879, + "step": 5176 + }, + { + "epoch": 1.3775944651410326, + "grad_norm": 0.24543151259422302, + "learning_rate": 1.6656761945804104e-07, + "loss": 0.1834, + "step": 5177 + }, + { + "epoch": 1.3778605641298562, + "grad_norm": 0.28501009941101074, + "learning_rate": 1.6655501564700898e-07, + "loss": 0.199, + "step": 5178 + }, + { + "epoch": 1.3781266631186802, + "grad_norm": 0.2959047555923462, + "learning_rate": 1.665424099376927e-07, + "loss": 0.1965, + "step": 5179 + }, + { + "epoch": 1.378392762107504, + "grad_norm": 0.3841138184070587, + "learning_rate": 1.665298023304516e-07, + "loss": 0.1982, + "step": 5180 + }, + { + "epoch": 1.3786588610963277, + "grad_norm": 0.25485673546791077, + "learning_rate": 1.6651719282564538e-07, + "loss": 0.1869, + "step": 5181 + }, + { + "epoch": 1.3789249600851516, + "grad_norm": 0.5269164443016052, + "learning_rate": 1.665045814236336e-07, + "loss": 0.1891, + "step": 5182 + }, + { + "epoch": 1.3791910590739755, + "grad_norm": 0.2528325319290161, + "learning_rate": 1.6649196812477608e-07, + "loss": 0.1789, + "step": 5183 + }, + { + "epoch": 1.3794571580627992, + "grad_norm": 0.2931307256221771, + "learning_rate": 1.664793529294325e-07, + "loss": 0.1963, + "step": 5184 + }, + { + "epoch": 1.3797232570516231, + "grad_norm": 0.36219480633735657, + "learning_rate": 1.6646673583796265e-07, + "loss": 0.2065, + "step": 5185 + }, + { + "epoch": 1.379989356040447, + "grad_norm": 0.3641359210014343, + "learning_rate": 1.6645411685072642e-07, + "loss": 0.2073, + "step": 5186 + }, + { + "epoch": 1.380255455029271, + "grad_norm": 0.5429235100746155, + "learning_rate": 1.6644149596808377e-07, + "loss": 0.2195, + "step": 5187 + }, + { + "epoch": 1.3805215540180948, + "grad_norm": 0.2605046033859253, + "learning_rate": 1.6642887319039461e-07, + "loss": 0.1849, + "step": 5188 + }, + { + "epoch": 1.3807876530069185, + "grad_norm": 0.2571684420108795, + "learning_rate": 1.6641624851801904e-07, + "loss": 0.1837, + "step": 5189 + }, + { + "epoch": 1.3810537519957424, + "grad_norm": 0.2682628035545349, + "learning_rate": 1.6640362195131706e-07, + "loss": 0.1884, + "step": 5190 + }, + { + "epoch": 1.3813198509845663, + "grad_norm": 0.26386570930480957, + "learning_rate": 1.6639099349064886e-07, + "loss": 0.1831, + "step": 5191 + }, + { + "epoch": 1.38158594997339, + "grad_norm": 0.27235665917396545, + "learning_rate": 1.663783631363746e-07, + "loss": 0.1829, + "step": 5192 + }, + { + "epoch": 1.381852048962214, + "grad_norm": 0.2562609910964966, + "learning_rate": 1.6636573088885456e-07, + "loss": 0.1858, + "step": 5193 + }, + { + "epoch": 1.3821181479510378, + "grad_norm": 0.33752354979515076, + "learning_rate": 1.66353096748449e-07, + "loss": 0.2155, + "step": 5194 + }, + { + "epoch": 1.3823842469398615, + "grad_norm": 0.2656906247138977, + "learning_rate": 1.663404607155183e-07, + "loss": 0.1894, + "step": 5195 + }, + { + "epoch": 1.3826503459286854, + "grad_norm": 0.3588109016418457, + "learning_rate": 1.6632782279042285e-07, + "loss": 0.1942, + "step": 5196 + }, + { + "epoch": 1.3829164449175093, + "grad_norm": 0.4141463339328766, + "learning_rate": 1.6631518297352307e-07, + "loss": 0.2011, + "step": 5197 + }, + { + "epoch": 1.3831825439063332, + "grad_norm": 0.30198249220848083, + "learning_rate": 1.6630254126517953e-07, + "loss": 0.1879, + "step": 5198 + }, + { + "epoch": 1.3834486428951571, + "grad_norm": 0.2601830065250397, + "learning_rate": 1.662898976657528e-07, + "loss": 0.1801, + "step": 5199 + }, + { + "epoch": 1.3837147418839808, + "grad_norm": 0.25619426369667053, + "learning_rate": 1.6627725217560347e-07, + "loss": 0.19, + "step": 5200 + }, + { + "epoch": 1.3839808408728047, + "grad_norm": 0.33272165060043335, + "learning_rate": 1.662646047950922e-07, + "loss": 0.1826, + "step": 5201 + }, + { + "epoch": 1.3842469398616286, + "grad_norm": 0.36095499992370605, + "learning_rate": 1.6625195552457978e-07, + "loss": 0.2079, + "step": 5202 + }, + { + "epoch": 1.3845130388504523, + "grad_norm": 0.4829521179199219, + "learning_rate": 1.6623930436442693e-07, + "loss": 0.2168, + "step": 5203 + }, + { + "epoch": 1.3847791378392762, + "grad_norm": 0.2828689515590668, + "learning_rate": 1.6622665131499453e-07, + "loss": 0.2049, + "step": 5204 + }, + { + "epoch": 1.3850452368281, + "grad_norm": 0.2789875268936157, + "learning_rate": 1.6621399637664346e-07, + "loss": 0.2029, + "step": 5205 + }, + { + "epoch": 1.3853113358169238, + "grad_norm": 0.3400018513202667, + "learning_rate": 1.6620133954973466e-07, + "loss": 0.1937, + "step": 5206 + }, + { + "epoch": 1.3855774348057477, + "grad_norm": 0.2518065869808197, + "learning_rate": 1.661886808346291e-07, + "loss": 0.1621, + "step": 5207 + }, + { + "epoch": 1.3858435337945716, + "grad_norm": 0.3396887481212616, + "learning_rate": 1.661760202316879e-07, + "loss": 0.1714, + "step": 5208 + }, + { + "epoch": 1.3861096327833955, + "grad_norm": 0.7180114984512329, + "learning_rate": 1.6616335774127212e-07, + "loss": 0.1739, + "step": 5209 + }, + { + "epoch": 1.3863757317722194, + "grad_norm": 0.2674158811569214, + "learning_rate": 1.6615069336374293e-07, + "loss": 0.2007, + "step": 5210 + }, + { + "epoch": 1.386641830761043, + "grad_norm": 0.37445390224456787, + "learning_rate": 1.6613802709946151e-07, + "loss": 0.2041, + "step": 5211 + }, + { + "epoch": 1.386907929749867, + "grad_norm": 0.2767443060874939, + "learning_rate": 1.661253589487892e-07, + "loss": 0.2016, + "step": 5212 + }, + { + "epoch": 1.387174028738691, + "grad_norm": 0.2731093764305115, + "learning_rate": 1.6611268891208725e-07, + "loss": 0.1918, + "step": 5213 + }, + { + "epoch": 1.3874401277275146, + "grad_norm": 0.3812277615070343, + "learning_rate": 1.6610001698971707e-07, + "loss": 0.1778, + "step": 5214 + }, + { + "epoch": 1.3877062267163385, + "grad_norm": 0.3335045874118805, + "learning_rate": 1.6608734318204008e-07, + "loss": 0.2038, + "step": 5215 + }, + { + "epoch": 1.3879723257051624, + "grad_norm": 0.2836548686027527, + "learning_rate": 1.6607466748941777e-07, + "loss": 0.1955, + "step": 5216 + }, + { + "epoch": 1.388238424693986, + "grad_norm": 0.2592799961566925, + "learning_rate": 1.6606198991221166e-07, + "loss": 0.1833, + "step": 5217 + }, + { + "epoch": 1.38850452368281, + "grad_norm": 0.27898016571998596, + "learning_rate": 1.660493104507834e-07, + "loss": 0.1978, + "step": 5218 + }, + { + "epoch": 1.3887706226716339, + "grad_norm": 0.33587419986724854, + "learning_rate": 1.6603662910549456e-07, + "loss": 0.2094, + "step": 5219 + }, + { + "epoch": 1.3890367216604576, + "grad_norm": 0.3073660731315613, + "learning_rate": 1.6602394587670686e-07, + "loss": 0.1928, + "step": 5220 + }, + { + "epoch": 1.3893028206492815, + "grad_norm": 0.47461557388305664, + "learning_rate": 1.6601126076478206e-07, + "loss": 0.2183, + "step": 5221 + }, + { + "epoch": 1.3895689196381054, + "grad_norm": 0.5287013053894043, + "learning_rate": 1.6599857377008198e-07, + "loss": 0.2235, + "step": 5222 + }, + { + "epoch": 1.3898350186269293, + "grad_norm": 0.26494741439819336, + "learning_rate": 1.659858848929684e-07, + "loss": 0.1726, + "step": 5223 + }, + { + "epoch": 1.3901011176157532, + "grad_norm": 0.25006479024887085, + "learning_rate": 1.6597319413380335e-07, + "loss": 0.1846, + "step": 5224 + }, + { + "epoch": 1.3903672166045768, + "grad_norm": 0.26913484930992126, + "learning_rate": 1.6596050149294872e-07, + "loss": 0.1792, + "step": 5225 + }, + { + "epoch": 1.3906333155934008, + "grad_norm": 0.39563947916030884, + "learning_rate": 1.6594780697076652e-07, + "loss": 0.2092, + "step": 5226 + }, + { + "epoch": 1.3908994145822247, + "grad_norm": 0.3332732021808624, + "learning_rate": 1.6593511056761892e-07, + "loss": 0.1936, + "step": 5227 + }, + { + "epoch": 1.3911655135710483, + "grad_norm": 0.3665243089199066, + "learning_rate": 1.6592241228386793e-07, + "loss": 0.2152, + "step": 5228 + }, + { + "epoch": 1.3914316125598722, + "grad_norm": 0.33555325865745544, + "learning_rate": 1.659097121198758e-07, + "loss": 0.2037, + "step": 5229 + }, + { + "epoch": 1.3916977115486961, + "grad_norm": 0.2699471712112427, + "learning_rate": 1.6589701007600475e-07, + "loss": 0.1843, + "step": 5230 + }, + { + "epoch": 1.3919638105375198, + "grad_norm": 0.2561134696006775, + "learning_rate": 1.6588430615261705e-07, + "loss": 0.1751, + "step": 5231 + }, + { + "epoch": 1.3922299095263437, + "grad_norm": 0.27990278601646423, + "learning_rate": 1.658716003500751e-07, + "loss": 0.1989, + "step": 5232 + }, + { + "epoch": 1.3924960085151676, + "grad_norm": 0.27740466594696045, + "learning_rate": 1.6585889266874115e-07, + "loss": 0.1992, + "step": 5233 + }, + { + "epoch": 1.3927621075039915, + "grad_norm": 0.31445634365081787, + "learning_rate": 1.6584618310897782e-07, + "loss": 0.1841, + "step": 5234 + }, + { + "epoch": 1.3930282064928154, + "grad_norm": 0.2840164005756378, + "learning_rate": 1.6583347167114755e-07, + "loss": 0.1827, + "step": 5235 + }, + { + "epoch": 1.3932943054816391, + "grad_norm": 0.579523503780365, + "learning_rate": 1.6582075835561287e-07, + "loss": 0.1897, + "step": 5236 + }, + { + "epoch": 1.393560404470463, + "grad_norm": 0.3411690294742584, + "learning_rate": 1.658080431627364e-07, + "loss": 0.1879, + "step": 5237 + }, + { + "epoch": 1.393826503459287, + "grad_norm": 0.35265010595321655, + "learning_rate": 1.6579532609288082e-07, + "loss": 0.1732, + "step": 5238 + }, + { + "epoch": 1.3940926024481106, + "grad_norm": 0.25862863659858704, + "learning_rate": 1.6578260714640882e-07, + "loss": 0.1944, + "step": 5239 + }, + { + "epoch": 1.3943587014369345, + "grad_norm": 0.28999099135398865, + "learning_rate": 1.6576988632368323e-07, + "loss": 0.1932, + "step": 5240 + }, + { + "epoch": 1.3946248004257584, + "grad_norm": 0.3415851891040802, + "learning_rate": 1.657571636250668e-07, + "loss": 0.1837, + "step": 5241 + }, + { + "epoch": 1.394890899414582, + "grad_norm": 0.27033987641334534, + "learning_rate": 1.6574443905092244e-07, + "loss": 0.1911, + "step": 5242 + }, + { + "epoch": 1.395156998403406, + "grad_norm": 0.3354329466819763, + "learning_rate": 1.6573171260161307e-07, + "loss": 0.1998, + "step": 5243 + }, + { + "epoch": 1.39542309739223, + "grad_norm": 0.28054219484329224, + "learning_rate": 1.657189842775017e-07, + "loss": 0.2059, + "step": 5244 + }, + { + "epoch": 1.3956891963810538, + "grad_norm": 0.24994084239006042, + "learning_rate": 1.6570625407895134e-07, + "loss": 0.1866, + "step": 5245 + }, + { + "epoch": 1.3959552953698777, + "grad_norm": 0.26518192887306213, + "learning_rate": 1.656935220063251e-07, + "loss": 0.1835, + "step": 5246 + }, + { + "epoch": 1.3962213943587014, + "grad_norm": 0.3183805048465729, + "learning_rate": 1.6568078805998612e-07, + "loss": 0.2057, + "step": 5247 + }, + { + "epoch": 1.3964874933475253, + "grad_norm": 0.27221930027008057, + "learning_rate": 1.6566805224029758e-07, + "loss": 0.1845, + "step": 5248 + }, + { + "epoch": 1.3967535923363492, + "grad_norm": 0.35313472151756287, + "learning_rate": 1.6565531454762277e-07, + "loss": 0.1973, + "step": 5249 + }, + { + "epoch": 1.397019691325173, + "grad_norm": 0.3458358943462372, + "learning_rate": 1.6564257498232498e-07, + "loss": 0.1899, + "step": 5250 + }, + { + "epoch": 1.3972857903139968, + "grad_norm": 0.32843896746635437, + "learning_rate": 1.656298335447675e-07, + "loss": 0.2023, + "step": 5251 + }, + { + "epoch": 1.3975518893028207, + "grad_norm": 0.276481568813324, + "learning_rate": 1.6561709023531385e-07, + "loss": 0.1717, + "step": 5252 + }, + { + "epoch": 1.3978179882916444, + "grad_norm": 0.28084608912467957, + "learning_rate": 1.6560434505432744e-07, + "loss": 0.2019, + "step": 5253 + }, + { + "epoch": 1.3980840872804683, + "grad_norm": 0.2788121700286865, + "learning_rate": 1.6559159800217178e-07, + "loss": 0.1814, + "step": 5254 + }, + { + "epoch": 1.3983501862692922, + "grad_norm": 0.3087974190711975, + "learning_rate": 1.6557884907921048e-07, + "loss": 0.1905, + "step": 5255 + }, + { + "epoch": 1.398616285258116, + "grad_norm": 0.2858281135559082, + "learning_rate": 1.655660982858071e-07, + "loss": 0.1951, + "step": 5256 + }, + { + "epoch": 1.3988823842469398, + "grad_norm": 0.41566893458366394, + "learning_rate": 1.6555334562232538e-07, + "loss": 0.2024, + "step": 5257 + }, + { + "epoch": 1.3991484832357637, + "grad_norm": 0.31858962774276733, + "learning_rate": 1.6554059108912905e-07, + "loss": 0.21, + "step": 5258 + }, + { + "epoch": 1.3994145822245876, + "grad_norm": 0.38178667426109314, + "learning_rate": 1.6552783468658186e-07, + "loss": 0.2036, + "step": 5259 + }, + { + "epoch": 1.3996806812134115, + "grad_norm": 0.27386143803596497, + "learning_rate": 1.6551507641504767e-07, + "loss": 0.1858, + "step": 5260 + }, + { + "epoch": 1.3999467802022352, + "grad_norm": 0.2820086181163788, + "learning_rate": 1.6550231627489039e-07, + "loss": 0.1787, + "step": 5261 + }, + { + "epoch": 1.400212879191059, + "grad_norm": 0.28358739614486694, + "learning_rate": 1.654895542664739e-07, + "loss": 0.1934, + "step": 5262 + }, + { + "epoch": 1.400478978179883, + "grad_norm": 0.32743290066719055, + "learning_rate": 1.654767903901623e-07, + "loss": 0.1949, + "step": 5263 + }, + { + "epoch": 1.4007450771687067, + "grad_norm": 0.27598169445991516, + "learning_rate": 1.6546402464631953e-07, + "loss": 0.1711, + "step": 5264 + }, + { + "epoch": 1.4010111761575306, + "grad_norm": 0.26825597882270813, + "learning_rate": 1.6545125703530976e-07, + "loss": 0.1884, + "step": 5265 + }, + { + "epoch": 1.4012772751463545, + "grad_norm": 0.30284661054611206, + "learning_rate": 1.6543848755749715e-07, + "loss": 0.2121, + "step": 5266 + }, + { + "epoch": 1.4015433741351782, + "grad_norm": 0.3033389747142792, + "learning_rate": 1.6542571621324588e-07, + "loss": 0.1861, + "step": 5267 + }, + { + "epoch": 1.401809473124002, + "grad_norm": 0.26416489481925964, + "learning_rate": 1.6541294300292023e-07, + "loss": 0.1965, + "step": 5268 + }, + { + "epoch": 1.402075572112826, + "grad_norm": 0.4651348888874054, + "learning_rate": 1.6540016792688453e-07, + "loss": 0.2188, + "step": 5269 + }, + { + "epoch": 1.4023416711016499, + "grad_norm": 0.2720683515071869, + "learning_rate": 1.6538739098550315e-07, + "loss": 0.1791, + "step": 5270 + }, + { + "epoch": 1.4026077700904738, + "grad_norm": 0.33273667097091675, + "learning_rate": 1.6537461217914049e-07, + "loss": 0.2129, + "step": 5271 + }, + { + "epoch": 1.4028738690792975, + "grad_norm": 0.27300122380256653, + "learning_rate": 1.6536183150816105e-07, + "loss": 0.199, + "step": 5272 + }, + { + "epoch": 1.4031399680681214, + "grad_norm": 0.3620617389678955, + "learning_rate": 1.6534904897292936e-07, + "loss": 0.1864, + "step": 5273 + }, + { + "epoch": 1.4034060670569453, + "grad_norm": 0.27795276045799255, + "learning_rate": 1.6533626457380997e-07, + "loss": 0.2032, + "step": 5274 + }, + { + "epoch": 1.403672166045769, + "grad_norm": 0.27659302949905396, + "learning_rate": 1.653234783111676e-07, + "loss": 0.189, + "step": 5275 + }, + { + "epoch": 1.4039382650345928, + "grad_norm": 0.3353051543235779, + "learning_rate": 1.6531069018536684e-07, + "loss": 0.2195, + "step": 5276 + }, + { + "epoch": 1.4042043640234168, + "grad_norm": 0.46648868918418884, + "learning_rate": 1.652979001967725e-07, + "loss": 0.2042, + "step": 5277 + }, + { + "epoch": 1.4044704630122404, + "grad_norm": 0.31661656498908997, + "learning_rate": 1.6528510834574933e-07, + "loss": 0.2078, + "step": 5278 + }, + { + "epoch": 1.4047365620010643, + "grad_norm": 0.39477425813674927, + "learning_rate": 1.6527231463266222e-07, + "loss": 0.2004, + "step": 5279 + }, + { + "epoch": 1.4050026609898882, + "grad_norm": 0.38346508145332336, + "learning_rate": 1.6525951905787605e-07, + "loss": 0.1976, + "step": 5280 + }, + { + "epoch": 1.4052687599787121, + "grad_norm": 0.33785146474838257, + "learning_rate": 1.652467216217558e-07, + "loss": 0.1964, + "step": 5281 + }, + { + "epoch": 1.405534858967536, + "grad_norm": 0.3496895730495453, + "learning_rate": 1.6523392232466645e-07, + "loss": 0.1972, + "step": 5282 + }, + { + "epoch": 1.4058009579563597, + "grad_norm": 0.25822263956069946, + "learning_rate": 1.6522112116697307e-07, + "loss": 0.1904, + "step": 5283 + }, + { + "epoch": 1.4060670569451836, + "grad_norm": 0.2850895822048187, + "learning_rate": 1.6520831814904076e-07, + "loss": 0.1919, + "step": 5284 + }, + { + "epoch": 1.4063331559340075, + "grad_norm": 0.320698618888855, + "learning_rate": 1.6519551327123472e-07, + "loss": 0.1915, + "step": 5285 + }, + { + "epoch": 1.4065992549228312, + "grad_norm": 0.5619469881057739, + "learning_rate": 1.651827065339202e-07, + "loss": 0.2131, + "step": 5286 + }, + { + "epoch": 1.4068653539116551, + "grad_norm": 0.37319275736808777, + "learning_rate": 1.6516989793746238e-07, + "loss": 0.2224, + "step": 5287 + }, + { + "epoch": 1.407131452900479, + "grad_norm": 0.2939843535423279, + "learning_rate": 1.6515708748222668e-07, + "loss": 0.1954, + "step": 5288 + }, + { + "epoch": 1.4073975518893027, + "grad_norm": 0.4361952543258667, + "learning_rate": 1.651442751685784e-07, + "loss": 0.2227, + "step": 5289 + }, + { + "epoch": 1.4076636508781266, + "grad_norm": 0.35884615778923035, + "learning_rate": 1.6513146099688305e-07, + "loss": 0.1995, + "step": 5290 + }, + { + "epoch": 1.4079297498669505, + "grad_norm": 0.24867630004882812, + "learning_rate": 1.6511864496750604e-07, + "loss": 0.1787, + "step": 5291 + }, + { + "epoch": 1.4081958488557744, + "grad_norm": 0.36559075117111206, + "learning_rate": 1.6510582708081298e-07, + "loss": 0.2023, + "step": 5292 + }, + { + "epoch": 1.4084619478445983, + "grad_norm": 0.26885128021240234, + "learning_rate": 1.650930073371694e-07, + "loss": 0.186, + "step": 5293 + }, + { + "epoch": 1.408728046833422, + "grad_norm": 0.27352920174598694, + "learning_rate": 1.65080185736941e-07, + "loss": 0.2047, + "step": 5294 + }, + { + "epoch": 1.408994145822246, + "grad_norm": 0.3544783890247345, + "learning_rate": 1.6506736228049345e-07, + "loss": 0.2017, + "step": 5295 + }, + { + "epoch": 1.4092602448110698, + "grad_norm": 0.2627209722995758, + "learning_rate": 1.6505453696819252e-07, + "loss": 0.1856, + "step": 5296 + }, + { + "epoch": 1.4095263437998935, + "grad_norm": 0.47213566303253174, + "learning_rate": 1.6504170980040395e-07, + "loss": 0.2145, + "step": 5297 + }, + { + "epoch": 1.4097924427887174, + "grad_norm": 0.35869109630584717, + "learning_rate": 1.650288807774937e-07, + "loss": 0.1982, + "step": 5298 + }, + { + "epoch": 1.4100585417775413, + "grad_norm": 0.31982848048210144, + "learning_rate": 1.6501604989982758e-07, + "loss": 0.1853, + "step": 5299 + }, + { + "epoch": 1.410324640766365, + "grad_norm": 0.37985092401504517, + "learning_rate": 1.650032171677716e-07, + "loss": 0.2069, + "step": 5300 + }, + { + "epoch": 1.410590739755189, + "grad_norm": 0.29973605275154114, + "learning_rate": 1.649903825816918e-07, + "loss": 0.1922, + "step": 5301 + }, + { + "epoch": 1.4108568387440128, + "grad_norm": 0.2630399167537689, + "learning_rate": 1.6497754614195423e-07, + "loss": 0.1757, + "step": 5302 + }, + { + "epoch": 1.4111229377328365, + "grad_norm": 0.27310100197792053, + "learning_rate": 1.64964707848925e-07, + "loss": 0.196, + "step": 5303 + }, + { + "epoch": 1.4113890367216604, + "grad_norm": 0.3951683044433594, + "learning_rate": 1.6495186770297025e-07, + "loss": 0.2099, + "step": 5304 + }, + { + "epoch": 1.4116551357104843, + "grad_norm": 0.2721766233444214, + "learning_rate": 1.649390257044563e-07, + "loss": 0.1873, + "step": 5305 + }, + { + "epoch": 1.4119212346993082, + "grad_norm": 0.27618831396102905, + "learning_rate": 1.6492618185374933e-07, + "loss": 0.1993, + "step": 5306 + }, + { + "epoch": 1.412187333688132, + "grad_norm": 0.3251558542251587, + "learning_rate": 1.6491333615121573e-07, + "loss": 0.2004, + "step": 5307 + }, + { + "epoch": 1.4124534326769558, + "grad_norm": 0.25881609320640564, + "learning_rate": 1.6490048859722189e-07, + "loss": 0.1872, + "step": 5308 + }, + { + "epoch": 1.4127195316657797, + "grad_norm": 0.43415629863739014, + "learning_rate": 1.648876391921342e-07, + "loss": 0.2204, + "step": 5309 + }, + { + "epoch": 1.4129856306546036, + "grad_norm": 0.2654666602611542, + "learning_rate": 1.6487478793631916e-07, + "loss": 0.1828, + "step": 5310 + }, + { + "epoch": 1.4132517296434273, + "grad_norm": 0.27011820673942566, + "learning_rate": 1.6486193483014338e-07, + "loss": 0.1892, + "step": 5311 + }, + { + "epoch": 1.4135178286322512, + "grad_norm": 0.39370742440223694, + "learning_rate": 1.6484907987397343e-07, + "loss": 0.2024, + "step": 5312 + }, + { + "epoch": 1.413783927621075, + "grad_norm": 0.26937657594680786, + "learning_rate": 1.6483622306817593e-07, + "loss": 0.2015, + "step": 5313 + }, + { + "epoch": 1.4140500266098988, + "grad_norm": 0.3805416226387024, + "learning_rate": 1.6482336441311757e-07, + "loss": 0.2372, + "step": 5314 + }, + { + "epoch": 1.4143161255987227, + "grad_norm": 0.6199563145637512, + "learning_rate": 1.6481050390916512e-07, + "loss": 0.1715, + "step": 5315 + }, + { + "epoch": 1.4145822245875466, + "grad_norm": 0.372329443693161, + "learning_rate": 1.6479764155668542e-07, + "loss": 0.2085, + "step": 5316 + }, + { + "epoch": 1.4148483235763705, + "grad_norm": 0.2977082133293152, + "learning_rate": 1.647847773560453e-07, + "loss": 0.1936, + "step": 5317 + }, + { + "epoch": 1.4151144225651944, + "grad_norm": 0.2691383957862854, + "learning_rate": 1.6477191130761172e-07, + "loss": 0.1859, + "step": 5318 + }, + { + "epoch": 1.415380521554018, + "grad_norm": 0.37090203166007996, + "learning_rate": 1.6475904341175158e-07, + "loss": 0.1805, + "step": 5319 + }, + { + "epoch": 1.415646620542842, + "grad_norm": 0.26844221353530884, + "learning_rate": 1.647461736688319e-07, + "loss": 0.1896, + "step": 5320 + }, + { + "epoch": 1.4159127195316659, + "grad_norm": 0.25429847836494446, + "learning_rate": 1.647333020792198e-07, + "loss": 0.1884, + "step": 5321 + }, + { + "epoch": 1.4161788185204895, + "grad_norm": 0.29384854435920715, + "learning_rate": 1.647204286432824e-07, + "loss": 0.2045, + "step": 5322 + }, + { + "epoch": 1.4164449175093135, + "grad_norm": 0.27477505803108215, + "learning_rate": 1.647075533613868e-07, + "loss": 0.2064, + "step": 5323 + }, + { + "epoch": 1.4167110164981374, + "grad_norm": 0.365469753742218, + "learning_rate": 1.6469467623390034e-07, + "loss": 0.188, + "step": 5324 + }, + { + "epoch": 1.416977115486961, + "grad_norm": 0.3056054711341858, + "learning_rate": 1.6468179726119023e-07, + "loss": 0.1941, + "step": 5325 + }, + { + "epoch": 1.417243214475785, + "grad_norm": 0.30196693539619446, + "learning_rate": 1.646689164436238e-07, + "loss": 0.1964, + "step": 5326 + }, + { + "epoch": 1.4175093134646088, + "grad_norm": 0.2731672525405884, + "learning_rate": 1.646560337815685e-07, + "loss": 0.1959, + "step": 5327 + }, + { + "epoch": 1.4177754124534327, + "grad_norm": 0.3246804177761078, + "learning_rate": 1.646431492753917e-07, + "loss": 0.1944, + "step": 5328 + }, + { + "epoch": 1.4180415114422567, + "grad_norm": 0.3394722640514374, + "learning_rate": 1.6463026292546093e-07, + "loss": 0.1845, + "step": 5329 + }, + { + "epoch": 1.4183076104310803, + "grad_norm": 0.2694746255874634, + "learning_rate": 1.6461737473214373e-07, + "loss": 0.1904, + "step": 5330 + }, + { + "epoch": 1.4185737094199042, + "grad_norm": 0.25193914771080017, + "learning_rate": 1.646044846958077e-07, + "loss": 0.1713, + "step": 5331 + }, + { + "epoch": 1.4188398084087281, + "grad_norm": 0.4257175624370575, + "learning_rate": 1.6459159281682047e-07, + "loss": 0.2021, + "step": 5332 + }, + { + "epoch": 1.4191059073975518, + "grad_norm": 0.37919193506240845, + "learning_rate": 1.6457869909554974e-07, + "loss": 0.1899, + "step": 5333 + }, + { + "epoch": 1.4193720063863757, + "grad_norm": 0.3230416178703308, + "learning_rate": 1.6456580353236332e-07, + "loss": 0.2102, + "step": 5334 + }, + { + "epoch": 1.4196381053751996, + "grad_norm": 0.3744446635246277, + "learning_rate": 1.6455290612762895e-07, + "loss": 0.1981, + "step": 5335 + }, + { + "epoch": 1.4199042043640233, + "grad_norm": 0.3460577130317688, + "learning_rate": 1.645400068817145e-07, + "loss": 0.2008, + "step": 5336 + }, + { + "epoch": 1.4201703033528472, + "grad_norm": 0.3371798098087311, + "learning_rate": 1.6452710579498797e-07, + "loss": 0.1995, + "step": 5337 + }, + { + "epoch": 1.4204364023416711, + "grad_norm": 0.5788954496383667, + "learning_rate": 1.645142028678172e-07, + "loss": 0.2034, + "step": 5338 + }, + { + "epoch": 1.420702501330495, + "grad_norm": 0.3592880070209503, + "learning_rate": 1.645012981005703e-07, + "loss": 0.2031, + "step": 5339 + }, + { + "epoch": 1.4209686003193187, + "grad_norm": 0.24424141645431519, + "learning_rate": 1.644883914936153e-07, + "loss": 0.1787, + "step": 5340 + }, + { + "epoch": 1.4212346993081426, + "grad_norm": 0.24793082475662231, + "learning_rate": 1.6447548304732032e-07, + "loss": 0.188, + "step": 5341 + }, + { + "epoch": 1.4215007982969665, + "grad_norm": 0.2654854953289032, + "learning_rate": 1.6446257276205354e-07, + "loss": 0.2039, + "step": 5342 + }, + { + "epoch": 1.4217668972857904, + "grad_norm": 0.3736499547958374, + "learning_rate": 1.644496606381832e-07, + "loss": 0.2029, + "step": 5343 + }, + { + "epoch": 1.422032996274614, + "grad_norm": 0.2777019143104553, + "learning_rate": 1.6443674667607757e-07, + "loss": 0.2001, + "step": 5344 + }, + { + "epoch": 1.422299095263438, + "grad_norm": 0.2545159161090851, + "learning_rate": 1.6442383087610495e-07, + "loss": 0.1861, + "step": 5345 + }, + { + "epoch": 1.422565194252262, + "grad_norm": 0.26863765716552734, + "learning_rate": 1.644109132386338e-07, + "loss": 0.1852, + "step": 5346 + }, + { + "epoch": 1.4228312932410856, + "grad_norm": 0.25125375390052795, + "learning_rate": 1.6439799376403252e-07, + "loss": 0.1939, + "step": 5347 + }, + { + "epoch": 1.4230973922299095, + "grad_norm": 0.2427806407213211, + "learning_rate": 1.6438507245266954e-07, + "loss": 0.1648, + "step": 5348 + }, + { + "epoch": 1.4233634912187334, + "grad_norm": 0.26579776406288147, + "learning_rate": 1.6437214930491349e-07, + "loss": 0.1962, + "step": 5349 + }, + { + "epoch": 1.423629590207557, + "grad_norm": 0.2632758617401123, + "learning_rate": 1.6435922432113297e-07, + "loss": 0.1976, + "step": 5350 + }, + { + "epoch": 1.423895689196381, + "grad_norm": 0.27294740080833435, + "learning_rate": 1.6434629750169653e-07, + "loss": 0.1964, + "step": 5351 + }, + { + "epoch": 1.424161788185205, + "grad_norm": 0.4039916396141052, + "learning_rate": 1.6433336884697292e-07, + "loss": 0.2156, + "step": 5352 + }, + { + "epoch": 1.4244278871740288, + "grad_norm": 0.2898534834384918, + "learning_rate": 1.6432043835733094e-07, + "loss": 0.1955, + "step": 5353 + }, + { + "epoch": 1.4246939861628527, + "grad_norm": 0.337659627199173, + "learning_rate": 1.6430750603313934e-07, + "loss": 0.2056, + "step": 5354 + }, + { + "epoch": 1.4249600851516764, + "grad_norm": 0.2954513132572174, + "learning_rate": 1.6429457187476699e-07, + "loss": 0.2088, + "step": 5355 + }, + { + "epoch": 1.4252261841405003, + "grad_norm": 0.272986501455307, + "learning_rate": 1.642816358825828e-07, + "loss": 0.1946, + "step": 5356 + }, + { + "epoch": 1.4254922831293242, + "grad_norm": 0.2562568485736847, + "learning_rate": 1.6426869805695573e-07, + "loss": 0.1926, + "step": 5357 + }, + { + "epoch": 1.4257583821181479, + "grad_norm": 0.2544352412223816, + "learning_rate": 1.6425575839825478e-07, + "loss": 0.1734, + "step": 5358 + }, + { + "epoch": 1.4260244811069718, + "grad_norm": 0.2983199656009674, + "learning_rate": 1.6424281690684906e-07, + "loss": 0.2009, + "step": 5359 + }, + { + "epoch": 1.4262905800957957, + "grad_norm": 0.22508296370506287, + "learning_rate": 1.642298735831076e-07, + "loss": 0.1702, + "step": 5360 + }, + { + "epoch": 1.4265566790846194, + "grad_norm": 0.39987385272979736, + "learning_rate": 1.6421692842739964e-07, + "loss": 0.2055, + "step": 5361 + }, + { + "epoch": 1.4268227780734433, + "grad_norm": 0.34059450030326843, + "learning_rate": 1.642039814400944e-07, + "loss": 0.205, + "step": 5362 + }, + { + "epoch": 1.4270888770622672, + "grad_norm": 0.3309866487979889, + "learning_rate": 1.6419103262156114e-07, + "loss": 0.2047, + "step": 5363 + }, + { + "epoch": 1.427354976051091, + "grad_norm": 0.35328060388565063, + "learning_rate": 1.641780819721692e-07, + "loss": 0.2209, + "step": 5364 + }, + { + "epoch": 1.427621075039915, + "grad_norm": 0.24961215257644653, + "learning_rate": 1.6416512949228793e-07, + "loss": 0.1763, + "step": 5365 + }, + { + "epoch": 1.4278871740287387, + "grad_norm": 0.3660323917865753, + "learning_rate": 1.6415217518228678e-07, + "loss": 0.1915, + "step": 5366 + }, + { + "epoch": 1.4281532730175626, + "grad_norm": 0.42888471484184265, + "learning_rate": 1.6413921904253523e-07, + "loss": 0.2082, + "step": 5367 + }, + { + "epoch": 1.4284193720063865, + "grad_norm": 0.28958454728126526, + "learning_rate": 1.641262610734028e-07, + "loss": 0.1927, + "step": 5368 + }, + { + "epoch": 1.4286854709952101, + "grad_norm": 0.3358992636203766, + "learning_rate": 1.6411330127525912e-07, + "loss": 0.1935, + "step": 5369 + }, + { + "epoch": 1.428951569984034, + "grad_norm": 0.33338049054145813, + "learning_rate": 1.641003396484738e-07, + "loss": 0.1831, + "step": 5370 + }, + { + "epoch": 1.429217668972858, + "grad_norm": 0.26289042830467224, + "learning_rate": 1.640873761934165e-07, + "loss": 0.187, + "step": 5371 + }, + { + "epoch": 1.4294837679616816, + "grad_norm": 0.48278921842575073, + "learning_rate": 1.6407441091045705e-07, + "loss": 0.1986, + "step": 5372 + }, + { + "epoch": 1.4297498669505055, + "grad_norm": 0.272490531206131, + "learning_rate": 1.6406144379996515e-07, + "loss": 0.184, + "step": 5373 + }, + { + "epoch": 1.4300159659393294, + "grad_norm": 0.2643483281135559, + "learning_rate": 1.640484748623107e-07, + "loss": 0.1765, + "step": 5374 + }, + { + "epoch": 1.4302820649281534, + "grad_norm": 0.29785171151161194, + "learning_rate": 1.6403550409786362e-07, + "loss": 0.2058, + "step": 5375 + }, + { + "epoch": 1.430548163916977, + "grad_norm": 0.3195945918560028, + "learning_rate": 1.640225315069938e-07, + "loss": 0.1991, + "step": 5376 + }, + { + "epoch": 1.430814262905801, + "grad_norm": 0.2932005822658539, + "learning_rate": 1.6400955709007134e-07, + "loss": 0.2009, + "step": 5377 + }, + { + "epoch": 1.4310803618946248, + "grad_norm": 0.2976175546646118, + "learning_rate": 1.639965808474662e-07, + "loss": 0.2054, + "step": 5378 + }, + { + "epoch": 1.4313464608834487, + "grad_norm": 0.30503469705581665, + "learning_rate": 1.6398360277954852e-07, + "loss": 0.207, + "step": 5379 + }, + { + "epoch": 1.4316125598722724, + "grad_norm": 0.3404541015625, + "learning_rate": 1.6397062288668847e-07, + "loss": 0.2034, + "step": 5380 + }, + { + "epoch": 1.4318786588610963, + "grad_norm": 0.2770255208015442, + "learning_rate": 1.6395764116925628e-07, + "loss": 0.2077, + "step": 5381 + }, + { + "epoch": 1.4321447578499202, + "grad_norm": 0.4774217903614044, + "learning_rate": 1.6394465762762217e-07, + "loss": 0.1902, + "step": 5382 + }, + { + "epoch": 1.432410856838744, + "grad_norm": 0.3064954876899719, + "learning_rate": 1.6393167226215653e-07, + "loss": 0.2015, + "step": 5383 + }, + { + "epoch": 1.4326769558275678, + "grad_norm": 0.2934679687023163, + "learning_rate": 1.6391868507322965e-07, + "loss": 0.1868, + "step": 5384 + }, + { + "epoch": 1.4329430548163917, + "grad_norm": 0.2702910304069519, + "learning_rate": 1.6390569606121197e-07, + "loss": 0.1886, + "step": 5385 + }, + { + "epoch": 1.4332091538052154, + "grad_norm": 0.362844854593277, + "learning_rate": 1.63892705226474e-07, + "loss": 0.194, + "step": 5386 + }, + { + "epoch": 1.4334752527940393, + "grad_norm": 0.3645019829273224, + "learning_rate": 1.638797125693862e-07, + "loss": 0.2106, + "step": 5387 + }, + { + "epoch": 1.4337413517828632, + "grad_norm": 0.2831210494041443, + "learning_rate": 1.6386671809031923e-07, + "loss": 0.1872, + "step": 5388 + }, + { + "epoch": 1.4340074507716871, + "grad_norm": 0.38189658522605896, + "learning_rate": 1.6385372178964366e-07, + "loss": 0.2253, + "step": 5389 + }, + { + "epoch": 1.434273549760511, + "grad_norm": 0.2876148819923401, + "learning_rate": 1.6384072366773017e-07, + "loss": 0.1996, + "step": 5390 + }, + { + "epoch": 1.4345396487493347, + "grad_norm": 0.26362940669059753, + "learning_rate": 1.6382772372494956e-07, + "loss": 0.1841, + "step": 5391 + }, + { + "epoch": 1.4348057477381586, + "grad_norm": 0.25543448328971863, + "learning_rate": 1.638147219616725e-07, + "loss": 0.1866, + "step": 5392 + }, + { + "epoch": 1.4350718467269825, + "grad_norm": 0.24871470034122467, + "learning_rate": 1.6380171837826994e-07, + "loss": 0.1775, + "step": 5393 + }, + { + "epoch": 1.4353379457158062, + "grad_norm": 0.37195006012916565, + "learning_rate": 1.6378871297511272e-07, + "loss": 0.1952, + "step": 5394 + }, + { + "epoch": 1.43560404470463, + "grad_norm": 0.27582648396492004, + "learning_rate": 1.6377570575257177e-07, + "loss": 0.1927, + "step": 5395 + }, + { + "epoch": 1.435870143693454, + "grad_norm": 0.3151394724845886, + "learning_rate": 1.6376269671101808e-07, + "loss": 0.1914, + "step": 5396 + }, + { + "epoch": 1.4361362426822777, + "grad_norm": 0.29652613401412964, + "learning_rate": 1.6374968585082275e-07, + "loss": 0.1984, + "step": 5397 + }, + { + "epoch": 1.4364023416711016, + "grad_norm": 0.36827197670936584, + "learning_rate": 1.637366731723568e-07, + "loss": 0.191, + "step": 5398 + }, + { + "epoch": 1.4366684406599255, + "grad_norm": 0.36314937472343445, + "learning_rate": 1.637236586759914e-07, + "loss": 0.2139, + "step": 5399 + }, + { + "epoch": 1.4369345396487494, + "grad_norm": 0.32054638862609863, + "learning_rate": 1.637106423620978e-07, + "loss": 0.2028, + "step": 5400 + }, + { + "epoch": 1.4372006386375733, + "grad_norm": 0.30883488059043884, + "learning_rate": 1.636976242310472e-07, + "loss": 0.1906, + "step": 5401 + }, + { + "epoch": 1.437466737626397, + "grad_norm": 0.32544636726379395, + "learning_rate": 1.6368460428321088e-07, + "loss": 0.1881, + "step": 5402 + }, + { + "epoch": 1.4377328366152209, + "grad_norm": 0.5387380719184875, + "learning_rate": 1.6367158251896028e-07, + "loss": 0.2096, + "step": 5403 + }, + { + "epoch": 1.4379989356040448, + "grad_norm": 0.29200485348701477, + "learning_rate": 1.6365855893866675e-07, + "loss": 0.1798, + "step": 5404 + }, + { + "epoch": 1.4382650345928685, + "grad_norm": 0.358805388212204, + "learning_rate": 1.6364553354270175e-07, + "loss": 0.1948, + "step": 5405 + }, + { + "epoch": 1.4385311335816924, + "grad_norm": 0.35154181718826294, + "learning_rate": 1.6363250633143677e-07, + "loss": 0.2196, + "step": 5406 + }, + { + "epoch": 1.4387972325705163, + "grad_norm": 0.33320626616477966, + "learning_rate": 1.6361947730524342e-07, + "loss": 0.197, + "step": 5407 + }, + { + "epoch": 1.43906333155934, + "grad_norm": 0.32939422130584717, + "learning_rate": 1.6360644646449333e-07, + "loss": 0.202, + "step": 5408 + }, + { + "epoch": 1.4393294305481639, + "grad_norm": 0.29902151226997375, + "learning_rate": 1.635934138095581e-07, + "loss": 0.1964, + "step": 5409 + }, + { + "epoch": 1.4395955295369878, + "grad_norm": 0.3162310719490051, + "learning_rate": 1.6358037934080951e-07, + "loss": 0.1898, + "step": 5410 + }, + { + "epoch": 1.4398616285258117, + "grad_norm": 0.2689666152000427, + "learning_rate": 1.6356734305861928e-07, + "loss": 0.1897, + "step": 5411 + }, + { + "epoch": 1.4401277275146356, + "grad_norm": 0.2604924142360687, + "learning_rate": 1.6355430496335924e-07, + "loss": 0.1873, + "step": 5412 + }, + { + "epoch": 1.4403938265034593, + "grad_norm": 0.34831109642982483, + "learning_rate": 1.6354126505540128e-07, + "loss": 0.1984, + "step": 5413 + }, + { + "epoch": 1.4406599254922832, + "grad_norm": 0.29147496819496155, + "learning_rate": 1.6352822333511733e-07, + "loss": 0.204, + "step": 5414 + }, + { + "epoch": 1.440926024481107, + "grad_norm": 0.28053024411201477, + "learning_rate": 1.6351517980287938e-07, + "loss": 0.1947, + "step": 5415 + }, + { + "epoch": 1.4411921234699308, + "grad_norm": 0.2744915187358856, + "learning_rate": 1.635021344590594e-07, + "loss": 0.198, + "step": 5416 + }, + { + "epoch": 1.4414582224587547, + "grad_norm": 0.363307386636734, + "learning_rate": 1.634890873040295e-07, + "loss": 0.1929, + "step": 5417 + }, + { + "epoch": 1.4417243214475786, + "grad_norm": 0.28835222125053406, + "learning_rate": 1.634760383381618e-07, + "loss": 0.1799, + "step": 5418 + }, + { + "epoch": 1.4419904204364022, + "grad_norm": 0.27800917625427246, + "learning_rate": 1.6346298756182853e-07, + "loss": 0.2011, + "step": 5419 + }, + { + "epoch": 1.4422565194252261, + "grad_norm": 0.27709245681762695, + "learning_rate": 1.6344993497540187e-07, + "loss": 0.1865, + "step": 5420 + }, + { + "epoch": 1.44252261841405, + "grad_norm": 0.5173304677009583, + "learning_rate": 1.6343688057925412e-07, + "loss": 0.1777, + "step": 5421 + }, + { + "epoch": 1.4427887174028737, + "grad_norm": 0.36348676681518555, + "learning_rate": 1.6342382437375765e-07, + "loss": 0.1863, + "step": 5422 + }, + { + "epoch": 1.4430548163916976, + "grad_norm": 0.2667511999607086, + "learning_rate": 1.6341076635928483e-07, + "loss": 0.1896, + "step": 5423 + }, + { + "epoch": 1.4433209153805215, + "grad_norm": 0.25871211290359497, + "learning_rate": 1.6339770653620808e-07, + "loss": 0.1911, + "step": 5424 + }, + { + "epoch": 1.4435870143693454, + "grad_norm": 0.321966290473938, + "learning_rate": 1.633846449048999e-07, + "loss": 0.2017, + "step": 5425 + }, + { + "epoch": 1.4438531133581693, + "grad_norm": 0.29550185799598694, + "learning_rate": 1.6337158146573286e-07, + "loss": 0.1913, + "step": 5426 + }, + { + "epoch": 1.444119212346993, + "grad_norm": 0.33662787079811096, + "learning_rate": 1.6335851621907956e-07, + "loss": 0.1986, + "step": 5427 + }, + { + "epoch": 1.444385311335817, + "grad_norm": 0.36755236983299255, + "learning_rate": 1.6334544916531256e-07, + "loss": 0.1885, + "step": 5428 + }, + { + "epoch": 1.4446514103246408, + "grad_norm": 0.32172858715057373, + "learning_rate": 1.633323803048047e-07, + "loss": 0.1804, + "step": 5429 + }, + { + "epoch": 1.4449175093134645, + "grad_norm": 0.3097333312034607, + "learning_rate": 1.6331930963792861e-07, + "loss": 0.1995, + "step": 5430 + }, + { + "epoch": 1.4451836083022884, + "grad_norm": 0.3038673400878906, + "learning_rate": 1.6330623716505716e-07, + "loss": 0.2234, + "step": 5431 + }, + { + "epoch": 1.4454497072911123, + "grad_norm": 0.29623791575431824, + "learning_rate": 1.6329316288656316e-07, + "loss": 0.188, + "step": 5432 + }, + { + "epoch": 1.445715806279936, + "grad_norm": 0.4915785491466522, + "learning_rate": 1.6328008680281956e-07, + "loss": 0.2023, + "step": 5433 + }, + { + "epoch": 1.44598190526876, + "grad_norm": 0.2447122484445572, + "learning_rate": 1.6326700891419928e-07, + "loss": 0.1693, + "step": 5434 + }, + { + "epoch": 1.4462480042575838, + "grad_norm": 0.24488434195518494, + "learning_rate": 1.6325392922107538e-07, + "loss": 0.1736, + "step": 5435 + }, + { + "epoch": 1.4465141032464077, + "grad_norm": 0.2851096987724304, + "learning_rate": 1.6324084772382083e-07, + "loss": 0.1983, + "step": 5436 + }, + { + "epoch": 1.4467802022352316, + "grad_norm": 0.2893214523792267, + "learning_rate": 1.6322776442280878e-07, + "loss": 0.1894, + "step": 5437 + }, + { + "epoch": 1.4470463012240553, + "grad_norm": 0.2579323649406433, + "learning_rate": 1.6321467931841247e-07, + "loss": 0.1703, + "step": 5438 + }, + { + "epoch": 1.4473124002128792, + "grad_norm": 0.3630504012107849, + "learning_rate": 1.6320159241100499e-07, + "loss": 0.1947, + "step": 5439 + }, + { + "epoch": 1.4475784992017031, + "grad_norm": 0.27568671107292175, + "learning_rate": 1.6318850370095967e-07, + "loss": 0.1782, + "step": 5440 + }, + { + "epoch": 1.4478445981905268, + "grad_norm": 0.2816028892993927, + "learning_rate": 1.6317541318864985e-07, + "loss": 0.1884, + "step": 5441 + }, + { + "epoch": 1.4481106971793507, + "grad_norm": 0.36476024985313416, + "learning_rate": 1.6316232087444885e-07, + "loss": 0.2226, + "step": 5442 + }, + { + "epoch": 1.4483767961681746, + "grad_norm": 0.3945571482181549, + "learning_rate": 1.6314922675873008e-07, + "loss": 0.1975, + "step": 5443 + }, + { + "epoch": 1.4486428951569983, + "grad_norm": 0.28285330533981323, + "learning_rate": 1.6313613084186706e-07, + "loss": 0.2016, + "step": 5444 + }, + { + "epoch": 1.4489089941458222, + "grad_norm": 0.2697678506374359, + "learning_rate": 1.631230331242333e-07, + "loss": 0.1929, + "step": 5445 + }, + { + "epoch": 1.449175093134646, + "grad_norm": 0.2479167878627777, + "learning_rate": 1.6310993360620235e-07, + "loss": 0.1777, + "step": 5446 + }, + { + "epoch": 1.44944119212347, + "grad_norm": 0.41531747579574585, + "learning_rate": 1.6309683228814784e-07, + "loss": 0.2131, + "step": 5447 + }, + { + "epoch": 1.449707291112294, + "grad_norm": 0.33481425046920776, + "learning_rate": 1.630837291704435e-07, + "loss": 0.2199, + "step": 5448 + }, + { + "epoch": 1.4499733901011176, + "grad_norm": 0.33005931973457336, + "learning_rate": 1.6307062425346297e-07, + "loss": 0.1876, + "step": 5449 + }, + { + "epoch": 1.4502394890899415, + "grad_norm": 0.4180528223514557, + "learning_rate": 1.6305751753758007e-07, + "loss": 0.2166, + "step": 5450 + }, + { + "epoch": 1.4505055880787654, + "grad_norm": 1.680098295211792, + "learning_rate": 1.6304440902316862e-07, + "loss": 0.1774, + "step": 5451 + }, + { + "epoch": 1.450771687067589, + "grad_norm": 0.270987868309021, + "learning_rate": 1.6303129871060254e-07, + "loss": 0.1952, + "step": 5452 + }, + { + "epoch": 1.451037786056413, + "grad_norm": 0.23970094323158264, + "learning_rate": 1.6301818660025575e-07, + "loss": 0.1768, + "step": 5453 + }, + { + "epoch": 1.4513038850452369, + "grad_norm": 0.3260112404823303, + "learning_rate": 1.6300507269250218e-07, + "loss": 0.1921, + "step": 5454 + }, + { + "epoch": 1.4515699840340606, + "grad_norm": 0.39860692620277405, + "learning_rate": 1.6299195698771592e-07, + "loss": 0.2315, + "step": 5455 + }, + { + "epoch": 1.4518360830228845, + "grad_norm": 0.28959089517593384, + "learning_rate": 1.6297883948627105e-07, + "loss": 0.1946, + "step": 5456 + }, + { + "epoch": 1.4521021820117084, + "grad_norm": 0.26897943019866943, + "learning_rate": 1.629657201885417e-07, + "loss": 0.1825, + "step": 5457 + }, + { + "epoch": 1.4523682810005323, + "grad_norm": 0.4180390536785126, + "learning_rate": 1.6295259909490204e-07, + "loss": 0.2058, + "step": 5458 + }, + { + "epoch": 1.452634379989356, + "grad_norm": 0.28219297528266907, + "learning_rate": 1.6293947620572634e-07, + "loss": 0.1838, + "step": 5459 + }, + { + "epoch": 1.4529004789781799, + "grad_norm": 0.29636332392692566, + "learning_rate": 1.6292635152138887e-07, + "loss": 0.1856, + "step": 5460 + }, + { + "epoch": 1.4531665779670038, + "grad_norm": 0.2744136452674866, + "learning_rate": 1.6291322504226397e-07, + "loss": 0.1921, + "step": 5461 + }, + { + "epoch": 1.4534326769558277, + "grad_norm": 0.27434325218200684, + "learning_rate": 1.6290009676872606e-07, + "loss": 0.1732, + "step": 5462 + }, + { + "epoch": 1.4536987759446514, + "grad_norm": 0.2601545453071594, + "learning_rate": 1.6288696670114958e-07, + "loss": 0.1896, + "step": 5463 + }, + { + "epoch": 1.4539648749334753, + "grad_norm": 0.266568124294281, + "learning_rate": 1.6287383483990902e-07, + "loss": 0.1722, + "step": 5464 + }, + { + "epoch": 1.4542309739222992, + "grad_norm": 0.28203025460243225, + "learning_rate": 1.6286070118537892e-07, + "loss": 0.1861, + "step": 5465 + }, + { + "epoch": 1.4544970729111228, + "grad_norm": 0.27690833806991577, + "learning_rate": 1.6284756573793387e-07, + "loss": 0.1884, + "step": 5466 + }, + { + "epoch": 1.4547631718999467, + "grad_norm": 0.3483751714229584, + "learning_rate": 1.6283442849794855e-07, + "loss": 0.1804, + "step": 5467 + }, + { + "epoch": 1.4550292708887707, + "grad_norm": 0.26611968874931335, + "learning_rate": 1.6282128946579765e-07, + "loss": 0.1751, + "step": 5468 + }, + { + "epoch": 1.4552953698775943, + "grad_norm": 0.28071334958076477, + "learning_rate": 1.628081486418559e-07, + "loss": 0.1796, + "step": 5469 + }, + { + "epoch": 1.4555614688664182, + "grad_norm": 0.4214198887348175, + "learning_rate": 1.6279500602649812e-07, + "loss": 0.2091, + "step": 5470 + }, + { + "epoch": 1.4558275678552421, + "grad_norm": 0.2786974310874939, + "learning_rate": 1.6278186162009914e-07, + "loss": 0.2056, + "step": 5471 + }, + { + "epoch": 1.456093666844066, + "grad_norm": 0.26906055212020874, + "learning_rate": 1.6276871542303395e-07, + "loss": 0.2028, + "step": 5472 + }, + { + "epoch": 1.45635976583289, + "grad_norm": 0.3083672523498535, + "learning_rate": 1.627555674356774e-07, + "loss": 0.1857, + "step": 5473 + }, + { + "epoch": 1.4566258648217136, + "grad_norm": 0.5700640082359314, + "learning_rate": 1.6274241765840453e-07, + "loss": 0.1728, + "step": 5474 + }, + { + "epoch": 1.4568919638105375, + "grad_norm": 0.31414249539375305, + "learning_rate": 1.6272926609159043e-07, + "loss": 0.1884, + "step": 5475 + }, + { + "epoch": 1.4571580627993614, + "grad_norm": 0.353585422039032, + "learning_rate": 1.6271611273561018e-07, + "loss": 0.202, + "step": 5476 + }, + { + "epoch": 1.4574241617881851, + "grad_norm": 0.34835299849510193, + "learning_rate": 1.6270295759083897e-07, + "loss": 0.1924, + "step": 5477 + }, + { + "epoch": 1.457690260777009, + "grad_norm": 0.2663891315460205, + "learning_rate": 1.6268980065765197e-07, + "loss": 0.1726, + "step": 5478 + }, + { + "epoch": 1.457956359765833, + "grad_norm": 0.2513434886932373, + "learning_rate": 1.6267664193642445e-07, + "loss": 0.1872, + "step": 5479 + }, + { + "epoch": 1.4582224587546566, + "grad_norm": 0.259259432554245, + "learning_rate": 1.6266348142753176e-07, + "loss": 0.1779, + "step": 5480 + }, + { + "epoch": 1.4584885577434805, + "grad_norm": 0.2841672897338867, + "learning_rate": 1.6265031913134922e-07, + "loss": 0.1875, + "step": 5481 + }, + { + "epoch": 1.4587546567323044, + "grad_norm": 0.24590803682804108, + "learning_rate": 1.6263715504825227e-07, + "loss": 0.1858, + "step": 5482 + }, + { + "epoch": 1.4590207557211283, + "grad_norm": 0.39289888739585876, + "learning_rate": 1.626239891786164e-07, + "loss": 0.1919, + "step": 5483 + }, + { + "epoch": 1.4592868547099522, + "grad_norm": 0.33084940910339355, + "learning_rate": 1.6261082152281712e-07, + "loss": 0.2055, + "step": 5484 + }, + { + "epoch": 1.459552953698776, + "grad_norm": 0.4423815608024597, + "learning_rate": 1.6259765208122992e-07, + "loss": 0.2045, + "step": 5485 + }, + { + "epoch": 1.4598190526875998, + "grad_norm": 0.25872528553009033, + "learning_rate": 1.6258448085423052e-07, + "loss": 0.188, + "step": 5486 + }, + { + "epoch": 1.4600851516764237, + "grad_norm": 0.3252466320991516, + "learning_rate": 1.6257130784219452e-07, + "loss": 0.2033, + "step": 5487 + }, + { + "epoch": 1.4603512506652474, + "grad_norm": 0.26780012249946594, + "learning_rate": 1.6255813304549768e-07, + "loss": 0.2044, + "step": 5488 + }, + { + "epoch": 1.4606173496540713, + "grad_norm": 0.3010644316673279, + "learning_rate": 1.6254495646451578e-07, + "loss": 0.1966, + "step": 5489 + }, + { + "epoch": 1.4608834486428952, + "grad_norm": 0.2806564271450043, + "learning_rate": 1.625317780996246e-07, + "loss": 0.2112, + "step": 5490 + }, + { + "epoch": 1.461149547631719, + "grad_norm": 0.33813410997390747, + "learning_rate": 1.6251859795120005e-07, + "loss": 0.1725, + "step": 5491 + }, + { + "epoch": 1.4614156466205428, + "grad_norm": 0.34866663813591003, + "learning_rate": 1.6250541601961804e-07, + "loss": 0.1904, + "step": 5492 + }, + { + "epoch": 1.4616817456093667, + "grad_norm": 0.5412802696228027, + "learning_rate": 1.6249223230525455e-07, + "loss": 0.2134, + "step": 5493 + }, + { + "epoch": 1.4619478445981906, + "grad_norm": 0.28928127884864807, + "learning_rate": 1.624790468084856e-07, + "loss": 0.2095, + "step": 5494 + }, + { + "epoch": 1.4622139435870143, + "grad_norm": 0.40977296233177185, + "learning_rate": 1.6246585952968727e-07, + "loss": 0.2003, + "step": 5495 + }, + { + "epoch": 1.4624800425758382, + "grad_norm": 0.3404540419578552, + "learning_rate": 1.6245267046923569e-07, + "loss": 0.1918, + "step": 5496 + }, + { + "epoch": 1.462746141564662, + "grad_norm": 0.2696182131767273, + "learning_rate": 1.6243947962750703e-07, + "loss": 0.1961, + "step": 5497 + }, + { + "epoch": 1.463012240553486, + "grad_norm": 0.3165881335735321, + "learning_rate": 1.6242628700487754e-07, + "loss": 0.2142, + "step": 5498 + }, + { + "epoch": 1.4632783395423097, + "grad_norm": 0.39410924911499023, + "learning_rate": 1.624130926017235e-07, + "loss": 0.1956, + "step": 5499 + }, + { + "epoch": 1.4635444385311336, + "grad_norm": 0.34121987223625183, + "learning_rate": 1.623998964184212e-07, + "loss": 0.1734, + "step": 5500 + }, + { + "epoch": 1.4638105375199575, + "grad_norm": 0.28231948614120483, + "learning_rate": 1.6238669845534707e-07, + "loss": 0.1886, + "step": 5501 + }, + { + "epoch": 1.4640766365087812, + "grad_norm": 0.2522384226322174, + "learning_rate": 1.6237349871287754e-07, + "loss": 0.1801, + "step": 5502 + }, + { + "epoch": 1.464342735497605, + "grad_norm": 0.296726793050766, + "learning_rate": 1.6236029719138905e-07, + "loss": 0.1974, + "step": 5503 + }, + { + "epoch": 1.464608834486429, + "grad_norm": 0.3721039593219757, + "learning_rate": 1.6234709389125816e-07, + "loss": 0.1995, + "step": 5504 + }, + { + "epoch": 1.4648749334752527, + "grad_norm": 0.26331695914268494, + "learning_rate": 1.623338888128615e-07, + "loss": 0.1944, + "step": 5505 + }, + { + "epoch": 1.4651410324640766, + "grad_norm": 0.2847069799900055, + "learning_rate": 1.623206819565756e-07, + "loss": 0.1981, + "step": 5506 + }, + { + "epoch": 1.4654071314529005, + "grad_norm": 0.27991151809692383, + "learning_rate": 1.6230747332277725e-07, + "loss": 0.2077, + "step": 5507 + }, + { + "epoch": 1.4656732304417244, + "grad_norm": 0.2583695948123932, + "learning_rate": 1.6229426291184313e-07, + "loss": 0.1794, + "step": 5508 + }, + { + "epoch": 1.4659393294305483, + "grad_norm": 0.2817884683609009, + "learning_rate": 1.6228105072415006e-07, + "loss": 0.1956, + "step": 5509 + }, + { + "epoch": 1.466205428419372, + "grad_norm": 0.255887895822525, + "learning_rate": 1.6226783676007485e-07, + "loss": 0.1797, + "step": 5510 + }, + { + "epoch": 1.4664715274081959, + "grad_norm": 0.2719939351081848, + "learning_rate": 1.622546210199944e-07, + "loss": 0.203, + "step": 5511 + }, + { + "epoch": 1.4667376263970198, + "grad_norm": 0.27925848960876465, + "learning_rate": 1.6224140350428565e-07, + "loss": 0.2018, + "step": 5512 + }, + { + "epoch": 1.4670037253858434, + "grad_norm": 0.35316571593284607, + "learning_rate": 1.622281842133256e-07, + "loss": 0.1863, + "step": 5513 + }, + { + "epoch": 1.4672698243746674, + "grad_norm": 0.3927878439426422, + "learning_rate": 1.6221496314749128e-07, + "loss": 0.17, + "step": 5514 + }, + { + "epoch": 1.4675359233634913, + "grad_norm": 0.285064697265625, + "learning_rate": 1.622017403071598e-07, + "loss": 0.2082, + "step": 5515 + }, + { + "epoch": 1.467802022352315, + "grad_norm": 0.3296312093734741, + "learning_rate": 1.6218851569270821e-07, + "loss": 0.1977, + "step": 5516 + }, + { + "epoch": 1.4680681213411388, + "grad_norm": 0.2872386574745178, + "learning_rate": 1.6217528930451385e-07, + "loss": 0.1903, + "step": 5517 + }, + { + "epoch": 1.4683342203299627, + "grad_norm": 0.2619377672672272, + "learning_rate": 1.6216206114295389e-07, + "loss": 0.1845, + "step": 5518 + }, + { + "epoch": 1.4686003193187867, + "grad_norm": 0.25638046860694885, + "learning_rate": 1.621488312084056e-07, + "loss": 0.173, + "step": 5519 + }, + { + "epoch": 1.4688664183076106, + "grad_norm": 0.29354774951934814, + "learning_rate": 1.6213559950124638e-07, + "loss": 0.2013, + "step": 5520 + }, + { + "epoch": 1.4691325172964342, + "grad_norm": 0.5400954484939575, + "learning_rate": 1.621223660218536e-07, + "loss": 0.1781, + "step": 5521 + }, + { + "epoch": 1.4693986162852581, + "grad_norm": 0.2695242464542389, + "learning_rate": 1.621091307706047e-07, + "loss": 0.1797, + "step": 5522 + }, + { + "epoch": 1.469664715274082, + "grad_norm": 0.25860339403152466, + "learning_rate": 1.6209589374787713e-07, + "loss": 0.1882, + "step": 5523 + }, + { + "epoch": 1.4699308142629057, + "grad_norm": 0.29801440238952637, + "learning_rate": 1.6208265495404857e-07, + "loss": 0.2007, + "step": 5524 + }, + { + "epoch": 1.4701969132517296, + "grad_norm": 0.31492024660110474, + "learning_rate": 1.6206941438949647e-07, + "loss": 0.1856, + "step": 5525 + }, + { + "epoch": 1.4704630122405535, + "grad_norm": 0.38650888204574585, + "learning_rate": 1.6205617205459855e-07, + "loss": 0.2054, + "step": 5526 + }, + { + "epoch": 1.4707291112293772, + "grad_norm": 0.43842193484306335, + "learning_rate": 1.6204292794973248e-07, + "loss": 0.2326, + "step": 5527 + }, + { + "epoch": 1.4709952102182011, + "grad_norm": 0.287841260433197, + "learning_rate": 1.6202968207527604e-07, + "loss": 0.1802, + "step": 5528 + }, + { + "epoch": 1.471261309207025, + "grad_norm": 0.4084819555282593, + "learning_rate": 1.6201643443160707e-07, + "loss": 0.1848, + "step": 5529 + }, + { + "epoch": 1.471527408195849, + "grad_norm": 0.32320573925971985, + "learning_rate": 1.620031850191033e-07, + "loss": 0.2026, + "step": 5530 + }, + { + "epoch": 1.4717935071846728, + "grad_norm": 0.2410281002521515, + "learning_rate": 1.6198993383814273e-07, + "loss": 0.1783, + "step": 5531 + }, + { + "epoch": 1.4720596061734965, + "grad_norm": 0.40013206005096436, + "learning_rate": 1.6197668088910327e-07, + "loss": 0.2163, + "step": 5532 + }, + { + "epoch": 1.4723257051623204, + "grad_norm": 0.42650002241134644, + "learning_rate": 1.6196342617236294e-07, + "loss": 0.1962, + "step": 5533 + }, + { + "epoch": 1.4725918041511443, + "grad_norm": 0.24698691070079803, + "learning_rate": 1.6195016968829976e-07, + "loss": 0.1749, + "step": 5534 + }, + { + "epoch": 1.472857903139968, + "grad_norm": 0.2870747148990631, + "learning_rate": 1.6193691143729185e-07, + "loss": 0.1795, + "step": 5535 + }, + { + "epoch": 1.473124002128792, + "grad_norm": 0.3793070912361145, + "learning_rate": 1.6192365141971736e-07, + "loss": 0.209, + "step": 5536 + }, + { + "epoch": 1.4733901011176158, + "grad_norm": 0.3078005313873291, + "learning_rate": 1.6191038963595451e-07, + "loss": 0.1849, + "step": 5537 + }, + { + "epoch": 1.4736562001064395, + "grad_norm": 0.2778741121292114, + "learning_rate": 1.6189712608638155e-07, + "loss": 0.1831, + "step": 5538 + }, + { + "epoch": 1.4739222990952634, + "grad_norm": 0.2729390561580658, + "learning_rate": 1.6188386077137678e-07, + "loss": 0.1899, + "step": 5539 + }, + { + "epoch": 1.4741883980840873, + "grad_norm": 0.364472359418869, + "learning_rate": 1.6187059369131853e-07, + "loss": 0.2065, + "step": 5540 + }, + { + "epoch": 1.474454497072911, + "grad_norm": 0.31157511472702026, + "learning_rate": 1.618573248465852e-07, + "loss": 0.1888, + "step": 5541 + }, + { + "epoch": 1.474720596061735, + "grad_norm": 0.3516826629638672, + "learning_rate": 1.618440542375553e-07, + "loss": 0.2042, + "step": 5542 + }, + { + "epoch": 1.4749866950505588, + "grad_norm": 0.34669947624206543, + "learning_rate": 1.6183078186460728e-07, + "loss": 0.2123, + "step": 5543 + }, + { + "epoch": 1.4752527940393827, + "grad_norm": 0.3215189278125763, + "learning_rate": 1.6181750772811972e-07, + "loss": 0.1883, + "step": 5544 + }, + { + "epoch": 1.4755188930282066, + "grad_norm": 0.3491046726703644, + "learning_rate": 1.6180423182847123e-07, + "loss": 0.1899, + "step": 5545 + }, + { + "epoch": 1.4757849920170303, + "grad_norm": 0.3500817120075226, + "learning_rate": 1.6179095416604044e-07, + "loss": 0.1854, + "step": 5546 + }, + { + "epoch": 1.4760510910058542, + "grad_norm": 0.3827027678489685, + "learning_rate": 1.6177767474120607e-07, + "loss": 0.2259, + "step": 5547 + }, + { + "epoch": 1.476317189994678, + "grad_norm": 0.3113328218460083, + "learning_rate": 1.617643935543469e-07, + "loss": 0.1805, + "step": 5548 + }, + { + "epoch": 1.4765832889835018, + "grad_norm": 0.31585991382598877, + "learning_rate": 1.617511106058417e-07, + "loss": 0.1798, + "step": 5549 + }, + { + "epoch": 1.4768493879723257, + "grad_norm": 0.3122009336948395, + "learning_rate": 1.6173782589606934e-07, + "loss": 0.1733, + "step": 5550 + }, + { + "epoch": 1.4771154869611496, + "grad_norm": 0.27844226360321045, + "learning_rate": 1.6172453942540877e-07, + "loss": 0.1911, + "step": 5551 + }, + { + "epoch": 1.4773815859499733, + "grad_norm": 0.24440401792526245, + "learning_rate": 1.6171125119423884e-07, + "loss": 0.1778, + "step": 5552 + }, + { + "epoch": 1.4776476849387972, + "grad_norm": 0.26075711846351624, + "learning_rate": 1.6169796120293863e-07, + "loss": 0.1787, + "step": 5553 + }, + { + "epoch": 1.477913783927621, + "grad_norm": 0.5754770040512085, + "learning_rate": 1.616846694518872e-07, + "loss": 0.2026, + "step": 5554 + }, + { + "epoch": 1.478179882916445, + "grad_norm": 0.39961910247802734, + "learning_rate": 1.6167137594146368e-07, + "loss": 0.192, + "step": 5555 + }, + { + "epoch": 1.4784459819052689, + "grad_norm": 0.3806179165840149, + "learning_rate": 1.6165808067204717e-07, + "loss": 0.1888, + "step": 5556 + }, + { + "epoch": 1.4787120808940926, + "grad_norm": 0.3556552231311798, + "learning_rate": 1.616447836440169e-07, + "loss": 0.1985, + "step": 5557 + }, + { + "epoch": 1.4789781798829165, + "grad_norm": 0.2742424011230469, + "learning_rate": 1.6163148485775216e-07, + "loss": 0.1886, + "step": 5558 + }, + { + "epoch": 1.4792442788717404, + "grad_norm": 0.2901027798652649, + "learning_rate": 1.6161818431363218e-07, + "loss": 0.18, + "step": 5559 + }, + { + "epoch": 1.479510377860564, + "grad_norm": 0.26173368096351624, + "learning_rate": 1.616048820120364e-07, + "loss": 0.1995, + "step": 5560 + }, + { + "epoch": 1.479776476849388, + "grad_norm": 0.2651582658290863, + "learning_rate": 1.615915779533442e-07, + "loss": 0.1746, + "step": 5561 + }, + { + "epoch": 1.4800425758382119, + "grad_norm": 0.4599456191062927, + "learning_rate": 1.6157827213793505e-07, + "loss": 0.1813, + "step": 5562 + }, + { + "epoch": 1.4803086748270355, + "grad_norm": 0.3906121551990509, + "learning_rate": 1.6156496456618843e-07, + "loss": 0.2181, + "step": 5563 + }, + { + "epoch": 1.4805747738158594, + "grad_norm": 0.26760897040367126, + "learning_rate": 1.615516552384839e-07, + "loss": 0.184, + "step": 5564 + }, + { + "epoch": 1.4808408728046834, + "grad_norm": 0.3235023617744446, + "learning_rate": 1.615383441552011e-07, + "loss": 0.182, + "step": 5565 + }, + { + "epoch": 1.4811069717935073, + "grad_norm": 0.26443684101104736, + "learning_rate": 1.6152503131671967e-07, + "loss": 0.1827, + "step": 5566 + }, + { + "epoch": 1.4813730707823312, + "grad_norm": 0.2753816246986389, + "learning_rate": 1.6151171672341931e-07, + "loss": 0.1834, + "step": 5567 + }, + { + "epoch": 1.4816391697711548, + "grad_norm": 0.3064538538455963, + "learning_rate": 1.6149840037567981e-07, + "loss": 0.1979, + "step": 5568 + }, + { + "epoch": 1.4819052687599787, + "grad_norm": 0.27975016832351685, + "learning_rate": 1.6148508227388098e-07, + "loss": 0.1942, + "step": 5569 + }, + { + "epoch": 1.4821713677488026, + "grad_norm": 0.275838702917099, + "learning_rate": 1.6147176241840263e-07, + "loss": 0.2015, + "step": 5570 + }, + { + "epoch": 1.4824374667376263, + "grad_norm": 0.3388344943523407, + "learning_rate": 1.614584408096247e-07, + "loss": 0.2093, + "step": 5571 + }, + { + "epoch": 1.4827035657264502, + "grad_norm": 0.28812143206596375, + "learning_rate": 1.6144511744792712e-07, + "loss": 0.1785, + "step": 5572 + }, + { + "epoch": 1.4829696647152741, + "grad_norm": 0.26841023564338684, + "learning_rate": 1.6143179233369e-07, + "loss": 0.1824, + "step": 5573 + }, + { + "epoch": 1.4832357637040978, + "grad_norm": 0.2452857345342636, + "learning_rate": 1.6141846546729324e-07, + "loss": 0.1862, + "step": 5574 + }, + { + "epoch": 1.4835018626929217, + "grad_norm": 0.2698812484741211, + "learning_rate": 1.6140513684911712e-07, + "loss": 0.1835, + "step": 5575 + }, + { + "epoch": 1.4837679616817456, + "grad_norm": 0.29529255628585815, + "learning_rate": 1.6139180647954167e-07, + "loss": 0.2084, + "step": 5576 + }, + { + "epoch": 1.4840340606705695, + "grad_norm": 0.2819269895553589, + "learning_rate": 1.6137847435894716e-07, + "loss": 0.1879, + "step": 5577 + }, + { + "epoch": 1.4843001596593932, + "grad_norm": 0.32444605231285095, + "learning_rate": 1.6136514048771384e-07, + "loss": 0.2011, + "step": 5578 + }, + { + "epoch": 1.4845662586482171, + "grad_norm": 0.357843816280365, + "learning_rate": 1.61351804866222e-07, + "loss": 0.201, + "step": 5579 + }, + { + "epoch": 1.484832357637041, + "grad_norm": 0.28322991728782654, + "learning_rate": 1.61338467494852e-07, + "loss": 0.1944, + "step": 5580 + }, + { + "epoch": 1.485098456625865, + "grad_norm": 0.2889121472835541, + "learning_rate": 1.6132512837398427e-07, + "loss": 0.1912, + "step": 5581 + }, + { + "epoch": 1.4853645556146886, + "grad_norm": 0.27391496300697327, + "learning_rate": 1.6131178750399927e-07, + "loss": 0.1848, + "step": 5582 + }, + { + "epoch": 1.4856306546035125, + "grad_norm": 0.3244014084339142, + "learning_rate": 1.612984448852775e-07, + "loss": 0.2021, + "step": 5583 + }, + { + "epoch": 1.4858967535923364, + "grad_norm": 0.44970688223838806, + "learning_rate": 1.6128510051819953e-07, + "loss": 0.2153, + "step": 5584 + }, + { + "epoch": 1.48616285258116, + "grad_norm": 0.27899229526519775, + "learning_rate": 1.6127175440314594e-07, + "loss": 0.2007, + "step": 5585 + }, + { + "epoch": 1.486428951569984, + "grad_norm": 0.2732450067996979, + "learning_rate": 1.6125840654049738e-07, + "loss": 0.1945, + "step": 5586 + }, + { + "epoch": 1.486695050558808, + "grad_norm": 0.4331805408000946, + "learning_rate": 1.6124505693063463e-07, + "loss": 0.199, + "step": 5587 + }, + { + "epoch": 1.4869611495476316, + "grad_norm": 0.3251754939556122, + "learning_rate": 1.6123170557393837e-07, + "loss": 0.1889, + "step": 5588 + }, + { + "epoch": 1.4872272485364555, + "grad_norm": 0.49851569533348083, + "learning_rate": 1.6121835247078942e-07, + "loss": 0.1872, + "step": 5589 + }, + { + "epoch": 1.4874933475252794, + "grad_norm": 0.3013269007205963, + "learning_rate": 1.6120499762156867e-07, + "loss": 0.189, + "step": 5590 + }, + { + "epoch": 1.4877594465141033, + "grad_norm": 0.28260305523872375, + "learning_rate": 1.6119164102665703e-07, + "loss": 0.1785, + "step": 5591 + }, + { + "epoch": 1.4880255455029272, + "grad_norm": 0.26979705691337585, + "learning_rate": 1.6117828268643545e-07, + "loss": 0.2136, + "step": 5592 + }, + { + "epoch": 1.4882916444917509, + "grad_norm": 0.264117568731308, + "learning_rate": 1.611649226012849e-07, + "loss": 0.1771, + "step": 5593 + }, + { + "epoch": 1.4885577434805748, + "grad_norm": 0.3241789937019348, + "learning_rate": 1.6115156077158648e-07, + "loss": 0.1725, + "step": 5594 + }, + { + "epoch": 1.4888238424693987, + "grad_norm": 0.3635896146297455, + "learning_rate": 1.6113819719772128e-07, + "loss": 0.1917, + "step": 5595 + }, + { + "epoch": 1.4890899414582224, + "grad_norm": 0.2726893424987793, + "learning_rate": 1.6112483188007047e-07, + "loss": 0.1927, + "step": 5596 + }, + { + "epoch": 1.4893560404470463, + "grad_norm": 0.25934964418411255, + "learning_rate": 1.611114648190152e-07, + "loss": 0.1809, + "step": 5597 + }, + { + "epoch": 1.4896221394358702, + "grad_norm": 0.2661920189857483, + "learning_rate": 1.610980960149368e-07, + "loss": 0.1828, + "step": 5598 + }, + { + "epoch": 1.4898882384246939, + "grad_norm": 0.2717171311378479, + "learning_rate": 1.6108472546821657e-07, + "loss": 0.1917, + "step": 5599 + }, + { + "epoch": 1.4901543374135178, + "grad_norm": 0.3000451624393463, + "learning_rate": 1.6107135317923578e-07, + "loss": 0.1942, + "step": 5600 + }, + { + "epoch": 1.4904204364023417, + "grad_norm": 0.24007929861545563, + "learning_rate": 1.6105797914837594e-07, + "loss": 0.1606, + "step": 5601 + }, + { + "epoch": 1.4906865353911656, + "grad_norm": 0.28771883249282837, + "learning_rate": 1.6104460337601845e-07, + "loss": 0.1989, + "step": 5602 + }, + { + "epoch": 1.4909526343799895, + "grad_norm": 0.2590891122817993, + "learning_rate": 1.610312258625448e-07, + "loss": 0.1804, + "step": 5603 + }, + { + "epoch": 1.4912187333688132, + "grad_norm": 0.37823501229286194, + "learning_rate": 1.610178466083366e-07, + "loss": 0.2086, + "step": 5604 + }, + { + "epoch": 1.491484832357637, + "grad_norm": 0.33722832798957825, + "learning_rate": 1.610044656137754e-07, + "loss": 0.1997, + "step": 5605 + }, + { + "epoch": 1.491750931346461, + "grad_norm": 0.36074885725975037, + "learning_rate": 1.6099108287924285e-07, + "loss": 0.1903, + "step": 5606 + }, + { + "epoch": 1.4920170303352847, + "grad_norm": 0.39230337738990784, + "learning_rate": 1.609776984051207e-07, + "loss": 0.2021, + "step": 5607 + }, + { + "epoch": 1.4922831293241086, + "grad_norm": 0.4228077828884125, + "learning_rate": 1.6096431219179068e-07, + "loss": 0.21, + "step": 5608 + }, + { + "epoch": 1.4925492283129325, + "grad_norm": 0.2665228843688965, + "learning_rate": 1.609509242396346e-07, + "loss": 0.17, + "step": 5609 + }, + { + "epoch": 1.4928153273017561, + "grad_norm": 0.3081474304199219, + "learning_rate": 1.609375345490343e-07, + "loss": 0.1908, + "step": 5610 + }, + { + "epoch": 1.49308142629058, + "grad_norm": 0.2903885245323181, + "learning_rate": 1.609241431203717e-07, + "loss": 0.1966, + "step": 5611 + }, + { + "epoch": 1.493347525279404, + "grad_norm": 0.2622372806072235, + "learning_rate": 1.6091074995402871e-07, + "loss": 0.1895, + "step": 5612 + }, + { + "epoch": 1.4936136242682279, + "grad_norm": 0.3290199935436249, + "learning_rate": 1.6089735505038737e-07, + "loss": 0.1899, + "step": 5613 + }, + { + "epoch": 1.4938797232570518, + "grad_norm": 0.37768083810806274, + "learning_rate": 1.608839584098297e-07, + "loss": 0.1958, + "step": 5614 + }, + { + "epoch": 1.4941458222458754, + "grad_norm": 0.39441531896591187, + "learning_rate": 1.608705600327378e-07, + "loss": 0.1833, + "step": 5615 + }, + { + "epoch": 1.4944119212346993, + "grad_norm": 0.280860036611557, + "learning_rate": 1.6085715991949385e-07, + "loss": 0.1762, + "step": 5616 + }, + { + "epoch": 1.4946780202235233, + "grad_norm": 0.24670535326004028, + "learning_rate": 1.6084375807048005e-07, + "loss": 0.1786, + "step": 5617 + }, + { + "epoch": 1.494944119212347, + "grad_norm": 0.2623610496520996, + "learning_rate": 1.608303544860786e-07, + "loss": 0.1978, + "step": 5618 + }, + { + "epoch": 1.4952102182011708, + "grad_norm": 0.2873885929584503, + "learning_rate": 1.6081694916667187e-07, + "loss": 0.1822, + "step": 5619 + }, + { + "epoch": 1.4954763171899947, + "grad_norm": 0.4015539288520813, + "learning_rate": 1.6080354211264212e-07, + "loss": 0.1815, + "step": 5620 + }, + { + "epoch": 1.4957424161788184, + "grad_norm": 0.38336095213890076, + "learning_rate": 1.607901333243718e-07, + "loss": 0.1996, + "step": 5621 + }, + { + "epoch": 1.4960085151676423, + "grad_norm": 0.28187116980552673, + "learning_rate": 1.6077672280224334e-07, + "loss": 0.2054, + "step": 5622 + }, + { + "epoch": 1.4962746141564662, + "grad_norm": 0.4277878701686859, + "learning_rate": 1.6076331054663926e-07, + "loss": 0.215, + "step": 5623 + }, + { + "epoch": 1.49654071314529, + "grad_norm": 0.32912325859069824, + "learning_rate": 1.6074989655794206e-07, + "loss": 0.1964, + "step": 5624 + }, + { + "epoch": 1.4968068121341138, + "grad_norm": 0.3204365670681, + "learning_rate": 1.6073648083653438e-07, + "loss": 0.2056, + "step": 5625 + }, + { + "epoch": 1.4970729111229377, + "grad_norm": 0.37892621755599976, + "learning_rate": 1.6072306338279883e-07, + "loss": 0.1991, + "step": 5626 + }, + { + "epoch": 1.4973390101117616, + "grad_norm": 0.31562528014183044, + "learning_rate": 1.607096441971181e-07, + "loss": 0.2065, + "step": 5627 + }, + { + "epoch": 1.4976051091005855, + "grad_norm": 0.47971025109291077, + "learning_rate": 1.60696223279875e-07, + "loss": 0.1895, + "step": 5628 + }, + { + "epoch": 1.4978712080894092, + "grad_norm": 0.30929896235466003, + "learning_rate": 1.6068280063145226e-07, + "loss": 0.203, + "step": 5629 + }, + { + "epoch": 1.4981373070782331, + "grad_norm": 0.3057677745819092, + "learning_rate": 1.606693762522327e-07, + "loss": 0.2069, + "step": 5630 + }, + { + "epoch": 1.498403406067057, + "grad_norm": 0.347260445356369, + "learning_rate": 1.606559501425993e-07, + "loss": 0.2048, + "step": 5631 + }, + { + "epoch": 1.4986695050558807, + "grad_norm": 0.2537480294704437, + "learning_rate": 1.6064252230293488e-07, + "loss": 0.1756, + "step": 5632 + }, + { + "epoch": 1.4989356040447046, + "grad_norm": 0.26912572979927063, + "learning_rate": 1.6062909273362256e-07, + "loss": 0.1682, + "step": 5633 + }, + { + "epoch": 1.4992017030335285, + "grad_norm": 0.3800032436847687, + "learning_rate": 1.6061566143504526e-07, + "loss": 0.2048, + "step": 5634 + }, + { + "epoch": 1.4994678020223522, + "grad_norm": 0.3102400004863739, + "learning_rate": 1.6060222840758612e-07, + "loss": 0.1827, + "step": 5635 + }, + { + "epoch": 1.499733901011176, + "grad_norm": 0.25319790840148926, + "learning_rate": 1.6058879365162827e-07, + "loss": 0.2, + "step": 5636 + }, + { + "epoch": 1.5, + "grad_norm": 0.2922842502593994, + "learning_rate": 1.6057535716755494e-07, + "loss": 0.1981, + "step": 5637 + }, + { + "epoch": 1.5002660989888237, + "grad_norm": 0.2585084140300751, + "learning_rate": 1.605619189557493e-07, + "loss": 0.1823, + "step": 5638 + }, + { + "epoch": 1.5005321979776478, + "grad_norm": 0.28366509079933167, + "learning_rate": 1.6054847901659468e-07, + "loss": 0.1687, + "step": 5639 + }, + { + "epoch": 1.5007982969664715, + "grad_norm": 0.3192025125026703, + "learning_rate": 1.605350373504744e-07, + "loss": 0.1943, + "step": 5640 + }, + { + "epoch": 1.5010643959552954, + "grad_norm": 0.3368593454360962, + "learning_rate": 1.605215939577718e-07, + "loss": 0.2009, + "step": 5641 + }, + { + "epoch": 1.5013304949441193, + "grad_norm": 0.36907604336738586, + "learning_rate": 1.605081488388704e-07, + "loss": 0.1977, + "step": 5642 + }, + { + "epoch": 1.501596593932943, + "grad_norm": 0.3029351234436035, + "learning_rate": 1.6049470199415363e-07, + "loss": 0.1726, + "step": 5643 + }, + { + "epoch": 1.5018626929217669, + "grad_norm": 0.3102242350578308, + "learning_rate": 1.60481253424005e-07, + "loss": 0.182, + "step": 5644 + }, + { + "epoch": 1.5021287919105908, + "grad_norm": 0.26319634914398193, + "learning_rate": 1.6046780312880815e-07, + "loss": 0.1837, + "step": 5645 + }, + { + "epoch": 1.5023948908994145, + "grad_norm": 0.4396512508392334, + "learning_rate": 1.6045435110894667e-07, + "loss": 0.1808, + "step": 5646 + }, + { + "epoch": 1.5026609898882384, + "grad_norm": 0.28959256410598755, + "learning_rate": 1.6044089736480425e-07, + "loss": 0.2117, + "step": 5647 + }, + { + "epoch": 1.5029270888770623, + "grad_norm": 0.30066612362861633, + "learning_rate": 1.604274418967646e-07, + "loss": 0.2034, + "step": 5648 + }, + { + "epoch": 1.503193187865886, + "grad_norm": 0.29187455773353577, + "learning_rate": 1.604139847052115e-07, + "loss": 0.2038, + "step": 5649 + }, + { + "epoch": 1.50345928685471, + "grad_norm": 0.2418321967124939, + "learning_rate": 1.6040052579052884e-07, + "loss": 0.1741, + "step": 5650 + }, + { + "epoch": 1.5037253858435338, + "grad_norm": 0.3612293601036072, + "learning_rate": 1.603870651531004e-07, + "loss": 0.2254, + "step": 5651 + }, + { + "epoch": 1.5039914848323577, + "grad_norm": 0.2677214443683624, + "learning_rate": 1.603736027933102e-07, + "loss": 0.1773, + "step": 5652 + }, + { + "epoch": 1.5042575838211816, + "grad_norm": 0.2563003897666931, + "learning_rate": 1.603601387115421e-07, + "loss": 0.1953, + "step": 5653 + }, + { + "epoch": 1.5045236828100053, + "grad_norm": 0.43840134143829346, + "learning_rate": 1.6034667290818023e-07, + "loss": 0.2126, + "step": 5654 + }, + { + "epoch": 1.5047897817988292, + "grad_norm": 0.2945207357406616, + "learning_rate": 1.6033320538360858e-07, + "loss": 0.1972, + "step": 5655 + }, + { + "epoch": 1.505055880787653, + "grad_norm": 0.3216281235218048, + "learning_rate": 1.6031973613821136e-07, + "loss": 0.2115, + "step": 5656 + }, + { + "epoch": 1.5053219797764767, + "grad_norm": 0.26826485991477966, + "learning_rate": 1.6030626517237265e-07, + "loss": 0.1867, + "step": 5657 + }, + { + "epoch": 1.5055880787653007, + "grad_norm": 0.25288522243499756, + "learning_rate": 1.6029279248647673e-07, + "loss": 0.1895, + "step": 5658 + }, + { + "epoch": 1.5058541777541246, + "grad_norm": 0.26399222016334534, + "learning_rate": 1.6027931808090784e-07, + "loss": 0.1968, + "step": 5659 + }, + { + "epoch": 1.5061202767429482, + "grad_norm": 0.6246997117996216, + "learning_rate": 1.6026584195605032e-07, + "loss": 0.202, + "step": 5660 + }, + { + "epoch": 1.5063863757317724, + "grad_norm": 0.3167211413383484, + "learning_rate": 1.602523641122885e-07, + "loss": 0.1657, + "step": 5661 + }, + { + "epoch": 1.506652474720596, + "grad_norm": 0.3972227871417999, + "learning_rate": 1.602388845500068e-07, + "loss": 0.2249, + "step": 5662 + }, + { + "epoch": 1.50691857370942, + "grad_norm": 0.2696121037006378, + "learning_rate": 1.6022540326958973e-07, + "loss": 0.1833, + "step": 5663 + }, + { + "epoch": 1.5071846726982439, + "grad_norm": 0.2613315284252167, + "learning_rate": 1.602119202714218e-07, + "loss": 0.1946, + "step": 5664 + }, + { + "epoch": 1.5074507716870675, + "grad_norm": 0.33936917781829834, + "learning_rate": 1.6019843555588748e-07, + "loss": 0.2034, + "step": 5665 + }, + { + "epoch": 1.5077168706758914, + "grad_norm": 0.308160662651062, + "learning_rate": 1.601849491233715e-07, + "loss": 0.2072, + "step": 5666 + }, + { + "epoch": 1.5079829696647153, + "grad_norm": 0.27438563108444214, + "learning_rate": 1.6017146097425845e-07, + "loss": 0.2006, + "step": 5667 + }, + { + "epoch": 1.508249068653539, + "grad_norm": 0.4322134554386139, + "learning_rate": 1.6015797110893307e-07, + "loss": 0.2104, + "step": 5668 + }, + { + "epoch": 1.508515167642363, + "grad_norm": 0.2847597599029541, + "learning_rate": 1.601444795277801e-07, + "loss": 0.2031, + "step": 5669 + }, + { + "epoch": 1.5087812666311868, + "grad_norm": 0.2786535322666168, + "learning_rate": 1.6013098623118435e-07, + "loss": 0.1987, + "step": 5670 + }, + { + "epoch": 1.5090473656200105, + "grad_norm": 0.2635231018066406, + "learning_rate": 1.6011749121953068e-07, + "loss": 0.1963, + "step": 5671 + }, + { + "epoch": 1.5093134646088346, + "grad_norm": 0.2895680069923401, + "learning_rate": 1.60103994493204e-07, + "loss": 0.1913, + "step": 5672 + }, + { + "epoch": 1.5095795635976583, + "grad_norm": 0.33629825711250305, + "learning_rate": 1.6009049605258926e-07, + "loss": 0.1813, + "step": 5673 + }, + { + "epoch": 1.509845662586482, + "grad_norm": 0.38237324357032776, + "learning_rate": 1.600769958980715e-07, + "loss": 0.2026, + "step": 5674 + }, + { + "epoch": 1.5101117615753061, + "grad_norm": 0.2620169222354889, + "learning_rate": 1.6006349403003567e-07, + "loss": 0.1703, + "step": 5675 + }, + { + "epoch": 1.5103778605641298, + "grad_norm": 0.32493025064468384, + "learning_rate": 1.6004999044886698e-07, + "loss": 0.1751, + "step": 5676 + }, + { + "epoch": 1.5106439595529537, + "grad_norm": 0.2741832435131073, + "learning_rate": 1.600364851549505e-07, + "loss": 0.1884, + "step": 5677 + }, + { + "epoch": 1.5109100585417776, + "grad_norm": 0.24761444330215454, + "learning_rate": 1.600229781486715e-07, + "loss": 0.1748, + "step": 5678 + }, + { + "epoch": 1.5111761575306013, + "grad_norm": 0.27576056122779846, + "learning_rate": 1.6000946943041518e-07, + "loss": 0.1883, + "step": 5679 + }, + { + "epoch": 1.5114422565194252, + "grad_norm": 0.26301121711730957, + "learning_rate": 1.5999595900056683e-07, + "loss": 0.1896, + "step": 5680 + }, + { + "epoch": 1.5117083555082491, + "grad_norm": 0.25352418422698975, + "learning_rate": 1.5998244685951182e-07, + "loss": 0.1711, + "step": 5681 + }, + { + "epoch": 1.5119744544970728, + "grad_norm": 0.26682671904563904, + "learning_rate": 1.5996893300763552e-07, + "loss": 0.1834, + "step": 5682 + }, + { + "epoch": 1.5122405534858967, + "grad_norm": 0.35749319195747375, + "learning_rate": 1.599554174453234e-07, + "loss": 0.2249, + "step": 5683 + }, + { + "epoch": 1.5125066524747206, + "grad_norm": 0.25020596385002136, + "learning_rate": 1.5994190017296095e-07, + "loss": 0.1777, + "step": 5684 + }, + { + "epoch": 1.5127727514635443, + "grad_norm": 0.2493213564157486, + "learning_rate": 1.599283811909337e-07, + "loss": 0.1633, + "step": 5685 + }, + { + "epoch": 1.5130388504523684, + "grad_norm": 0.35861536860466003, + "learning_rate": 1.599148604996272e-07, + "loss": 0.1992, + "step": 5686 + }, + { + "epoch": 1.513304949441192, + "grad_norm": 0.37098002433776855, + "learning_rate": 1.599013380994272e-07, + "loss": 0.2134, + "step": 5687 + }, + { + "epoch": 1.513571048430016, + "grad_norm": 0.36927443742752075, + "learning_rate": 1.5988781399071924e-07, + "loss": 0.2031, + "step": 5688 + }, + { + "epoch": 1.51383714741884, + "grad_norm": 0.2536589503288269, + "learning_rate": 1.5987428817388917e-07, + "loss": 0.1787, + "step": 5689 + }, + { + "epoch": 1.5141032464076636, + "grad_norm": 0.4420652985572815, + "learning_rate": 1.5986076064932273e-07, + "loss": 0.203, + "step": 5690 + }, + { + "epoch": 1.5143693453964875, + "grad_norm": 0.27486446499824524, + "learning_rate": 1.5984723141740575e-07, + "loss": 0.1875, + "step": 5691 + }, + { + "epoch": 1.5146354443853114, + "grad_norm": 0.3420518934726715, + "learning_rate": 1.598337004785241e-07, + "loss": 0.2078, + "step": 5692 + }, + { + "epoch": 1.514901543374135, + "grad_norm": 0.27418991923332214, + "learning_rate": 1.5982016783306374e-07, + "loss": 0.1829, + "step": 5693 + }, + { + "epoch": 1.515167642362959, + "grad_norm": 0.46611934900283813, + "learning_rate": 1.598066334814106e-07, + "loss": 0.2223, + "step": 5694 + }, + { + "epoch": 1.5154337413517829, + "grad_norm": 0.33801066875457764, + "learning_rate": 1.5979309742395078e-07, + "loss": 0.1848, + "step": 5695 + }, + { + "epoch": 1.5156998403406066, + "grad_norm": 0.2817387282848358, + "learning_rate": 1.597795596610703e-07, + "loss": 0.1751, + "step": 5696 + }, + { + "epoch": 1.5159659393294307, + "grad_norm": 0.2812216877937317, + "learning_rate": 1.597660201931553e-07, + "loss": 0.194, + "step": 5697 + }, + { + "epoch": 1.5162320383182544, + "grad_norm": 0.3449695110321045, + "learning_rate": 1.5975247902059196e-07, + "loss": 0.2032, + "step": 5698 + }, + { + "epoch": 1.5164981373070783, + "grad_norm": 0.26253554224967957, + "learning_rate": 1.5973893614376646e-07, + "loss": 0.1899, + "step": 5699 + }, + { + "epoch": 1.5167642362959022, + "grad_norm": 0.3848259449005127, + "learning_rate": 1.5972539156306515e-07, + "loss": 0.2018, + "step": 5700 + }, + { + "epoch": 1.5170303352847259, + "grad_norm": 0.2931479215621948, + "learning_rate": 1.5971184527887428e-07, + "loss": 0.1976, + "step": 5701 + }, + { + "epoch": 1.5172964342735498, + "grad_norm": 1.0635032653808594, + "learning_rate": 1.5969829729158026e-07, + "loss": 0.201, + "step": 5702 + }, + { + "epoch": 1.5175625332623737, + "grad_norm": 0.26839327812194824, + "learning_rate": 1.5968474760156947e-07, + "loss": 0.1843, + "step": 5703 + }, + { + "epoch": 1.5178286322511974, + "grad_norm": 0.3710900843143463, + "learning_rate": 1.596711962092284e-07, + "loss": 0.2084, + "step": 5704 + }, + { + "epoch": 1.5180947312400213, + "grad_norm": 0.34855711460113525, + "learning_rate": 1.5965764311494358e-07, + "loss": 0.2069, + "step": 5705 + }, + { + "epoch": 1.5183608302288452, + "grad_norm": 0.2682419717311859, + "learning_rate": 1.5964408831910153e-07, + "loss": 0.1912, + "step": 5706 + }, + { + "epoch": 1.5186269292176688, + "grad_norm": 0.2535572350025177, + "learning_rate": 1.596305318220889e-07, + "loss": 0.1878, + "step": 5707 + }, + { + "epoch": 1.518893028206493, + "grad_norm": 0.2531608045101166, + "learning_rate": 1.5961697362429227e-07, + "loss": 0.1788, + "step": 5708 + }, + { + "epoch": 1.5191591271953166, + "grad_norm": 0.33744391798973083, + "learning_rate": 1.5960341372609845e-07, + "loss": 0.2196, + "step": 5709 + }, + { + "epoch": 1.5194252261841406, + "grad_norm": 0.251177579164505, + "learning_rate": 1.5958985212789412e-07, + "loss": 0.1771, + "step": 5710 + }, + { + "epoch": 1.5196913251729645, + "grad_norm": 0.2753576636314392, + "learning_rate": 1.5957628883006615e-07, + "loss": 0.1871, + "step": 5711 + }, + { + "epoch": 1.5199574241617881, + "grad_norm": 0.2840087413787842, + "learning_rate": 1.5956272383300133e-07, + "loss": 0.1933, + "step": 5712 + }, + { + "epoch": 1.520223523150612, + "grad_norm": 0.3392810523509979, + "learning_rate": 1.595491571370866e-07, + "loss": 0.1992, + "step": 5713 + }, + { + "epoch": 1.520489622139436, + "grad_norm": 0.45489266514778137, + "learning_rate": 1.5953558874270892e-07, + "loss": 0.1945, + "step": 5714 + }, + { + "epoch": 1.5207557211282596, + "grad_norm": 0.26395511627197266, + "learning_rate": 1.5952201865025523e-07, + "loss": 0.1836, + "step": 5715 + }, + { + "epoch": 1.5210218201170835, + "grad_norm": 0.3136470913887024, + "learning_rate": 1.5950844686011265e-07, + "loss": 0.1999, + "step": 5716 + }, + { + "epoch": 1.5212879191059074, + "grad_norm": 0.30970823764801025, + "learning_rate": 1.5949487337266821e-07, + "loss": 0.2061, + "step": 5717 + }, + { + "epoch": 1.5215540180947311, + "grad_norm": 0.2569981515407562, + "learning_rate": 1.5948129818830908e-07, + "loss": 0.189, + "step": 5718 + }, + { + "epoch": 1.5218201170835552, + "grad_norm": 0.3652020990848541, + "learning_rate": 1.5946772130742248e-07, + "loss": 0.1938, + "step": 5719 + }, + { + "epoch": 1.522086216072379, + "grad_norm": 0.26675522327423096, + "learning_rate": 1.5945414273039562e-07, + "loss": 0.1693, + "step": 5720 + }, + { + "epoch": 1.5223523150612026, + "grad_norm": 0.2642328441143036, + "learning_rate": 1.5944056245761575e-07, + "loss": 0.1888, + "step": 5721 + }, + { + "epoch": 1.5226184140500267, + "grad_norm": 0.2517027258872986, + "learning_rate": 1.594269804894703e-07, + "loss": 0.166, + "step": 5722 + }, + { + "epoch": 1.5228845130388504, + "grad_norm": 0.25819602608680725, + "learning_rate": 1.5941339682634658e-07, + "loss": 0.1888, + "step": 5723 + }, + { + "epoch": 1.5231506120276743, + "grad_norm": 0.273853063583374, + "learning_rate": 1.5939981146863206e-07, + "loss": 0.2013, + "step": 5724 + }, + { + "epoch": 1.5234167110164982, + "grad_norm": 0.2865620255470276, + "learning_rate": 1.593862244167142e-07, + "loss": 0.1813, + "step": 5725 + }, + { + "epoch": 1.523682810005322, + "grad_norm": 0.3503710627555847, + "learning_rate": 1.5937263567098054e-07, + "loss": 0.1962, + "step": 5726 + }, + { + "epoch": 1.5239489089941458, + "grad_norm": 0.2925717532634735, + "learning_rate": 1.593590452318187e-07, + "loss": 0.1949, + "step": 5727 + }, + { + "epoch": 1.5242150079829697, + "grad_norm": 0.43404731154441833, + "learning_rate": 1.5934545309961621e-07, + "loss": 0.1988, + "step": 5728 + }, + { + "epoch": 1.5244811069717934, + "grad_norm": 0.2677386701107025, + "learning_rate": 1.5933185927476082e-07, + "loss": 0.1904, + "step": 5729 + }, + { + "epoch": 1.5247472059606173, + "grad_norm": 0.3639408051967621, + "learning_rate": 1.5931826375764027e-07, + "loss": 0.2036, + "step": 5730 + }, + { + "epoch": 1.5250133049494412, + "grad_norm": 0.405720978975296, + "learning_rate": 1.5930466654864226e-07, + "loss": 0.193, + "step": 5731 + }, + { + "epoch": 1.5252794039382649, + "grad_norm": 0.29278644919395447, + "learning_rate": 1.5929106764815467e-07, + "loss": 0.2028, + "step": 5732 + }, + { + "epoch": 1.525545502927089, + "grad_norm": 0.38977399468421936, + "learning_rate": 1.5927746705656531e-07, + "loss": 0.2047, + "step": 5733 + }, + { + "epoch": 1.5258116019159127, + "grad_norm": 0.2908373475074768, + "learning_rate": 1.5926386477426218e-07, + "loss": 0.1987, + "step": 5734 + }, + { + "epoch": 1.5260777009047366, + "grad_norm": 0.48685023188591003, + "learning_rate": 1.5925026080163314e-07, + "loss": 0.2196, + "step": 5735 + }, + { + "epoch": 1.5263437998935605, + "grad_norm": 0.24608665704727173, + "learning_rate": 1.5923665513906627e-07, + "loss": 0.1779, + "step": 5736 + }, + { + "epoch": 1.5266098988823842, + "grad_norm": 0.2737818956375122, + "learning_rate": 1.5922304778694965e-07, + "loss": 0.1897, + "step": 5737 + }, + { + "epoch": 1.526875997871208, + "grad_norm": 0.270900696516037, + "learning_rate": 1.5920943874567135e-07, + "loss": 0.1888, + "step": 5738 + }, + { + "epoch": 1.527142096860032, + "grad_norm": 0.31160569190979004, + "learning_rate": 1.5919582801561956e-07, + "loss": 0.1934, + "step": 5739 + }, + { + "epoch": 1.5274081958488557, + "grad_norm": 0.2516506016254425, + "learning_rate": 1.5918221559718244e-07, + "loss": 0.1887, + "step": 5740 + }, + { + "epoch": 1.5276742948376796, + "grad_norm": 0.2670067548751831, + "learning_rate": 1.5916860149074825e-07, + "loss": 0.1933, + "step": 5741 + }, + { + "epoch": 1.5279403938265035, + "grad_norm": 0.33205142617225647, + "learning_rate": 1.5915498569670535e-07, + "loss": 0.1745, + "step": 5742 + }, + { + "epoch": 1.5282064928153272, + "grad_norm": 0.4011404812335968, + "learning_rate": 1.5914136821544203e-07, + "loss": 0.2032, + "step": 5743 + }, + { + "epoch": 1.5284725918041513, + "grad_norm": 0.2823942303657532, + "learning_rate": 1.591277490473467e-07, + "loss": 0.1987, + "step": 5744 + }, + { + "epoch": 1.528738690792975, + "grad_norm": 0.3336486220359802, + "learning_rate": 1.591141281928078e-07, + "loss": 0.1963, + "step": 5745 + }, + { + "epoch": 1.5290047897817989, + "grad_norm": 0.25309866666793823, + "learning_rate": 1.5910050565221385e-07, + "loss": 0.1755, + "step": 5746 + }, + { + "epoch": 1.5292708887706228, + "grad_norm": 0.2757182717323303, + "learning_rate": 1.5908688142595338e-07, + "loss": 0.2007, + "step": 5747 + }, + { + "epoch": 1.5295369877594465, + "grad_norm": 0.28348928689956665, + "learning_rate": 1.5907325551441503e-07, + "loss": 0.1958, + "step": 5748 + }, + { + "epoch": 1.5298030867482704, + "grad_norm": 0.3098370134830475, + "learning_rate": 1.5905962791798733e-07, + "loss": 0.2069, + "step": 5749 + }, + { + "epoch": 1.5300691857370943, + "grad_norm": 0.32574573159217834, + "learning_rate": 1.5904599863705906e-07, + "loss": 0.1945, + "step": 5750 + }, + { + "epoch": 1.530335284725918, + "grad_norm": 0.25183960795402527, + "learning_rate": 1.590323676720189e-07, + "loss": 0.1761, + "step": 5751 + }, + { + "epoch": 1.5306013837147419, + "grad_norm": 0.4128146469593048, + "learning_rate": 1.5901873502325565e-07, + "loss": 0.193, + "step": 5752 + }, + { + "epoch": 1.5308674827035658, + "grad_norm": 0.31971126794815063, + "learning_rate": 1.5900510069115816e-07, + "loss": 0.1878, + "step": 5753 + }, + { + "epoch": 1.5311335816923894, + "grad_norm": 0.28747954964637756, + "learning_rate": 1.589914646761153e-07, + "loss": 0.2024, + "step": 5754 + }, + { + "epoch": 1.5313996806812136, + "grad_norm": 0.3339640200138092, + "learning_rate": 1.5897782697851595e-07, + "loss": 0.1793, + "step": 5755 + }, + { + "epoch": 1.5316657796700373, + "grad_norm": 0.41809895634651184, + "learning_rate": 1.5896418759874918e-07, + "loss": 0.1995, + "step": 5756 + }, + { + "epoch": 1.531931878658861, + "grad_norm": 0.35737714171409607, + "learning_rate": 1.589505465372039e-07, + "loss": 0.1941, + "step": 5757 + }, + { + "epoch": 1.532197977647685, + "grad_norm": 0.24420058727264404, + "learning_rate": 1.589369037942693e-07, + "loss": 0.1824, + "step": 5758 + }, + { + "epoch": 1.5324640766365087, + "grad_norm": 0.27380260825157166, + "learning_rate": 1.589232593703344e-07, + "loss": 0.187, + "step": 5759 + }, + { + "epoch": 1.5327301756253326, + "grad_norm": 0.32281777262687683, + "learning_rate": 1.5890961326578844e-07, + "loss": 0.2, + "step": 5760 + }, + { + "epoch": 1.5329962746141566, + "grad_norm": 0.33370649814605713, + "learning_rate": 1.588959654810206e-07, + "loss": 0.1818, + "step": 5761 + }, + { + "epoch": 1.5332623736029802, + "grad_norm": 0.28226399421691895, + "learning_rate": 1.588823160164201e-07, + "loss": 0.1829, + "step": 5762 + }, + { + "epoch": 1.5335284725918041, + "grad_norm": 0.34926220774650574, + "learning_rate": 1.5886866487237632e-07, + "loss": 0.1996, + "step": 5763 + }, + { + "epoch": 1.533794571580628, + "grad_norm": 0.3033214211463928, + "learning_rate": 1.5885501204927857e-07, + "loss": 0.1745, + "step": 5764 + }, + { + "epoch": 1.5340606705694517, + "grad_norm": 0.2778796851634979, + "learning_rate": 1.588413575475163e-07, + "loss": 0.1927, + "step": 5765 + }, + { + "epoch": 1.5343267695582756, + "grad_norm": 0.4828703999519348, + "learning_rate": 1.5882770136747894e-07, + "loss": 0.1956, + "step": 5766 + }, + { + "epoch": 1.5345928685470995, + "grad_norm": 0.2870752513408661, + "learning_rate": 1.58814043509556e-07, + "loss": 0.1972, + "step": 5767 + }, + { + "epoch": 1.5348589675359232, + "grad_norm": 0.34477075934410095, + "learning_rate": 1.58800383974137e-07, + "loss": 0.2066, + "step": 5768 + }, + { + "epoch": 1.5351250665247473, + "grad_norm": 0.36452150344848633, + "learning_rate": 1.5878672276161158e-07, + "loss": 0.1965, + "step": 5769 + }, + { + "epoch": 1.535391165513571, + "grad_norm": 0.3584480881690979, + "learning_rate": 1.5877305987236936e-07, + "loss": 0.2, + "step": 5770 + }, + { + "epoch": 1.535657264502395, + "grad_norm": 0.3044946491718292, + "learning_rate": 1.5875939530680002e-07, + "loss": 0.1879, + "step": 5771 + }, + { + "epoch": 1.5359233634912188, + "grad_norm": 0.2763666808605194, + "learning_rate": 1.5874572906529333e-07, + "loss": 0.1862, + "step": 5772 + }, + { + "epoch": 1.5361894624800425, + "grad_norm": 0.34688693284988403, + "learning_rate": 1.5873206114823908e-07, + "loss": 0.1986, + "step": 5773 + }, + { + "epoch": 1.5364555614688664, + "grad_norm": 0.24820555746555328, + "learning_rate": 1.587183915560271e-07, + "loss": 0.1676, + "step": 5774 + }, + { + "epoch": 1.5367216604576903, + "grad_norm": 0.2596490681171417, + "learning_rate": 1.5870472028904727e-07, + "loss": 0.1846, + "step": 5775 + }, + { + "epoch": 1.536987759446514, + "grad_norm": 0.26451337337493896, + "learning_rate": 1.5869104734768951e-07, + "loss": 0.1788, + "step": 5776 + }, + { + "epoch": 1.537253858435338, + "grad_norm": 0.3308299481868744, + "learning_rate": 1.5867737273234382e-07, + "loss": 0.1999, + "step": 5777 + }, + { + "epoch": 1.5375199574241618, + "grad_norm": 0.3132760524749756, + "learning_rate": 1.5866369644340023e-07, + "loss": 0.1981, + "step": 5778 + }, + { + "epoch": 1.5377860564129855, + "grad_norm": 0.44991186261177063, + "learning_rate": 1.586500184812488e-07, + "loss": 0.1972, + "step": 5779 + }, + { + "epoch": 1.5380521554018096, + "grad_norm": 0.33744972944259644, + "learning_rate": 1.5863633884627967e-07, + "loss": 0.1853, + "step": 5780 + }, + { + "epoch": 1.5383182543906333, + "grad_norm": 0.28341448307037354, + "learning_rate": 1.58622657538883e-07, + "loss": 0.1985, + "step": 5781 + }, + { + "epoch": 1.5385843533794572, + "grad_norm": 0.43001776933670044, + "learning_rate": 1.5860897455944898e-07, + "loss": 0.1791, + "step": 5782 + }, + { + "epoch": 1.538850452368281, + "grad_norm": 0.39919257164001465, + "learning_rate": 1.5859528990836797e-07, + "loss": 0.2076, + "step": 5783 + }, + { + "epoch": 1.5391165513571048, + "grad_norm": 0.34484243392944336, + "learning_rate": 1.5858160358603017e-07, + "loss": 0.1961, + "step": 5784 + }, + { + "epoch": 1.5393826503459287, + "grad_norm": 0.3501985967159271, + "learning_rate": 1.58567915592826e-07, + "loss": 0.1706, + "step": 5785 + }, + { + "epoch": 1.5396487493347526, + "grad_norm": 0.2412191778421402, + "learning_rate": 1.5855422592914586e-07, + "loss": 0.1877, + "step": 5786 + }, + { + "epoch": 1.5399148483235763, + "grad_norm": 0.2975636124610901, + "learning_rate": 1.5854053459538022e-07, + "loss": 0.2032, + "step": 5787 + }, + { + "epoch": 1.5401809473124002, + "grad_norm": 0.2925753891468048, + "learning_rate": 1.5852684159191957e-07, + "loss": 0.2109, + "step": 5788 + }, + { + "epoch": 1.540447046301224, + "grad_norm": 0.26660236716270447, + "learning_rate": 1.5851314691915448e-07, + "loss": 0.1745, + "step": 5789 + }, + { + "epoch": 1.5407131452900478, + "grad_norm": 0.28732889890670776, + "learning_rate": 1.584994505774755e-07, + "loss": 0.2185, + "step": 5790 + }, + { + "epoch": 1.540979244278872, + "grad_norm": 0.32745474576950073, + "learning_rate": 1.5848575256727332e-07, + "loss": 0.1822, + "step": 5791 + }, + { + "epoch": 1.5412453432676956, + "grad_norm": 0.24326995015144348, + "learning_rate": 1.5847205288893863e-07, + "loss": 0.1794, + "step": 5792 + }, + { + "epoch": 1.5415114422565193, + "grad_norm": 0.3336643576622009, + "learning_rate": 1.5845835154286218e-07, + "loss": 0.1884, + "step": 5793 + }, + { + "epoch": 1.5417775412453434, + "grad_norm": 0.2939460575580597, + "learning_rate": 1.5844464852943475e-07, + "loss": 0.184, + "step": 5794 + }, + { + "epoch": 1.542043640234167, + "grad_norm": 0.2794884145259857, + "learning_rate": 1.584309438490472e-07, + "loss": 0.1869, + "step": 5795 + }, + { + "epoch": 1.542309739222991, + "grad_norm": 0.347869336605072, + "learning_rate": 1.5841723750209037e-07, + "loss": 0.1988, + "step": 5796 + }, + { + "epoch": 1.5425758382118149, + "grad_norm": 0.4081515073776245, + "learning_rate": 1.5840352948895523e-07, + "loss": 0.218, + "step": 5797 + }, + { + "epoch": 1.5428419372006386, + "grad_norm": 0.26127809286117554, + "learning_rate": 1.5838981981003273e-07, + "loss": 0.1775, + "step": 5798 + }, + { + "epoch": 1.5431080361894625, + "grad_norm": 0.2565302848815918, + "learning_rate": 1.5837610846571394e-07, + "loss": 0.1906, + "step": 5799 + }, + { + "epoch": 1.5433741351782864, + "grad_norm": 0.35519492626190186, + "learning_rate": 1.5836239545638988e-07, + "loss": 0.1984, + "step": 5800 + }, + { + "epoch": 1.54364023416711, + "grad_norm": 0.2762712240219116, + "learning_rate": 1.5834868078245173e-07, + "loss": 0.1862, + "step": 5801 + }, + { + "epoch": 1.543906333155934, + "grad_norm": 0.27193111181259155, + "learning_rate": 1.5833496444429061e-07, + "loss": 0.1879, + "step": 5802 + }, + { + "epoch": 1.5441724321447579, + "grad_norm": 0.276645690202713, + "learning_rate": 1.5832124644229779e-07, + "loss": 0.1695, + "step": 5803 + }, + { + "epoch": 1.5444385311335815, + "grad_norm": 0.28299564123153687, + "learning_rate": 1.5830752677686446e-07, + "loss": 0.1958, + "step": 5804 + }, + { + "epoch": 1.5447046301224057, + "grad_norm": 0.2754419147968292, + "learning_rate": 1.5829380544838203e-07, + "loss": 0.1898, + "step": 5805 + }, + { + "epoch": 1.5449707291112293, + "grad_norm": 0.42504236102104187, + "learning_rate": 1.5828008245724177e-07, + "loss": 0.1848, + "step": 5806 + }, + { + "epoch": 1.5452368281000533, + "grad_norm": 0.3365946412086487, + "learning_rate": 1.5826635780383516e-07, + "loss": 0.1984, + "step": 5807 + }, + { + "epoch": 1.5455029270888772, + "grad_norm": 0.29839497804641724, + "learning_rate": 1.582526314885536e-07, + "loss": 0.1849, + "step": 5808 + }, + { + "epoch": 1.5457690260777008, + "grad_norm": 0.26320287585258484, + "learning_rate": 1.5823890351178865e-07, + "loss": 0.1744, + "step": 5809 + }, + { + "epoch": 1.5460351250665247, + "grad_norm": 0.33099400997161865, + "learning_rate": 1.5822517387393177e-07, + "loss": 0.1821, + "step": 5810 + }, + { + "epoch": 1.5463012240553486, + "grad_norm": 0.25222936272621155, + "learning_rate": 1.5821144257537466e-07, + "loss": 0.175, + "step": 5811 + }, + { + "epoch": 1.5465673230441723, + "grad_norm": 0.3862810730934143, + "learning_rate": 1.5819770961650888e-07, + "loss": 0.2058, + "step": 5812 + }, + { + "epoch": 1.5468334220329962, + "grad_norm": 0.25974345207214355, + "learning_rate": 1.5818397499772616e-07, + "loss": 0.1835, + "step": 5813 + }, + { + "epoch": 1.5470995210218201, + "grad_norm": 0.24872156977653503, + "learning_rate": 1.5817023871941828e-07, + "loss": 0.1766, + "step": 5814 + }, + { + "epoch": 1.5473656200106438, + "grad_norm": 0.2741093635559082, + "learning_rate": 1.5815650078197694e-07, + "loss": 0.2065, + "step": 5815 + }, + { + "epoch": 1.547631718999468, + "grad_norm": 0.35464540123939514, + "learning_rate": 1.5814276118579405e-07, + "loss": 0.2078, + "step": 5816 + }, + { + "epoch": 1.5478978179882916, + "grad_norm": 0.26515305042266846, + "learning_rate": 1.5812901993126144e-07, + "loss": 0.1808, + "step": 5817 + }, + { + "epoch": 1.5481639169771155, + "grad_norm": 0.4336557686328888, + "learning_rate": 1.5811527701877108e-07, + "loss": 0.1856, + "step": 5818 + }, + { + "epoch": 1.5484300159659394, + "grad_norm": 0.35108375549316406, + "learning_rate": 1.581015324487149e-07, + "loss": 0.1862, + "step": 5819 + }, + { + "epoch": 1.5486961149547631, + "grad_norm": 0.28332486748695374, + "learning_rate": 1.5808778622148495e-07, + "loss": 0.1872, + "step": 5820 + }, + { + "epoch": 1.548962213943587, + "grad_norm": 0.33035480976104736, + "learning_rate": 1.580740383374733e-07, + "loss": 0.1958, + "step": 5821 + }, + { + "epoch": 1.549228312932411, + "grad_norm": 0.3233822286128998, + "learning_rate": 1.5806028879707207e-07, + "loss": 0.2122, + "step": 5822 + }, + { + "epoch": 1.5494944119212346, + "grad_norm": 0.3950711786746979, + "learning_rate": 1.5804653760067341e-07, + "loss": 0.2068, + "step": 5823 + }, + { + "epoch": 1.5497605109100585, + "grad_norm": 0.3814365267753601, + "learning_rate": 1.5803278474866953e-07, + "loss": 0.2068, + "step": 5824 + }, + { + "epoch": 1.5500266098988824, + "grad_norm": 0.2431117594242096, + "learning_rate": 1.5801903024145273e-07, + "loss": 0.1689, + "step": 5825 + }, + { + "epoch": 1.550292708887706, + "grad_norm": 0.28052061796188354, + "learning_rate": 1.5800527407941527e-07, + "loss": 0.2031, + "step": 5826 + }, + { + "epoch": 1.5505588078765302, + "grad_norm": 0.3266732692718506, + "learning_rate": 1.5799151626294954e-07, + "loss": 0.2158, + "step": 5827 + }, + { + "epoch": 1.550824906865354, + "grad_norm": 0.2898867130279541, + "learning_rate": 1.579777567924479e-07, + "loss": 0.1987, + "step": 5828 + }, + { + "epoch": 1.5510910058541778, + "grad_norm": 0.27439653873443604, + "learning_rate": 1.579639956683028e-07, + "loss": 0.181, + "step": 5829 + }, + { + "epoch": 1.5513571048430017, + "grad_norm": 0.26492658257484436, + "learning_rate": 1.5795023289090682e-07, + "loss": 0.1846, + "step": 5830 + }, + { + "epoch": 1.5516232038318254, + "grad_norm": 0.3713756799697876, + "learning_rate": 1.5793646846065238e-07, + "loss": 0.2039, + "step": 5831 + }, + { + "epoch": 1.5518893028206493, + "grad_norm": 0.2911236882209778, + "learning_rate": 1.5792270237793216e-07, + "loss": 0.1992, + "step": 5832 + }, + { + "epoch": 1.5521554018094732, + "grad_norm": 0.29352375864982605, + "learning_rate": 1.5790893464313874e-07, + "loss": 0.2001, + "step": 5833 + }, + { + "epoch": 1.5524215007982969, + "grad_norm": 0.32910168170928955, + "learning_rate": 1.5789516525666482e-07, + "loss": 0.2047, + "step": 5834 + }, + { + "epoch": 1.5526875997871208, + "grad_norm": 0.27646273374557495, + "learning_rate": 1.5788139421890315e-07, + "loss": 0.1822, + "step": 5835 + }, + { + "epoch": 1.5529536987759447, + "grad_norm": 0.3086317181587219, + "learning_rate": 1.5786762153024654e-07, + "loss": 0.1955, + "step": 5836 + }, + { + "epoch": 1.5532197977647684, + "grad_norm": 0.2641565799713135, + "learning_rate": 1.578538471910877e-07, + "loss": 0.1854, + "step": 5837 + }, + { + "epoch": 1.5534858967535925, + "grad_norm": 0.3074115812778473, + "learning_rate": 1.578400712018196e-07, + "loss": 0.2049, + "step": 5838 + }, + { + "epoch": 1.5537519957424162, + "grad_norm": 0.27938130497932434, + "learning_rate": 1.5782629356283516e-07, + "loss": 0.2035, + "step": 5839 + }, + { + "epoch": 1.5540180947312399, + "grad_norm": 0.27323609590530396, + "learning_rate": 1.578125142745273e-07, + "loss": 0.1626, + "step": 5840 + }, + { + "epoch": 1.554284193720064, + "grad_norm": 0.27317118644714355, + "learning_rate": 1.5779873333728903e-07, + "loss": 0.1964, + "step": 5841 + }, + { + "epoch": 1.5545502927088877, + "grad_norm": 0.3751439154148102, + "learning_rate": 1.5778495075151345e-07, + "loss": 0.1872, + "step": 5842 + }, + { + "epoch": 1.5548163916977116, + "grad_norm": 0.2662947177886963, + "learning_rate": 1.5777116651759368e-07, + "loss": 0.2038, + "step": 5843 + }, + { + "epoch": 1.5550824906865355, + "grad_norm": 0.28245261311531067, + "learning_rate": 1.577573806359228e-07, + "loss": 0.1874, + "step": 5844 + }, + { + "epoch": 1.5553485896753592, + "grad_norm": 0.2940865457057953, + "learning_rate": 1.5774359310689404e-07, + "loss": 0.1969, + "step": 5845 + }, + { + "epoch": 1.555614688664183, + "grad_norm": 0.27063673734664917, + "learning_rate": 1.5772980393090072e-07, + "loss": 0.1773, + "step": 5846 + }, + { + "epoch": 1.555880787653007, + "grad_norm": 0.2880435585975647, + "learning_rate": 1.5771601310833607e-07, + "loss": 0.1799, + "step": 5847 + }, + { + "epoch": 1.5561468866418307, + "grad_norm": 0.34247028827667236, + "learning_rate": 1.577022206395934e-07, + "loss": 0.2125, + "step": 5848 + }, + { + "epoch": 1.5564129856306546, + "grad_norm": 0.29363304376602173, + "learning_rate": 1.5768842652506618e-07, + "loss": 0.1904, + "step": 5849 + }, + { + "epoch": 1.5566790846194785, + "grad_norm": 0.3509308695793152, + "learning_rate": 1.576746307651478e-07, + "loss": 0.2134, + "step": 5850 + }, + { + "epoch": 1.5569451836083021, + "grad_norm": 0.35485944151878357, + "learning_rate": 1.5766083336023176e-07, + "loss": 0.1895, + "step": 5851 + }, + { + "epoch": 1.5572112825971263, + "grad_norm": 0.45365849137306213, + "learning_rate": 1.5764703431071156e-07, + "loss": 0.1994, + "step": 5852 + }, + { + "epoch": 1.55747738158595, + "grad_norm": 0.37504300475120544, + "learning_rate": 1.5763323361698077e-07, + "loss": 0.2054, + "step": 5853 + }, + { + "epoch": 1.5577434805747739, + "grad_norm": 0.2924870550632477, + "learning_rate": 1.5761943127943308e-07, + "loss": 0.1972, + "step": 5854 + }, + { + "epoch": 1.5580095795635978, + "grad_norm": 1.6340265274047852, + "learning_rate": 1.5760562729846213e-07, + "loss": 0.19, + "step": 5855 + }, + { + "epoch": 1.5582756785524214, + "grad_norm": 0.36500439047813416, + "learning_rate": 1.5759182167446162e-07, + "loss": 0.1937, + "step": 5856 + }, + { + "epoch": 1.5585417775412453, + "grad_norm": 0.2515943944454193, + "learning_rate": 1.5757801440782533e-07, + "loss": 0.17, + "step": 5857 + }, + { + "epoch": 1.5588078765300692, + "grad_norm": 0.2961878776550293, + "learning_rate": 1.5756420549894703e-07, + "loss": 0.2019, + "step": 5858 + }, + { + "epoch": 1.559073975518893, + "grad_norm": 0.2911471724510193, + "learning_rate": 1.5755039494822065e-07, + "loss": 0.1926, + "step": 5859 + }, + { + "epoch": 1.5593400745077168, + "grad_norm": 0.28604331612586975, + "learning_rate": 1.5753658275604002e-07, + "loss": 0.19, + "step": 5860 + }, + { + "epoch": 1.5596061734965407, + "grad_norm": 0.3584529757499695, + "learning_rate": 1.5752276892279918e-07, + "loss": 0.1886, + "step": 5861 + }, + { + "epoch": 1.5598722724853644, + "grad_norm": 0.32110586762428284, + "learning_rate": 1.5750895344889204e-07, + "loss": 0.1837, + "step": 5862 + }, + { + "epoch": 1.5601383714741885, + "grad_norm": 0.32585984468460083, + "learning_rate": 1.5749513633471268e-07, + "loss": 0.1754, + "step": 5863 + }, + { + "epoch": 1.5604044704630122, + "grad_norm": 0.31944334506988525, + "learning_rate": 1.574813175806552e-07, + "loss": 0.2264, + "step": 5864 + }, + { + "epoch": 1.5606705694518361, + "grad_norm": 0.271262526512146, + "learning_rate": 1.5746749718711375e-07, + "loss": 0.1882, + "step": 5865 + }, + { + "epoch": 1.56093666844066, + "grad_norm": 0.387521356344223, + "learning_rate": 1.5745367515448252e-07, + "loss": 0.178, + "step": 5866 + }, + { + "epoch": 1.5612027674294837, + "grad_norm": 0.7003674507141113, + "learning_rate": 1.574398514831557e-07, + "loss": 0.2038, + "step": 5867 + }, + { + "epoch": 1.5614688664183076, + "grad_norm": 0.5741886496543884, + "learning_rate": 1.5742602617352757e-07, + "loss": 0.1956, + "step": 5868 + }, + { + "epoch": 1.5617349654071315, + "grad_norm": 0.3034327030181885, + "learning_rate": 1.574121992259925e-07, + "loss": 0.1935, + "step": 5869 + }, + { + "epoch": 1.5620010643959552, + "grad_norm": 0.31791627407073975, + "learning_rate": 1.5739837064094486e-07, + "loss": 0.1794, + "step": 5870 + }, + { + "epoch": 1.5622671633847791, + "grad_norm": 0.3617646396160126, + "learning_rate": 1.5738454041877902e-07, + "loss": 0.202, + "step": 5871 + }, + { + "epoch": 1.562533262373603, + "grad_norm": 0.3531014025211334, + "learning_rate": 1.573707085598895e-07, + "loss": 0.1923, + "step": 5872 + }, + { + "epoch": 1.5627993613624267, + "grad_norm": 0.39875224232673645, + "learning_rate": 1.5735687506467077e-07, + "loss": 0.1996, + "step": 5873 + }, + { + "epoch": 1.5630654603512508, + "grad_norm": 0.25632444024086, + "learning_rate": 1.5734303993351744e-07, + "loss": 0.1826, + "step": 5874 + }, + { + "epoch": 1.5633315593400745, + "grad_norm": 0.270659476518631, + "learning_rate": 1.5732920316682403e-07, + "loss": 0.1869, + "step": 5875 + }, + { + "epoch": 1.5635976583288982, + "grad_norm": 0.37395069003105164, + "learning_rate": 1.5731536476498529e-07, + "loss": 0.1933, + "step": 5876 + }, + { + "epoch": 1.5638637573177223, + "grad_norm": 0.3851953446865082, + "learning_rate": 1.5730152472839585e-07, + "loss": 0.1858, + "step": 5877 + }, + { + "epoch": 1.564129856306546, + "grad_norm": 0.28053879737854004, + "learning_rate": 1.572876830574505e-07, + "loss": 0.1933, + "step": 5878 + }, + { + "epoch": 1.56439595529537, + "grad_norm": 0.2969510555267334, + "learning_rate": 1.5727383975254402e-07, + "loss": 0.193, + "step": 5879 + }, + { + "epoch": 1.5646620542841938, + "grad_norm": 0.35207489132881165, + "learning_rate": 1.5725999481407124e-07, + "loss": 0.2264, + "step": 5880 + }, + { + "epoch": 1.5649281532730175, + "grad_norm": 0.2915736734867096, + "learning_rate": 1.5724614824242706e-07, + "loss": 0.199, + "step": 5881 + }, + { + "epoch": 1.5651942522618414, + "grad_norm": 0.519149661064148, + "learning_rate": 1.5723230003800636e-07, + "loss": 0.2153, + "step": 5882 + }, + { + "epoch": 1.5654603512506653, + "grad_norm": 0.28251370787620544, + "learning_rate": 1.572184502012042e-07, + "loss": 0.1802, + "step": 5883 + }, + { + "epoch": 1.565726450239489, + "grad_norm": 0.27821624279022217, + "learning_rate": 1.5720459873241553e-07, + "loss": 0.2, + "step": 5884 + }, + { + "epoch": 1.5659925492283129, + "grad_norm": 0.2460559606552124, + "learning_rate": 1.5719074563203552e-07, + "loss": 0.1753, + "step": 5885 + }, + { + "epoch": 1.5662586482171368, + "grad_norm": 0.27052345871925354, + "learning_rate": 1.5717689090045918e-07, + "loss": 0.1743, + "step": 5886 + }, + { + "epoch": 1.5665247472059605, + "grad_norm": 0.4188387095928192, + "learning_rate": 1.5716303453808176e-07, + "loss": 0.2065, + "step": 5887 + }, + { + "epoch": 1.5667908461947846, + "grad_norm": 0.6607703566551208, + "learning_rate": 1.571491765452984e-07, + "loss": 0.1777, + "step": 5888 + }, + { + "epoch": 1.5670569451836083, + "grad_norm": 0.3264921009540558, + "learning_rate": 1.571353169225044e-07, + "loss": 0.2004, + "step": 5889 + }, + { + "epoch": 1.5673230441724322, + "grad_norm": 0.3292381763458252, + "learning_rate": 1.5712145567009505e-07, + "loss": 0.1806, + "step": 5890 + }, + { + "epoch": 1.567589143161256, + "grad_norm": 0.25210994482040405, + "learning_rate": 1.571075927884657e-07, + "loss": 0.1713, + "step": 5891 + }, + { + "epoch": 1.5678552421500798, + "grad_norm": 0.2922484874725342, + "learning_rate": 1.570937282780118e-07, + "loss": 0.1776, + "step": 5892 + }, + { + "epoch": 1.5681213411389037, + "grad_norm": 0.2698950171470642, + "learning_rate": 1.570798621391287e-07, + "loss": 0.2, + "step": 5893 + }, + { + "epoch": 1.5683874401277276, + "grad_norm": 0.3583938479423523, + "learning_rate": 1.5706599437221197e-07, + "loss": 0.1956, + "step": 5894 + }, + { + "epoch": 1.5686535391165513, + "grad_norm": 0.26286831498146057, + "learning_rate": 1.5705212497765712e-07, + "loss": 0.2076, + "step": 5895 + }, + { + "epoch": 1.5689196381053752, + "grad_norm": 0.2558910548686981, + "learning_rate": 1.570382539558597e-07, + "loss": 0.1846, + "step": 5896 + }, + { + "epoch": 1.569185737094199, + "grad_norm": 0.3475746810436249, + "learning_rate": 1.570243813072154e-07, + "loss": 0.2062, + "step": 5897 + }, + { + "epoch": 1.5694518360830227, + "grad_norm": 0.2950824201107025, + "learning_rate": 1.5701050703211985e-07, + "loss": 0.1977, + "step": 5898 + }, + { + "epoch": 1.5697179350718469, + "grad_norm": 0.3885333836078644, + "learning_rate": 1.5699663113096881e-07, + "loss": 0.1832, + "step": 5899 + }, + { + "epoch": 1.5699840340606706, + "grad_norm": 0.4116020202636719, + "learning_rate": 1.56982753604158e-07, + "loss": 0.2139, + "step": 5900 + }, + { + "epoch": 1.5702501330494945, + "grad_norm": 0.3677133321762085, + "learning_rate": 1.5696887445208327e-07, + "loss": 0.1968, + "step": 5901 + }, + { + "epoch": 1.5705162320383184, + "grad_norm": 0.28123652935028076, + "learning_rate": 1.5695499367514047e-07, + "loss": 0.1786, + "step": 5902 + }, + { + "epoch": 1.570782331027142, + "grad_norm": 0.2896256744861603, + "learning_rate": 1.569411112737255e-07, + "loss": 0.1811, + "step": 5903 + }, + { + "epoch": 1.571048430015966, + "grad_norm": 0.26802217960357666, + "learning_rate": 1.5692722724823435e-07, + "loss": 0.1806, + "step": 5904 + }, + { + "epoch": 1.5713145290047899, + "grad_norm": 0.3522365391254425, + "learning_rate": 1.56913341599063e-07, + "loss": 0.1843, + "step": 5905 + }, + { + "epoch": 1.5715806279936135, + "grad_norm": 0.33383452892303467, + "learning_rate": 1.568994543266075e-07, + "loss": 0.1954, + "step": 5906 + }, + { + "epoch": 1.5718467269824374, + "grad_norm": 0.33842945098876953, + "learning_rate": 1.5688556543126392e-07, + "loss": 0.2132, + "step": 5907 + }, + { + "epoch": 1.5721128259712613, + "grad_norm": 0.36341169476509094, + "learning_rate": 1.568716749134284e-07, + "loss": 0.2121, + "step": 5908 + }, + { + "epoch": 1.572378924960085, + "grad_norm": 0.3279040455818176, + "learning_rate": 1.568577827734972e-07, + "loss": 0.1858, + "step": 5909 + }, + { + "epoch": 1.5726450239489091, + "grad_norm": 0.27244266867637634, + "learning_rate": 1.5684388901186645e-07, + "loss": 0.2066, + "step": 5910 + }, + { + "epoch": 1.5729111229377328, + "grad_norm": 0.29784002900123596, + "learning_rate": 1.5682999362893248e-07, + "loss": 0.2025, + "step": 5911 + }, + { + "epoch": 1.5731772219265565, + "grad_norm": 0.27372291684150696, + "learning_rate": 1.568160966250916e-07, + "loss": 0.1879, + "step": 5912 + }, + { + "epoch": 1.5734433209153806, + "grad_norm": 0.2704727351665497, + "learning_rate": 1.568021980007402e-07, + "loss": 0.1909, + "step": 5913 + }, + { + "epoch": 1.5737094199042043, + "grad_norm": 0.3547273278236389, + "learning_rate": 1.567882977562747e-07, + "loss": 0.1812, + "step": 5914 + }, + { + "epoch": 1.5739755188930282, + "grad_norm": 0.2628956437110901, + "learning_rate": 1.5677439589209153e-07, + "loss": 0.1907, + "step": 5915 + }, + { + "epoch": 1.5742416178818521, + "grad_norm": 0.25710800290107727, + "learning_rate": 1.5676049240858722e-07, + "loss": 0.1799, + "step": 5916 + }, + { + "epoch": 1.5745077168706758, + "grad_norm": 0.409402459859848, + "learning_rate": 1.5674658730615832e-07, + "loss": 0.1985, + "step": 5917 + }, + { + "epoch": 1.5747738158594997, + "grad_norm": 0.292655885219574, + "learning_rate": 1.5673268058520144e-07, + "loss": 0.2139, + "step": 5918 + }, + { + "epoch": 1.5750399148483236, + "grad_norm": 0.3482542037963867, + "learning_rate": 1.5671877224611324e-07, + "loss": 0.1792, + "step": 5919 + }, + { + "epoch": 1.5753060138371473, + "grad_norm": 0.2843920886516571, + "learning_rate": 1.5670486228929037e-07, + "loss": 0.1847, + "step": 5920 + }, + { + "epoch": 1.5755721128259714, + "grad_norm": 0.271060049533844, + "learning_rate": 1.5669095071512958e-07, + "loss": 0.1968, + "step": 5921 + }, + { + "epoch": 1.575838211814795, + "grad_norm": 0.26494401693344116, + "learning_rate": 1.566770375240277e-07, + "loss": 0.1878, + "step": 5922 + }, + { + "epoch": 1.5761043108036188, + "grad_norm": 0.36239778995513916, + "learning_rate": 1.5666312271638155e-07, + "loss": 0.2112, + "step": 5923 + }, + { + "epoch": 1.576370409792443, + "grad_norm": 0.2574513256549835, + "learning_rate": 1.5664920629258797e-07, + "loss": 0.1856, + "step": 5924 + }, + { + "epoch": 1.5766365087812666, + "grad_norm": 0.2523724436759949, + "learning_rate": 1.5663528825304396e-07, + "loss": 0.1761, + "step": 5925 + }, + { + "epoch": 1.5769026077700905, + "grad_norm": 0.2807365953922272, + "learning_rate": 1.566213685981464e-07, + "loss": 0.2021, + "step": 5926 + }, + { + "epoch": 1.5771687067589144, + "grad_norm": 0.4169559180736542, + "learning_rate": 1.5660744732829234e-07, + "loss": 0.1816, + "step": 5927 + }, + { + "epoch": 1.577434805747738, + "grad_norm": 0.26069697737693787, + "learning_rate": 1.5659352444387888e-07, + "loss": 0.1825, + "step": 5928 + }, + { + "epoch": 1.577700904736562, + "grad_norm": 0.3215121924877167, + "learning_rate": 1.5657959994530305e-07, + "loss": 0.1896, + "step": 5929 + }, + { + "epoch": 1.577967003725386, + "grad_norm": 0.2588464915752411, + "learning_rate": 1.5656567383296212e-07, + "loss": 0.1878, + "step": 5930 + }, + { + "epoch": 1.5782331027142096, + "grad_norm": 0.45609548687934875, + "learning_rate": 1.5655174610725318e-07, + "loss": 0.1895, + "step": 5931 + }, + { + "epoch": 1.5784992017030335, + "grad_norm": 0.26973602175712585, + "learning_rate": 1.5653781676857355e-07, + "loss": 0.191, + "step": 5932 + }, + { + "epoch": 1.5787653006918574, + "grad_norm": 0.2900135815143585, + "learning_rate": 1.5652388581732047e-07, + "loss": 0.2054, + "step": 5933 + }, + { + "epoch": 1.579031399680681, + "grad_norm": 0.3037365972995758, + "learning_rate": 1.565099532538913e-07, + "loss": 0.189, + "step": 5934 + }, + { + "epoch": 1.5792974986695052, + "grad_norm": 0.30707937479019165, + "learning_rate": 1.5649601907868345e-07, + "loss": 0.1855, + "step": 5935 + }, + { + "epoch": 1.5795635976583289, + "grad_norm": 0.29644569754600525, + "learning_rate": 1.564820832920943e-07, + "loss": 0.2005, + "step": 5936 + }, + { + "epoch": 1.5798296966471528, + "grad_norm": 0.2918766140937805, + "learning_rate": 1.564681458945214e-07, + "loss": 0.2074, + "step": 5937 + }, + { + "epoch": 1.5800957956359767, + "grad_norm": 0.30776044726371765, + "learning_rate": 1.564542068863622e-07, + "loss": 0.1795, + "step": 5938 + }, + { + "epoch": 1.5803618946248004, + "grad_norm": 0.34780699014663696, + "learning_rate": 1.5644026626801433e-07, + "loss": 0.1921, + "step": 5939 + }, + { + "epoch": 1.5806279936136243, + "grad_norm": 0.3132425546646118, + "learning_rate": 1.5642632403987533e-07, + "loss": 0.196, + "step": 5940 + }, + { + "epoch": 1.5808940926024482, + "grad_norm": 0.24918754398822784, + "learning_rate": 1.5641238020234292e-07, + "loss": 0.1655, + "step": 5941 + }, + { + "epoch": 1.5811601915912719, + "grad_norm": 0.3243626654148102, + "learning_rate": 1.563984347558148e-07, + "loss": 0.1947, + "step": 5942 + }, + { + "epoch": 1.5814262905800958, + "grad_norm": 0.34633493423461914, + "learning_rate": 1.5638448770068876e-07, + "loss": 0.2072, + "step": 5943 + }, + { + "epoch": 1.5816923895689197, + "grad_norm": 0.35415613651275635, + "learning_rate": 1.5637053903736248e-07, + "loss": 0.1873, + "step": 5944 + }, + { + "epoch": 1.5819584885577433, + "grad_norm": 0.3559149503707886, + "learning_rate": 1.563565887662339e-07, + "loss": 0.1983, + "step": 5945 + }, + { + "epoch": 1.5822245875465675, + "grad_norm": 0.3347538709640503, + "learning_rate": 1.5634263688770092e-07, + "loss": 0.193, + "step": 5946 + }, + { + "epoch": 1.5824906865353912, + "grad_norm": 0.3446230888366699, + "learning_rate": 1.5632868340216138e-07, + "loss": 0.2056, + "step": 5947 + }, + { + "epoch": 1.582756785524215, + "grad_norm": 0.3424939513206482, + "learning_rate": 1.5631472831001337e-07, + "loss": 0.1907, + "step": 5948 + }, + { + "epoch": 1.583022884513039, + "grad_norm": 0.380878746509552, + "learning_rate": 1.5630077161165486e-07, + "loss": 0.197, + "step": 5949 + }, + { + "epoch": 1.5832889835018626, + "grad_norm": 0.3597518801689148, + "learning_rate": 1.5628681330748393e-07, + "loss": 0.1936, + "step": 5950 + }, + { + "epoch": 1.5835550824906865, + "grad_norm": 0.31406643986701965, + "learning_rate": 1.562728533978987e-07, + "loss": 0.2068, + "step": 5951 + }, + { + "epoch": 1.5838211814795105, + "grad_norm": 0.33459240198135376, + "learning_rate": 1.5625889188329734e-07, + "loss": 0.1839, + "step": 5952 + }, + { + "epoch": 1.5840872804683341, + "grad_norm": 0.27734100818634033, + "learning_rate": 1.5624492876407807e-07, + "loss": 0.1736, + "step": 5953 + }, + { + "epoch": 1.584353379457158, + "grad_norm": 0.37713509798049927, + "learning_rate": 1.5623096404063913e-07, + "loss": 0.2041, + "step": 5954 + }, + { + "epoch": 1.584619478445982, + "grad_norm": 0.2518230974674225, + "learning_rate": 1.5621699771337882e-07, + "loss": 0.1716, + "step": 5955 + }, + { + "epoch": 1.5848855774348056, + "grad_norm": 0.28125840425491333, + "learning_rate": 1.562030297826955e-07, + "loss": 0.1828, + "step": 5956 + }, + { + "epoch": 1.5851516764236298, + "grad_norm": 0.2628069221973419, + "learning_rate": 1.5618906024898756e-07, + "loss": 0.179, + "step": 5957 + }, + { + "epoch": 1.5854177754124534, + "grad_norm": 0.2682287096977234, + "learning_rate": 1.5617508911265346e-07, + "loss": 0.1843, + "step": 5958 + }, + { + "epoch": 1.5856838744012771, + "grad_norm": 0.3829602599143982, + "learning_rate": 1.5616111637409164e-07, + "loss": 0.1898, + "step": 5959 + }, + { + "epoch": 1.5859499733901012, + "grad_norm": 0.2696757912635803, + "learning_rate": 1.5614714203370068e-07, + "loss": 0.1875, + "step": 5960 + }, + { + "epoch": 1.586216072378925, + "grad_norm": 0.36157333850860596, + "learning_rate": 1.5613316609187913e-07, + "loss": 0.1969, + "step": 5961 + }, + { + "epoch": 1.5864821713677488, + "grad_norm": 0.27693819999694824, + "learning_rate": 1.561191885490256e-07, + "loss": 0.1975, + "step": 5962 + }, + { + "epoch": 1.5867482703565727, + "grad_norm": 0.31606781482696533, + "learning_rate": 1.561052094055388e-07, + "loss": 0.179, + "step": 5963 + }, + { + "epoch": 1.5870143693453964, + "grad_norm": 0.2729770243167877, + "learning_rate": 1.5609122866181736e-07, + "loss": 0.2042, + "step": 5964 + }, + { + "epoch": 1.5872804683342203, + "grad_norm": 0.5463472008705139, + "learning_rate": 1.5607724631826016e-07, + "loss": 0.1925, + "step": 5965 + }, + { + "epoch": 1.5875465673230442, + "grad_norm": 0.27150052785873413, + "learning_rate": 1.5606326237526591e-07, + "loss": 0.1923, + "step": 5966 + }, + { + "epoch": 1.587812666311868, + "grad_norm": 0.2835661768913269, + "learning_rate": 1.5604927683323348e-07, + "loss": 0.1961, + "step": 5967 + }, + { + "epoch": 1.5880787653006918, + "grad_norm": 0.28722208738327026, + "learning_rate": 1.5603528969256182e-07, + "loss": 0.179, + "step": 5968 + }, + { + "epoch": 1.5883448642895157, + "grad_norm": 0.4767057001590729, + "learning_rate": 1.5602130095364977e-07, + "loss": 0.2188, + "step": 5969 + }, + { + "epoch": 1.5886109632783394, + "grad_norm": 0.2775588631629944, + "learning_rate": 1.560073106168964e-07, + "loss": 0.1827, + "step": 5970 + }, + { + "epoch": 1.5888770622671635, + "grad_norm": 0.26759225130081177, + "learning_rate": 1.5599331868270075e-07, + "loss": 0.1863, + "step": 5971 + }, + { + "epoch": 1.5891431612559872, + "grad_norm": 0.4633178114891052, + "learning_rate": 1.5597932515146184e-07, + "loss": 0.1992, + "step": 5972 + }, + { + "epoch": 1.589409260244811, + "grad_norm": 0.3361787796020508, + "learning_rate": 1.5596533002357882e-07, + "loss": 0.1924, + "step": 5973 + }, + { + "epoch": 1.589675359233635, + "grad_norm": 0.5090377926826477, + "learning_rate": 1.5595133329945085e-07, + "loss": 0.2027, + "step": 5974 + }, + { + "epoch": 1.5899414582224587, + "grad_norm": 0.3147890865802765, + "learning_rate": 1.5593733497947716e-07, + "loss": 0.194, + "step": 5975 + }, + { + "epoch": 1.5902075572112826, + "grad_norm": 0.26486659049987793, + "learning_rate": 1.55923335064057e-07, + "loss": 0.1975, + "step": 5976 + }, + { + "epoch": 1.5904736562001065, + "grad_norm": 0.275516539812088, + "learning_rate": 1.559093335535897e-07, + "loss": 0.1838, + "step": 5977 + }, + { + "epoch": 1.5907397551889302, + "grad_norm": 0.26187020540237427, + "learning_rate": 1.558953304484746e-07, + "loss": 0.1846, + "step": 5978 + }, + { + "epoch": 1.591005854177754, + "grad_norm": 0.33912283182144165, + "learning_rate": 1.5588132574911107e-07, + "loss": 0.1985, + "step": 5979 + }, + { + "epoch": 1.591271953166578, + "grad_norm": 0.29685378074645996, + "learning_rate": 1.558673194558986e-07, + "loss": 0.1984, + "step": 5980 + }, + { + "epoch": 1.5915380521554017, + "grad_norm": 0.23849733173847198, + "learning_rate": 1.5585331156923664e-07, + "loss": 0.1845, + "step": 5981 + }, + { + "epoch": 1.5918041511442258, + "grad_norm": 0.3264961242675781, + "learning_rate": 1.5583930208952475e-07, + "loss": 0.2087, + "step": 5982 + }, + { + "epoch": 1.5920702501330495, + "grad_norm": 0.3640095889568329, + "learning_rate": 1.5582529101716245e-07, + "loss": 0.1855, + "step": 5983 + }, + { + "epoch": 1.5923363491218734, + "grad_norm": 0.292054682970047, + "learning_rate": 1.5581127835254945e-07, + "loss": 0.1776, + "step": 5984 + }, + { + "epoch": 1.5926024481106973, + "grad_norm": 0.4094792902469635, + "learning_rate": 1.5579726409608536e-07, + "loss": 0.1894, + "step": 5985 + }, + { + "epoch": 1.592868547099521, + "grad_norm": 0.47771549224853516, + "learning_rate": 1.5578324824816992e-07, + "loss": 0.2307, + "step": 5986 + }, + { + "epoch": 1.5931346460883449, + "grad_norm": 0.3121718466281891, + "learning_rate": 1.5576923080920285e-07, + "loss": 0.1937, + "step": 5987 + }, + { + "epoch": 1.5934007450771688, + "grad_norm": 0.24864789843559265, + "learning_rate": 1.5575521177958404e-07, + "loss": 0.1807, + "step": 5988 + }, + { + "epoch": 1.5936668440659925, + "grad_norm": 0.7031896710395813, + "learning_rate": 1.5574119115971325e-07, + "loss": 0.195, + "step": 5989 + }, + { + "epoch": 1.5939329430548164, + "grad_norm": 0.2914625406265259, + "learning_rate": 1.5572716894999044e-07, + "loss": 0.19, + "step": 5990 + }, + { + "epoch": 1.5941990420436403, + "grad_norm": 0.3063860833644867, + "learning_rate": 1.557131451508155e-07, + "loss": 0.1864, + "step": 5991 + }, + { + "epoch": 1.594465141032464, + "grad_norm": 0.2634023427963257, + "learning_rate": 1.556991197625885e-07, + "loss": 0.1943, + "step": 5992 + }, + { + "epoch": 1.594731240021288, + "grad_norm": 0.3685142397880554, + "learning_rate": 1.5568509278570935e-07, + "loss": 0.201, + "step": 5993 + }, + { + "epoch": 1.5949973390101118, + "grad_norm": 1.4723609685897827, + "learning_rate": 1.5567106422057824e-07, + "loss": 0.187, + "step": 5994 + }, + { + "epoch": 1.5952634379989354, + "grad_norm": 0.27166473865509033, + "learning_rate": 1.5565703406759526e-07, + "loss": 0.1733, + "step": 5995 + }, + { + "epoch": 1.5955295369877596, + "grad_norm": 0.3603450655937195, + "learning_rate": 1.5564300232716051e-07, + "loss": 0.1871, + "step": 5996 + }, + { + "epoch": 1.5957956359765832, + "grad_norm": 0.3703204393386841, + "learning_rate": 1.5562896899967433e-07, + "loss": 0.1988, + "step": 5997 + }, + { + "epoch": 1.5960617349654072, + "grad_norm": 0.2585746645927429, + "learning_rate": 1.556149340855369e-07, + "loss": 0.1762, + "step": 5998 + }, + { + "epoch": 1.596327833954231, + "grad_norm": 0.2880663275718689, + "learning_rate": 1.556008975851485e-07, + "loss": 0.1912, + "step": 5999 + }, + { + "epoch": 1.5965939329430547, + "grad_norm": 0.3221277892589569, + "learning_rate": 1.555868594989095e-07, + "loss": 0.1997, + "step": 6000 + }, + { + "epoch": 1.5968600319318786, + "grad_norm": 0.4464738070964813, + "learning_rate": 1.5557281982722036e-07, + "loss": 0.1994, + "step": 6001 + }, + { + "epoch": 1.5971261309207025, + "grad_norm": 0.2879481315612793, + "learning_rate": 1.5555877857048144e-07, + "loss": 0.1926, + "step": 6002 + }, + { + "epoch": 1.5973922299095262, + "grad_norm": 0.3146118223667145, + "learning_rate": 1.5554473572909327e-07, + "loss": 0.2006, + "step": 6003 + }, + { + "epoch": 1.5976583288983501, + "grad_norm": 0.5507379770278931, + "learning_rate": 1.5553069130345636e-07, + "loss": 0.2132, + "step": 6004 + }, + { + "epoch": 1.597924427887174, + "grad_norm": 0.2888251543045044, + "learning_rate": 1.555166452939713e-07, + "loss": 0.2002, + "step": 6005 + }, + { + "epoch": 1.5981905268759977, + "grad_norm": 0.4304949939250946, + "learning_rate": 1.5550259770103867e-07, + "loss": 0.1903, + "step": 6006 + }, + { + "epoch": 1.5984566258648218, + "grad_norm": 0.530389130115509, + "learning_rate": 1.554885485250592e-07, + "loss": 0.2011, + "step": 6007 + }, + { + "epoch": 1.5987227248536455, + "grad_norm": 0.2641719877719879, + "learning_rate": 1.5547449776643356e-07, + "loss": 0.1839, + "step": 6008 + }, + { + "epoch": 1.5989888238424694, + "grad_norm": 0.27631592750549316, + "learning_rate": 1.5546044542556247e-07, + "loss": 0.1965, + "step": 6009 + }, + { + "epoch": 1.5992549228312933, + "grad_norm": 0.2746683657169342, + "learning_rate": 1.5544639150284682e-07, + "loss": 0.1708, + "step": 6010 + }, + { + "epoch": 1.599521021820117, + "grad_norm": 0.3601562976837158, + "learning_rate": 1.554323359986874e-07, + "loss": 0.2015, + "step": 6011 + }, + { + "epoch": 1.599787120808941, + "grad_norm": 0.4634384512901306, + "learning_rate": 1.5541827891348512e-07, + "loss": 0.2194, + "step": 6012 + }, + { + "epoch": 1.6000532197977648, + "grad_norm": 0.3985755741596222, + "learning_rate": 1.5540422024764092e-07, + "loss": 0.2049, + "step": 6013 + }, + { + "epoch": 1.6003193187865885, + "grad_norm": 0.30070629715919495, + "learning_rate": 1.5539016000155575e-07, + "loss": 0.2027, + "step": 6014 + }, + { + "epoch": 1.6005854177754124, + "grad_norm": 0.4255672097206116, + "learning_rate": 1.5537609817563068e-07, + "loss": 0.2064, + "step": 6015 + }, + { + "epoch": 1.6008515167642363, + "grad_norm": 0.3735525608062744, + "learning_rate": 1.5536203477026675e-07, + "loss": 0.1847, + "step": 6016 + }, + { + "epoch": 1.60111761575306, + "grad_norm": 0.2989587187767029, + "learning_rate": 1.553479697858651e-07, + "loss": 0.2103, + "step": 6017 + }, + { + "epoch": 1.6013837147418841, + "grad_norm": 0.24356499314308167, + "learning_rate": 1.5533390322282684e-07, + "loss": 0.1567, + "step": 6018 + }, + { + "epoch": 1.6016498137307078, + "grad_norm": 0.46417680382728577, + "learning_rate": 1.5531983508155326e-07, + "loss": 0.1972, + "step": 6019 + }, + { + "epoch": 1.6019159127195317, + "grad_norm": 0.2833282947540283, + "learning_rate": 1.5530576536244553e-07, + "loss": 0.2056, + "step": 6020 + }, + { + "epoch": 1.6021820117083556, + "grad_norm": 0.3242397904396057, + "learning_rate": 1.5529169406590503e-07, + "loss": 0.1891, + "step": 6021 + }, + { + "epoch": 1.6024481106971793, + "grad_norm": 0.2781291604042053, + "learning_rate": 1.5527762119233302e-07, + "loss": 0.1988, + "step": 6022 + }, + { + "epoch": 1.6027142096860032, + "grad_norm": 0.28562822937965393, + "learning_rate": 1.552635467421309e-07, + "loss": 0.1997, + "step": 6023 + }, + { + "epoch": 1.602980308674827, + "grad_norm": 0.33335596323013306, + "learning_rate": 1.552494707157002e-07, + "loss": 0.1927, + "step": 6024 + }, + { + "epoch": 1.6032464076636508, + "grad_norm": 0.3003462851047516, + "learning_rate": 1.552353931134423e-07, + "loss": 0.214, + "step": 6025 + }, + { + "epoch": 1.6035125066524747, + "grad_norm": 0.27974367141723633, + "learning_rate": 1.5522131393575873e-07, + "loss": 0.1877, + "step": 6026 + }, + { + "epoch": 1.6037786056412986, + "grad_norm": 0.2782125473022461, + "learning_rate": 1.5520723318305104e-07, + "loss": 0.1826, + "step": 6027 + }, + { + "epoch": 1.6040447046301223, + "grad_norm": 0.4424399137496948, + "learning_rate": 1.5519315085572088e-07, + "loss": 0.2039, + "step": 6028 + }, + { + "epoch": 1.6043108036189464, + "grad_norm": 0.38946428894996643, + "learning_rate": 1.5517906695416993e-07, + "loss": 0.1993, + "step": 6029 + }, + { + "epoch": 1.60457690260777, + "grad_norm": 0.24794377386569977, + "learning_rate": 1.5516498147879986e-07, + "loss": 0.1596, + "step": 6030 + }, + { + "epoch": 1.604843001596594, + "grad_norm": 0.33192378282546997, + "learning_rate": 1.5515089443001245e-07, + "loss": 0.1814, + "step": 6031 + }, + { + "epoch": 1.605109100585418, + "grad_norm": 0.2536700963973999, + "learning_rate": 1.551368058082094e-07, + "loss": 0.1789, + "step": 6032 + }, + { + "epoch": 1.6053751995742416, + "grad_norm": 0.34934478998184204, + "learning_rate": 1.551227156137926e-07, + "loss": 0.176, + "step": 6033 + }, + { + "epoch": 1.6056412985630655, + "grad_norm": 0.2540617287158966, + "learning_rate": 1.55108623847164e-07, + "loss": 0.1867, + "step": 6034 + }, + { + "epoch": 1.6059073975518894, + "grad_norm": 0.324046790599823, + "learning_rate": 1.550945305087254e-07, + "loss": 0.2059, + "step": 6035 + }, + { + "epoch": 1.606173496540713, + "grad_norm": 0.3279077112674713, + "learning_rate": 1.5508043559887887e-07, + "loss": 0.1813, + "step": 6036 + }, + { + "epoch": 1.606439595529537, + "grad_norm": 0.37033748626708984, + "learning_rate": 1.5506633911802637e-07, + "loss": 0.2132, + "step": 6037 + }, + { + "epoch": 1.6067056945183609, + "grad_norm": 0.3013935983181, + "learning_rate": 1.5505224106657e-07, + "loss": 0.2018, + "step": 6038 + }, + { + "epoch": 1.6069717935071846, + "grad_norm": 0.2975693941116333, + "learning_rate": 1.5503814144491183e-07, + "loss": 0.1881, + "step": 6039 + }, + { + "epoch": 1.6072378924960087, + "grad_norm": 0.2539640963077545, + "learning_rate": 1.55024040253454e-07, + "loss": 0.1723, + "step": 6040 + }, + { + "epoch": 1.6075039914848324, + "grad_norm": 0.33799147605895996, + "learning_rate": 1.5500993749259875e-07, + "loss": 0.1884, + "step": 6041 + }, + { + "epoch": 1.607770090473656, + "grad_norm": 0.2746155560016632, + "learning_rate": 1.549958331627483e-07, + "loss": 0.1844, + "step": 6042 + }, + { + "epoch": 1.6080361894624802, + "grad_norm": 0.26995596289634705, + "learning_rate": 1.5498172726430495e-07, + "loss": 0.1904, + "step": 6043 + }, + { + "epoch": 1.6083022884513039, + "grad_norm": 0.2663706839084625, + "learning_rate": 1.5496761979767097e-07, + "loss": 0.1933, + "step": 6044 + }, + { + "epoch": 1.6085683874401278, + "grad_norm": 0.24862146377563477, + "learning_rate": 1.5495351076324878e-07, + "loss": 0.1788, + "step": 6045 + }, + { + "epoch": 1.6088344864289517, + "grad_norm": 0.27445530891418457, + "learning_rate": 1.549394001614408e-07, + "loss": 0.189, + "step": 6046 + }, + { + "epoch": 1.6091005854177753, + "grad_norm": 0.2649177014827728, + "learning_rate": 1.5492528799264948e-07, + "loss": 0.1967, + "step": 6047 + }, + { + "epoch": 1.6093666844065992, + "grad_norm": 0.25861260294914246, + "learning_rate": 1.5491117425727734e-07, + "loss": 0.1688, + "step": 6048 + }, + { + "epoch": 1.6096327833954232, + "grad_norm": 0.2836475968360901, + "learning_rate": 1.5489705895572693e-07, + "loss": 0.1906, + "step": 6049 + }, + { + "epoch": 1.6098988823842468, + "grad_norm": 0.25113120675086975, + "learning_rate": 1.5488294208840086e-07, + "loss": 0.1802, + "step": 6050 + }, + { + "epoch": 1.6101649813730707, + "grad_norm": 0.3456827402114868, + "learning_rate": 1.548688236557017e-07, + "loss": 0.1838, + "step": 6051 + }, + { + "epoch": 1.6104310803618946, + "grad_norm": 0.2931756377220154, + "learning_rate": 1.5485470365803224e-07, + "loss": 0.1904, + "step": 6052 + }, + { + "epoch": 1.6106971793507183, + "grad_norm": 0.31236588954925537, + "learning_rate": 1.5484058209579513e-07, + "loss": 0.2104, + "step": 6053 + }, + { + "epoch": 1.6109632783395424, + "grad_norm": 0.2914807200431824, + "learning_rate": 1.548264589693932e-07, + "loss": 0.1955, + "step": 6054 + }, + { + "epoch": 1.6112293773283661, + "grad_norm": 0.32763969898223877, + "learning_rate": 1.5481233427922925e-07, + "loss": 0.1902, + "step": 6055 + }, + { + "epoch": 1.61149547631719, + "grad_norm": 0.45043739676475525, + "learning_rate": 1.5479820802570612e-07, + "loss": 0.2084, + "step": 6056 + }, + { + "epoch": 1.611761575306014, + "grad_norm": 0.3838343620300293, + "learning_rate": 1.5478408020922676e-07, + "loss": 0.1902, + "step": 6057 + }, + { + "epoch": 1.6120276742948376, + "grad_norm": 0.2617356479167938, + "learning_rate": 1.547699508301941e-07, + "loss": 0.1931, + "step": 6058 + }, + { + "epoch": 1.6122937732836615, + "grad_norm": 0.2614731192588806, + "learning_rate": 1.5475581988901115e-07, + "loss": 0.1765, + "step": 6059 + }, + { + "epoch": 1.6125598722724854, + "grad_norm": 0.4074210226535797, + "learning_rate": 1.5474168738608093e-07, + "loss": 0.1899, + "step": 6060 + }, + { + "epoch": 1.612825971261309, + "grad_norm": 0.33867430686950684, + "learning_rate": 1.5472755332180656e-07, + "loss": 0.2162, + "step": 6061 + }, + { + "epoch": 1.613092070250133, + "grad_norm": 0.26250067353248596, + "learning_rate": 1.5471341769659116e-07, + "loss": 0.1714, + "step": 6062 + }, + { + "epoch": 1.613358169238957, + "grad_norm": 0.27383726835250854, + "learning_rate": 1.546992805108379e-07, + "loss": 0.175, + "step": 6063 + }, + { + "epoch": 1.6136242682277806, + "grad_norm": 0.29483890533447266, + "learning_rate": 1.5468514176494997e-07, + "loss": 0.198, + "step": 6064 + }, + { + "epoch": 1.6138903672166047, + "grad_norm": 0.3326926827430725, + "learning_rate": 1.5467100145933072e-07, + "loss": 0.1858, + "step": 6065 + }, + { + "epoch": 1.6141564662054284, + "grad_norm": 0.36813822388648987, + "learning_rate": 1.546568595943834e-07, + "loss": 0.2006, + "step": 6066 + }, + { + "epoch": 1.6144225651942523, + "grad_norm": 0.3306993246078491, + "learning_rate": 1.5464271617051138e-07, + "loss": 0.1943, + "step": 6067 + }, + { + "epoch": 1.6146886641830762, + "grad_norm": 0.3340526819229126, + "learning_rate": 1.5462857118811803e-07, + "loss": 0.1855, + "step": 6068 + }, + { + "epoch": 1.6149547631719, + "grad_norm": 0.4200807809829712, + "learning_rate": 1.5461442464760684e-07, + "loss": 0.2164, + "step": 6069 + }, + { + "epoch": 1.6152208621607238, + "grad_norm": 0.31242266297340393, + "learning_rate": 1.5460027654938127e-07, + "loss": 0.187, + "step": 6070 + }, + { + "epoch": 1.6154869611495477, + "grad_norm": 0.2986430525779724, + "learning_rate": 1.5458612689384487e-07, + "loss": 0.1852, + "step": 6071 + }, + { + "epoch": 1.6157530601383714, + "grad_norm": 0.32214033603668213, + "learning_rate": 1.545719756814012e-07, + "loss": 0.1959, + "step": 6072 + }, + { + "epoch": 1.6160191591271953, + "grad_norm": 0.26945051550865173, + "learning_rate": 1.545578229124539e-07, + "loss": 0.1815, + "step": 6073 + }, + { + "epoch": 1.6162852581160192, + "grad_norm": 0.3206941485404968, + "learning_rate": 1.545436685874066e-07, + "loss": 0.1832, + "step": 6074 + }, + { + "epoch": 1.6165513571048429, + "grad_norm": 0.3286367654800415, + "learning_rate": 1.5452951270666307e-07, + "loss": 0.1907, + "step": 6075 + }, + { + "epoch": 1.616817456093667, + "grad_norm": 0.24780608713626862, + "learning_rate": 1.54515355270627e-07, + "loss": 0.1786, + "step": 6076 + }, + { + "epoch": 1.6170835550824907, + "grad_norm": 0.4259870648384094, + "learning_rate": 1.5450119627970223e-07, + "loss": 0.2075, + "step": 6077 + }, + { + "epoch": 1.6173496540713144, + "grad_norm": 0.3454227149486542, + "learning_rate": 1.544870357342926e-07, + "loss": 0.2171, + "step": 6078 + }, + { + "epoch": 1.6176157530601385, + "grad_norm": 0.36112236976623535, + "learning_rate": 1.5447287363480196e-07, + "loss": 0.1959, + "step": 6079 + }, + { + "epoch": 1.6178818520489622, + "grad_norm": 0.28647175431251526, + "learning_rate": 1.5445870998163426e-07, + "loss": 0.1929, + "step": 6080 + }, + { + "epoch": 1.618147951037786, + "grad_norm": 0.2700497508049011, + "learning_rate": 1.544445447751935e-07, + "loss": 0.1867, + "step": 6081 + }, + { + "epoch": 1.61841405002661, + "grad_norm": 0.257484495639801, + "learning_rate": 1.544303780158837e-07, + "loss": 0.1921, + "step": 6082 + }, + { + "epoch": 1.6186801490154337, + "grad_norm": 0.3113182783126831, + "learning_rate": 1.5441620970410888e-07, + "loss": 0.2046, + "step": 6083 + }, + { + "epoch": 1.6189462480042576, + "grad_norm": 0.28099972009658813, + "learning_rate": 1.5440203984027322e-07, + "loss": 0.1982, + "step": 6084 + }, + { + "epoch": 1.6192123469930815, + "grad_norm": 0.2826294004917145, + "learning_rate": 1.543878684247808e-07, + "loss": 0.1932, + "step": 6085 + }, + { + "epoch": 1.6194784459819052, + "grad_norm": 0.2697887718677521, + "learning_rate": 1.5437369545803588e-07, + "loss": 0.1879, + "step": 6086 + }, + { + "epoch": 1.619744544970729, + "grad_norm": 0.3517386019229889, + "learning_rate": 1.5435952094044267e-07, + "loss": 0.2065, + "step": 6087 + }, + { + "epoch": 1.620010643959553, + "grad_norm": 0.2968355417251587, + "learning_rate": 1.5434534487240545e-07, + "loss": 0.2116, + "step": 6088 + }, + { + "epoch": 1.6202767429483766, + "grad_norm": 0.2566031217575073, + "learning_rate": 1.5433116725432856e-07, + "loss": 0.1768, + "step": 6089 + }, + { + "epoch": 1.6205428419372008, + "grad_norm": 0.29163652658462524, + "learning_rate": 1.5431698808661637e-07, + "loss": 0.1778, + "step": 6090 + }, + { + "epoch": 1.6208089409260245, + "grad_norm": 0.26837509870529175, + "learning_rate": 1.543028073696733e-07, + "loss": 0.1877, + "step": 6091 + }, + { + "epoch": 1.6210750399148484, + "grad_norm": 0.2980532944202423, + "learning_rate": 1.5428862510390381e-07, + "loss": 0.1959, + "step": 6092 + }, + { + "epoch": 1.6213411389036723, + "grad_norm": 0.28201839327812195, + "learning_rate": 1.5427444128971243e-07, + "loss": 0.182, + "step": 6093 + }, + { + "epoch": 1.621607237892496, + "grad_norm": 0.26037704944610596, + "learning_rate": 1.542602559275037e-07, + "loss": 0.177, + "step": 6094 + }, + { + "epoch": 1.6218733368813198, + "grad_norm": 0.364778071641922, + "learning_rate": 1.542460690176822e-07, + "loss": 0.2042, + "step": 6095 + }, + { + "epoch": 1.6221394358701438, + "grad_norm": 0.30561745166778564, + "learning_rate": 1.5423188056065258e-07, + "loss": 0.2076, + "step": 6096 + }, + { + "epoch": 1.6224055348589674, + "grad_norm": 0.35316717624664307, + "learning_rate": 1.542176905568195e-07, + "loss": 0.2185, + "step": 6097 + }, + { + "epoch": 1.6226716338477913, + "grad_norm": 0.34939390420913696, + "learning_rate": 1.5420349900658772e-07, + "loss": 0.2002, + "step": 6098 + }, + { + "epoch": 1.6229377328366152, + "grad_norm": 0.44271332025527954, + "learning_rate": 1.54189305910362e-07, + "loss": 0.2103, + "step": 6099 + }, + { + "epoch": 1.623203831825439, + "grad_norm": 0.4551871418952942, + "learning_rate": 1.5417511126854719e-07, + "loss": 0.2166, + "step": 6100 + }, + { + "epoch": 1.623469930814263, + "grad_norm": 0.38580188155174255, + "learning_rate": 1.5416091508154808e-07, + "loss": 0.1884, + "step": 6101 + }, + { + "epoch": 1.6237360298030867, + "grad_norm": 0.37711259722709656, + "learning_rate": 1.5414671734976962e-07, + "loss": 0.1938, + "step": 6102 + }, + { + "epoch": 1.6240021287919106, + "grad_norm": 0.27269813418388367, + "learning_rate": 1.5413251807361676e-07, + "loss": 0.186, + "step": 6103 + }, + { + "epoch": 1.6242682277807345, + "grad_norm": 0.2531214654445648, + "learning_rate": 1.5411831725349448e-07, + "loss": 0.1793, + "step": 6104 + }, + { + "epoch": 1.6245343267695582, + "grad_norm": 0.3481731414794922, + "learning_rate": 1.5410411488980782e-07, + "loss": 0.1778, + "step": 6105 + }, + { + "epoch": 1.6248004257583821, + "grad_norm": 0.2670612335205078, + "learning_rate": 1.5408991098296185e-07, + "loss": 0.1914, + "step": 6106 + }, + { + "epoch": 1.625066524747206, + "grad_norm": 0.28121063113212585, + "learning_rate": 1.540757055333617e-07, + "loss": 0.1922, + "step": 6107 + }, + { + "epoch": 1.6253326237360297, + "grad_norm": 0.2642480134963989, + "learning_rate": 1.5406149854141255e-07, + "loss": 0.1872, + "step": 6108 + }, + { + "epoch": 1.6255987227248536, + "grad_norm": 0.26807844638824463, + "learning_rate": 1.5404729000751958e-07, + "loss": 0.1909, + "step": 6109 + }, + { + "epoch": 1.6258648217136775, + "grad_norm": 0.3146337866783142, + "learning_rate": 1.540330799320881e-07, + "loss": 0.1877, + "step": 6110 + }, + { + "epoch": 1.6261309207025012, + "grad_norm": 0.2494082897901535, + "learning_rate": 1.5401886831552337e-07, + "loss": 0.1769, + "step": 6111 + }, + { + "epoch": 1.6263970196913253, + "grad_norm": 0.27168190479278564, + "learning_rate": 1.5400465515823076e-07, + "loss": 0.1796, + "step": 6112 + }, + { + "epoch": 1.626663118680149, + "grad_norm": 0.24155108630657196, + "learning_rate": 1.539904404606156e-07, + "loss": 0.1684, + "step": 6113 + }, + { + "epoch": 1.6269292176689727, + "grad_norm": 0.2649226784706116, + "learning_rate": 1.539762242230834e-07, + "loss": 0.1958, + "step": 6114 + }, + { + "epoch": 1.6271953166577968, + "grad_norm": 0.26001089811325073, + "learning_rate": 1.539620064460396e-07, + "loss": 0.1929, + "step": 6115 + }, + { + "epoch": 1.6274614156466205, + "grad_norm": 0.37110158801078796, + "learning_rate": 1.5394778712988971e-07, + "loss": 0.2, + "step": 6116 + }, + { + "epoch": 1.6277275146354444, + "grad_norm": 0.2824512720108032, + "learning_rate": 1.539335662750393e-07, + "loss": 0.1809, + "step": 6117 + }, + { + "epoch": 1.6279936136242683, + "grad_norm": 0.257127970457077, + "learning_rate": 1.5391934388189398e-07, + "loss": 0.1877, + "step": 6118 + }, + { + "epoch": 1.628259712613092, + "grad_norm": 0.4496106803417206, + "learning_rate": 1.539051199508594e-07, + "loss": 0.1926, + "step": 6119 + }, + { + "epoch": 1.628525811601916, + "grad_norm": 0.2958521544933319, + "learning_rate": 1.5389089448234127e-07, + "loss": 0.2012, + "step": 6120 + }, + { + "epoch": 1.6287919105907398, + "grad_norm": 0.25222116708755493, + "learning_rate": 1.5387666747674533e-07, + "loss": 0.1753, + "step": 6121 + }, + { + "epoch": 1.6290580095795635, + "grad_norm": 0.6995816230773926, + "learning_rate": 1.5386243893447733e-07, + "loss": 0.1723, + "step": 6122 + }, + { + "epoch": 1.6293241085683874, + "grad_norm": 0.28212812542915344, + "learning_rate": 1.538482088559431e-07, + "loss": 0.194, + "step": 6123 + }, + { + "epoch": 1.6295902075572113, + "grad_norm": 0.2911141514778137, + "learning_rate": 1.538339772415486e-07, + "loss": 0.2059, + "step": 6124 + }, + { + "epoch": 1.629856306546035, + "grad_norm": 0.38409313559532166, + "learning_rate": 1.538197440916996e-07, + "loss": 0.1881, + "step": 6125 + }, + { + "epoch": 1.630122405534859, + "grad_norm": 0.3033789396286011, + "learning_rate": 1.5380550940680214e-07, + "loss": 0.208, + "step": 6126 + }, + { + "epoch": 1.6303885045236828, + "grad_norm": 0.41033971309661865, + "learning_rate": 1.5379127318726224e-07, + "loss": 0.1913, + "step": 6127 + }, + { + "epoch": 1.6306546035125067, + "grad_norm": 0.2978284955024719, + "learning_rate": 1.5377703543348593e-07, + "loss": 0.1976, + "step": 6128 + }, + { + "epoch": 1.6309207025013306, + "grad_norm": 0.3878209888935089, + "learning_rate": 1.5376279614587927e-07, + "loss": 0.2135, + "step": 6129 + }, + { + "epoch": 1.6311868014901543, + "grad_norm": 0.29614418745040894, + "learning_rate": 1.537485553248484e-07, + "loss": 0.1927, + "step": 6130 + }, + { + "epoch": 1.6314529004789782, + "grad_norm": 0.2665020525455475, + "learning_rate": 1.5373431297079953e-07, + "loss": 0.1937, + "step": 6131 + }, + { + "epoch": 1.631718999467802, + "grad_norm": 0.4616990387439728, + "learning_rate": 1.5372006908413884e-07, + "loss": 0.1751, + "step": 6132 + }, + { + "epoch": 1.6319850984566258, + "grad_norm": 0.27845773100852966, + "learning_rate": 1.5370582366527263e-07, + "loss": 0.178, + "step": 6133 + }, + { + "epoch": 1.6322511974454497, + "grad_norm": 0.26891854405403137, + "learning_rate": 1.536915767146072e-07, + "loss": 0.1822, + "step": 6134 + }, + { + "epoch": 1.6325172964342736, + "grad_norm": 0.2794334888458252, + "learning_rate": 1.5367732823254888e-07, + "loss": 0.1992, + "step": 6135 + }, + { + "epoch": 1.6327833954230973, + "grad_norm": 0.32125696539878845, + "learning_rate": 1.5366307821950407e-07, + "loss": 0.1828, + "step": 6136 + }, + { + "epoch": 1.6330494944119214, + "grad_norm": 0.3524264991283417, + "learning_rate": 1.5364882667587925e-07, + "loss": 0.197, + "step": 6137 + }, + { + "epoch": 1.633315593400745, + "grad_norm": 0.3204840123653412, + "learning_rate": 1.5363457360208086e-07, + "loss": 0.1917, + "step": 6138 + }, + { + "epoch": 1.633581692389569, + "grad_norm": 0.29035598039627075, + "learning_rate": 1.5362031899851546e-07, + "loss": 0.1908, + "step": 6139 + }, + { + "epoch": 1.6338477913783929, + "grad_norm": 0.2907634973526001, + "learning_rate": 1.5360606286558958e-07, + "loss": 0.1971, + "step": 6140 + }, + { + "epoch": 1.6341138903672165, + "grad_norm": 0.26929226517677307, + "learning_rate": 1.5359180520370983e-07, + "loss": 0.1779, + "step": 6141 + }, + { + "epoch": 1.6343799893560405, + "grad_norm": 0.5080406665802002, + "learning_rate": 1.5357754601328295e-07, + "loss": 0.1822, + "step": 6142 + }, + { + "epoch": 1.6346460883448644, + "grad_norm": 0.27301040291786194, + "learning_rate": 1.5356328529471557e-07, + "loss": 0.1826, + "step": 6143 + }, + { + "epoch": 1.634912187333688, + "grad_norm": 0.2382994443178177, + "learning_rate": 1.535490230484144e-07, + "loss": 0.1765, + "step": 6144 + }, + { + "epoch": 1.635178286322512, + "grad_norm": 0.33735090494155884, + "learning_rate": 1.535347592747863e-07, + "loss": 0.2101, + "step": 6145 + }, + { + "epoch": 1.6354443853113358, + "grad_norm": 0.36736470460891724, + "learning_rate": 1.535204939742381e-07, + "loss": 0.1788, + "step": 6146 + }, + { + "epoch": 1.6357104843001595, + "grad_norm": 0.6067717671394348, + "learning_rate": 1.5350622714717666e-07, + "loss": 0.1671, + "step": 6147 + }, + { + "epoch": 1.6359765832889837, + "grad_norm": 0.47613295912742615, + "learning_rate": 1.5349195879400886e-07, + "loss": 0.2212, + "step": 6148 + }, + { + "epoch": 1.6362426822778073, + "grad_norm": 0.26697680354118347, + "learning_rate": 1.5347768891514167e-07, + "loss": 0.1905, + "step": 6149 + }, + { + "epoch": 1.6365087812666312, + "grad_norm": 0.4059295654296875, + "learning_rate": 1.5346341751098216e-07, + "loss": 0.202, + "step": 6150 + }, + { + "epoch": 1.6367748802554551, + "grad_norm": 0.3745461404323578, + "learning_rate": 1.5344914458193734e-07, + "loss": 0.2192, + "step": 6151 + }, + { + "epoch": 1.6370409792442788, + "grad_norm": 0.28256651759147644, + "learning_rate": 1.5343487012841433e-07, + "loss": 0.1851, + "step": 6152 + }, + { + "epoch": 1.6373070782331027, + "grad_norm": 0.266184538602829, + "learning_rate": 1.534205941508202e-07, + "loss": 0.1677, + "step": 6153 + }, + { + "epoch": 1.6375731772219266, + "grad_norm": 0.2622245252132416, + "learning_rate": 1.5340631664956217e-07, + "loss": 0.1771, + "step": 6154 + }, + { + "epoch": 1.6378392762107503, + "grad_norm": 0.26364850997924805, + "learning_rate": 1.5339203762504745e-07, + "loss": 0.1783, + "step": 6155 + }, + { + "epoch": 1.6381053751995742, + "grad_norm": 0.2758297920227051, + "learning_rate": 1.5337775707768333e-07, + "loss": 0.1936, + "step": 6156 + }, + { + "epoch": 1.6383714741883981, + "grad_norm": 0.3095303773880005, + "learning_rate": 1.533634750078771e-07, + "loss": 0.1892, + "step": 6157 + }, + { + "epoch": 1.6386375731772218, + "grad_norm": 0.282334566116333, + "learning_rate": 1.5334919141603615e-07, + "loss": 0.1909, + "step": 6158 + }, + { + "epoch": 1.638903672166046, + "grad_norm": 0.28055739402770996, + "learning_rate": 1.533349063025678e-07, + "loss": 0.2004, + "step": 6159 + }, + { + "epoch": 1.6391697711548696, + "grad_norm": 0.2973990738391876, + "learning_rate": 1.5332061966787962e-07, + "loss": 0.2021, + "step": 6160 + }, + { + "epoch": 1.6394358701436933, + "grad_norm": 0.3415314555168152, + "learning_rate": 1.5330633151237898e-07, + "loss": 0.2055, + "step": 6161 + }, + { + "epoch": 1.6397019691325174, + "grad_norm": 0.2956550717353821, + "learning_rate": 1.5329204183647342e-07, + "loss": 0.1931, + "step": 6162 + }, + { + "epoch": 1.639968068121341, + "grad_norm": 0.3550122380256653, + "learning_rate": 1.5327775064057057e-07, + "loss": 0.2046, + "step": 6163 + }, + { + "epoch": 1.640234167110165, + "grad_norm": 0.4182848632335663, + "learning_rate": 1.5326345792507796e-07, + "loss": 0.2009, + "step": 6164 + }, + { + "epoch": 1.640500266098989, + "grad_norm": 0.4812760055065155, + "learning_rate": 1.5324916369040333e-07, + "loss": 0.2042, + "step": 6165 + }, + { + "epoch": 1.6407663650878126, + "grad_norm": 0.36734262108802795, + "learning_rate": 1.532348679369543e-07, + "loss": 0.1969, + "step": 6166 + }, + { + "epoch": 1.6410324640766365, + "grad_norm": 0.26227661967277527, + "learning_rate": 1.532205706651387e-07, + "loss": 0.1834, + "step": 6167 + }, + { + "epoch": 1.6412985630654604, + "grad_norm": 0.28265267610549927, + "learning_rate": 1.5320627187536427e-07, + "loss": 0.1877, + "step": 6168 + }, + { + "epoch": 1.641564662054284, + "grad_norm": 0.26601511240005493, + "learning_rate": 1.5319197156803884e-07, + "loss": 0.1888, + "step": 6169 + }, + { + "epoch": 1.641830761043108, + "grad_norm": 0.4075879454612732, + "learning_rate": 1.531776697435703e-07, + "loss": 0.1771, + "step": 6170 + }, + { + "epoch": 1.642096860031932, + "grad_norm": 0.33806535601615906, + "learning_rate": 1.5316336640236652e-07, + "loss": 0.1827, + "step": 6171 + }, + { + "epoch": 1.6423629590207556, + "grad_norm": 0.2703101634979248, + "learning_rate": 1.5314906154483555e-07, + "loss": 0.209, + "step": 6172 + }, + { + "epoch": 1.6426290580095797, + "grad_norm": 0.4217463731765747, + "learning_rate": 1.531347551713853e-07, + "loss": 0.1964, + "step": 6173 + }, + { + "epoch": 1.6428951569984034, + "grad_norm": 0.287219762802124, + "learning_rate": 1.5312044728242386e-07, + "loss": 0.193, + "step": 6174 + }, + { + "epoch": 1.6431612559872273, + "grad_norm": 0.33337104320526123, + "learning_rate": 1.5310613787835934e-07, + "loss": 0.1784, + "step": 6175 + }, + { + "epoch": 1.6434273549760512, + "grad_norm": 0.2900780737400055, + "learning_rate": 1.5309182695959982e-07, + "loss": 0.1855, + "step": 6176 + }, + { + "epoch": 1.6436934539648749, + "grad_norm": 0.2927854061126709, + "learning_rate": 1.5307751452655353e-07, + "loss": 0.1897, + "step": 6177 + }, + { + "epoch": 1.6439595529536988, + "grad_norm": 0.27432170510292053, + "learning_rate": 1.5306320057962868e-07, + "loss": 0.197, + "step": 6178 + }, + { + "epoch": 1.6442256519425227, + "grad_norm": 0.37304314970970154, + "learning_rate": 1.5304888511923348e-07, + "loss": 0.2115, + "step": 6179 + }, + { + "epoch": 1.6444917509313464, + "grad_norm": 0.3101567029953003, + "learning_rate": 1.5303456814577632e-07, + "loss": 0.1923, + "step": 6180 + }, + { + "epoch": 1.6447578499201703, + "grad_norm": 0.2899225056171417, + "learning_rate": 1.5302024965966548e-07, + "loss": 0.1824, + "step": 6181 + }, + { + "epoch": 1.6450239489089942, + "grad_norm": 0.28659409284591675, + "learning_rate": 1.5300592966130937e-07, + "loss": 0.1898, + "step": 6182 + }, + { + "epoch": 1.6452900478978179, + "grad_norm": 0.2989765405654907, + "learning_rate": 1.5299160815111648e-07, + "loss": 0.1904, + "step": 6183 + }, + { + "epoch": 1.645556146886642, + "grad_norm": 0.28144538402557373, + "learning_rate": 1.5297728512949517e-07, + "loss": 0.1823, + "step": 6184 + }, + { + "epoch": 1.6458222458754657, + "grad_norm": 0.3580557405948639, + "learning_rate": 1.529629605968541e-07, + "loss": 0.2171, + "step": 6185 + }, + { + "epoch": 1.6460883448642896, + "grad_norm": 0.26256150007247925, + "learning_rate": 1.5294863455360175e-07, + "loss": 0.1817, + "step": 6186 + }, + { + "epoch": 1.6463544438531135, + "grad_norm": 0.2602955102920532, + "learning_rate": 1.529343070001467e-07, + "loss": 0.1911, + "step": 6187 + }, + { + "epoch": 1.6466205428419372, + "grad_norm": 0.2643367052078247, + "learning_rate": 1.5291997793689768e-07, + "loss": 0.1817, + "step": 6188 + }, + { + "epoch": 1.646886641830761, + "grad_norm": 0.2791116535663605, + "learning_rate": 1.5290564736426337e-07, + "loss": 0.1951, + "step": 6189 + }, + { + "epoch": 1.647152740819585, + "grad_norm": 0.2426036149263382, + "learning_rate": 1.5289131528265248e-07, + "loss": 0.1629, + "step": 6190 + }, + { + "epoch": 1.6474188398084086, + "grad_norm": 0.2525857985019684, + "learning_rate": 1.5287698169247377e-07, + "loss": 0.1794, + "step": 6191 + }, + { + "epoch": 1.6476849387972325, + "grad_norm": 0.3005464971065521, + "learning_rate": 1.528626465941361e-07, + "loss": 0.1904, + "step": 6192 + }, + { + "epoch": 1.6479510377860564, + "grad_norm": 0.3004387617111206, + "learning_rate": 1.5284830998804832e-07, + "loss": 0.1891, + "step": 6193 + }, + { + "epoch": 1.6482171367748801, + "grad_norm": 0.36268430948257446, + "learning_rate": 1.5283397187461936e-07, + "loss": 0.201, + "step": 6194 + }, + { + "epoch": 1.6484832357637043, + "grad_norm": 0.27673059701919556, + "learning_rate": 1.5281963225425814e-07, + "loss": 0.1859, + "step": 6195 + }, + { + "epoch": 1.648749334752528, + "grad_norm": 0.3448795676231384, + "learning_rate": 1.528052911273737e-07, + "loss": 0.1891, + "step": 6196 + }, + { + "epoch": 1.6490154337413516, + "grad_norm": 0.3580130636692047, + "learning_rate": 1.5279094849437507e-07, + "loss": 0.2004, + "step": 6197 + }, + { + "epoch": 1.6492815327301757, + "grad_norm": 0.4318599998950958, + "learning_rate": 1.527766043556713e-07, + "loss": 0.2047, + "step": 6198 + }, + { + "epoch": 1.6495476317189994, + "grad_norm": 0.3159469664096832, + "learning_rate": 1.5276225871167148e-07, + "loss": 0.181, + "step": 6199 + }, + { + "epoch": 1.6498137307078233, + "grad_norm": 0.26685506105422974, + "learning_rate": 1.5274791156278487e-07, + "loss": 0.1921, + "step": 6200 + }, + { + "epoch": 1.6500798296966472, + "grad_norm": 0.2700570225715637, + "learning_rate": 1.5273356290942063e-07, + "loss": 0.1826, + "step": 6201 + }, + { + "epoch": 1.650345928685471, + "grad_norm": 0.291838139295578, + "learning_rate": 1.5271921275198797e-07, + "loss": 0.2158, + "step": 6202 + }, + { + "epoch": 1.6506120276742948, + "grad_norm": 0.268160343170166, + "learning_rate": 1.5270486109089629e-07, + "loss": 0.1862, + "step": 6203 + }, + { + "epoch": 1.6508781266631187, + "grad_norm": 0.253345251083374, + "learning_rate": 1.5269050792655485e-07, + "loss": 0.1817, + "step": 6204 + }, + { + "epoch": 1.6511442256519424, + "grad_norm": 0.2882179617881775, + "learning_rate": 1.5267615325937306e-07, + "loss": 0.1961, + "step": 6205 + }, + { + "epoch": 1.6514103246407663, + "grad_norm": 0.3853990137577057, + "learning_rate": 1.526617970897603e-07, + "loss": 0.1907, + "step": 6206 + }, + { + "epoch": 1.6516764236295902, + "grad_norm": 0.292350172996521, + "learning_rate": 1.5264743941812612e-07, + "loss": 0.1957, + "step": 6207 + }, + { + "epoch": 1.651942522618414, + "grad_norm": 0.28203630447387695, + "learning_rate": 1.5263308024487996e-07, + "loss": 0.1937, + "step": 6208 + }, + { + "epoch": 1.652208621607238, + "grad_norm": 0.3175087571144104, + "learning_rate": 1.5261871957043139e-07, + "loss": 0.195, + "step": 6209 + }, + { + "epoch": 1.6524747205960617, + "grad_norm": 0.26492130756378174, + "learning_rate": 1.5260435739519003e-07, + "loss": 0.177, + "step": 6210 + }, + { + "epoch": 1.6527408195848856, + "grad_norm": 0.3088638484477997, + "learning_rate": 1.5258999371956547e-07, + "loss": 0.1848, + "step": 6211 + }, + { + "epoch": 1.6530069185737095, + "grad_norm": 0.2801044285297394, + "learning_rate": 1.5257562854396745e-07, + "loss": 0.1947, + "step": 6212 + }, + { + "epoch": 1.6532730175625332, + "grad_norm": 0.27753302454948425, + "learning_rate": 1.5256126186880564e-07, + "loss": 0.1842, + "step": 6213 + }, + { + "epoch": 1.653539116551357, + "grad_norm": 0.2813799977302551, + "learning_rate": 1.5254689369448986e-07, + "loss": 0.1953, + "step": 6214 + }, + { + "epoch": 1.653805215540181, + "grad_norm": 0.3821420967578888, + "learning_rate": 1.5253252402142985e-07, + "loss": 0.2005, + "step": 6215 + }, + { + "epoch": 1.6540713145290047, + "grad_norm": 0.2581382989883423, + "learning_rate": 1.5251815285003558e-07, + "loss": 0.1801, + "step": 6216 + }, + { + "epoch": 1.6543374135178286, + "grad_norm": 0.27843356132507324, + "learning_rate": 1.5250378018071678e-07, + "loss": 0.1951, + "step": 6217 + }, + { + "epoch": 1.6546035125066525, + "grad_norm": 0.3011777400970459, + "learning_rate": 1.5248940601388353e-07, + "loss": 0.1674, + "step": 6218 + }, + { + "epoch": 1.6548696114954762, + "grad_norm": 0.3338964283466339, + "learning_rate": 1.5247503034994576e-07, + "loss": 0.1913, + "step": 6219 + }, + { + "epoch": 1.6551357104843003, + "grad_norm": 0.38631775975227356, + "learning_rate": 1.5246065318931347e-07, + "loss": 0.1963, + "step": 6220 + }, + { + "epoch": 1.655401809473124, + "grad_norm": 0.39472246170043945, + "learning_rate": 1.5244627453239676e-07, + "loss": 0.1912, + "step": 6221 + }, + { + "epoch": 1.655667908461948, + "grad_norm": 0.2593798041343689, + "learning_rate": 1.5243189437960571e-07, + "loss": 0.1738, + "step": 6222 + }, + { + "epoch": 1.6559340074507718, + "grad_norm": 0.29451119899749756, + "learning_rate": 1.524175127313505e-07, + "loss": 0.1855, + "step": 6223 + }, + { + "epoch": 1.6562001064395955, + "grad_norm": 0.2578170895576477, + "learning_rate": 1.5240312958804128e-07, + "loss": 0.166, + "step": 6224 + }, + { + "epoch": 1.6564662054284194, + "grad_norm": 0.39770814776420593, + "learning_rate": 1.5238874495008832e-07, + "loss": 0.2074, + "step": 6225 + }, + { + "epoch": 1.6567323044172433, + "grad_norm": 0.25587257742881775, + "learning_rate": 1.5237435881790194e-07, + "loss": 0.1801, + "step": 6226 + }, + { + "epoch": 1.656998403406067, + "grad_norm": 0.25849243998527527, + "learning_rate": 1.5235997119189237e-07, + "loss": 0.1686, + "step": 6227 + }, + { + "epoch": 1.6572645023948909, + "grad_norm": 0.28170934319496155, + "learning_rate": 1.5234558207247005e-07, + "loss": 0.1815, + "step": 6228 + }, + { + "epoch": 1.6575306013837148, + "grad_norm": 0.41877326369285583, + "learning_rate": 1.5233119146004535e-07, + "loss": 0.2027, + "step": 6229 + }, + { + "epoch": 1.6577967003725385, + "grad_norm": 0.44740399718284607, + "learning_rate": 1.5231679935502874e-07, + "loss": 0.19, + "step": 6230 + }, + { + "epoch": 1.6580627993613626, + "grad_norm": 0.28127431869506836, + "learning_rate": 1.5230240575783063e-07, + "loss": 0.1741, + "step": 6231 + }, + { + "epoch": 1.6583288983501863, + "grad_norm": 0.4204840660095215, + "learning_rate": 1.522880106688617e-07, + "loss": 0.2084, + "step": 6232 + }, + { + "epoch": 1.65859499733901, + "grad_norm": 0.2846059203147888, + "learning_rate": 1.5227361408853242e-07, + "loss": 0.185, + "step": 6233 + }, + { + "epoch": 1.658861096327834, + "grad_norm": 0.22151042520999908, + "learning_rate": 1.5225921601725346e-07, + "loss": 0.1482, + "step": 6234 + }, + { + "epoch": 1.6591271953166578, + "grad_norm": 0.2831944227218628, + "learning_rate": 1.5224481645543545e-07, + "loss": 0.1832, + "step": 6235 + }, + { + "epoch": 1.6593932943054817, + "grad_norm": 0.27314212918281555, + "learning_rate": 1.522304154034891e-07, + "loss": 0.1846, + "step": 6236 + }, + { + "epoch": 1.6596593932943056, + "grad_norm": 0.367220938205719, + "learning_rate": 1.522160128618252e-07, + "loss": 0.1897, + "step": 6237 + }, + { + "epoch": 1.6599254922831292, + "grad_norm": 1.1826226711273193, + "learning_rate": 1.5220160883085447e-07, + "loss": 0.1962, + "step": 6238 + }, + { + "epoch": 1.6601915912719531, + "grad_norm": 0.2718920111656189, + "learning_rate": 1.521872033109878e-07, + "loss": 0.182, + "step": 6239 + }, + { + "epoch": 1.660457690260777, + "grad_norm": 0.2768304944038391, + "learning_rate": 1.5217279630263603e-07, + "loss": 0.1725, + "step": 6240 + }, + { + "epoch": 1.6607237892496007, + "grad_norm": 0.37012019753456116, + "learning_rate": 1.521583878062101e-07, + "loss": 0.202, + "step": 6241 + }, + { + "epoch": 1.6609898882384246, + "grad_norm": 0.38274288177490234, + "learning_rate": 1.5214397782212095e-07, + "loss": 0.1835, + "step": 6242 + }, + { + "epoch": 1.6612559872272485, + "grad_norm": 0.26480138301849365, + "learning_rate": 1.521295663507796e-07, + "loss": 0.1977, + "step": 6243 + }, + { + "epoch": 1.6615220862160722, + "grad_norm": 0.3848571181297302, + "learning_rate": 1.5211515339259705e-07, + "loss": 0.2174, + "step": 6244 + }, + { + "epoch": 1.6617881852048964, + "grad_norm": 0.2612881362438202, + "learning_rate": 1.5210073894798445e-07, + "loss": 0.173, + "step": 6245 + }, + { + "epoch": 1.66205428419372, + "grad_norm": 0.2653832733631134, + "learning_rate": 1.520863230173529e-07, + "loss": 0.1656, + "step": 6246 + }, + { + "epoch": 1.662320383182544, + "grad_norm": 0.29589003324508667, + "learning_rate": 1.520719056011136e-07, + "loss": 0.1897, + "step": 6247 + }, + { + "epoch": 1.6625864821713678, + "grad_norm": 0.27567028999328613, + "learning_rate": 1.5205748669967768e-07, + "loss": 0.1779, + "step": 6248 + }, + { + "epoch": 1.6628525811601915, + "grad_norm": 0.2644890546798706, + "learning_rate": 1.520430663134565e-07, + "loss": 0.1884, + "step": 6249 + }, + { + "epoch": 1.6631186801490154, + "grad_norm": 0.2740718126296997, + "learning_rate": 1.5202864444286127e-07, + "loss": 0.1826, + "step": 6250 + }, + { + "epoch": 1.6633847791378393, + "grad_norm": 0.3159954249858856, + "learning_rate": 1.5201422108830343e-07, + "loss": 0.2011, + "step": 6251 + }, + { + "epoch": 1.663650878126663, + "grad_norm": 0.2956520617008209, + "learning_rate": 1.5199979625019425e-07, + "loss": 0.2038, + "step": 6252 + }, + { + "epoch": 1.663916977115487, + "grad_norm": 0.2708260118961334, + "learning_rate": 1.5198536992894525e-07, + "loss": 0.1666, + "step": 6253 + }, + { + "epoch": 1.6641830761043108, + "grad_norm": 0.3196771740913391, + "learning_rate": 1.5197094212496783e-07, + "loss": 0.2003, + "step": 6254 + }, + { + "epoch": 1.6644491750931345, + "grad_norm": 0.28127947449684143, + "learning_rate": 1.5195651283867357e-07, + "loss": 0.1885, + "step": 6255 + }, + { + "epoch": 1.6647152740819586, + "grad_norm": 0.26860371232032776, + "learning_rate": 1.5194208207047397e-07, + "loss": 0.2007, + "step": 6256 + }, + { + "epoch": 1.6649813730707823, + "grad_norm": 0.3754587471485138, + "learning_rate": 1.5192764982078062e-07, + "loss": 0.1933, + "step": 6257 + }, + { + "epoch": 1.6652474720596062, + "grad_norm": 0.29148435592651367, + "learning_rate": 1.5191321609000517e-07, + "loss": 0.1872, + "step": 6258 + }, + { + "epoch": 1.6655135710484301, + "grad_norm": 0.32838568091392517, + "learning_rate": 1.5189878087855935e-07, + "loss": 0.1842, + "step": 6259 + }, + { + "epoch": 1.6657796700372538, + "grad_norm": 0.26560357213020325, + "learning_rate": 1.518843441868548e-07, + "loss": 0.1771, + "step": 6260 + }, + { + "epoch": 1.6660457690260777, + "grad_norm": 0.28563228249549866, + "learning_rate": 1.5186990601530332e-07, + "loss": 0.1976, + "step": 6261 + }, + { + "epoch": 1.6663118680149016, + "grad_norm": 0.5655508041381836, + "learning_rate": 1.518554663643167e-07, + "loss": 0.1987, + "step": 6262 + }, + { + "epoch": 1.6665779670037253, + "grad_norm": 0.35643404722213745, + "learning_rate": 1.5184102523430684e-07, + "loss": 0.1819, + "step": 6263 + }, + { + "epoch": 1.6668440659925492, + "grad_norm": 0.2504774332046509, + "learning_rate": 1.5182658262568558e-07, + "loss": 0.1909, + "step": 6264 + }, + { + "epoch": 1.667110164981373, + "grad_norm": 0.2553263008594513, + "learning_rate": 1.5181213853886485e-07, + "loss": 0.1741, + "step": 6265 + }, + { + "epoch": 1.6673762639701968, + "grad_norm": 0.3534398078918457, + "learning_rate": 1.5179769297425668e-07, + "loss": 0.194, + "step": 6266 + }, + { + "epoch": 1.667642362959021, + "grad_norm": 0.2734651267528534, + "learning_rate": 1.5178324593227304e-07, + "loss": 0.1993, + "step": 6267 + }, + { + "epoch": 1.6679084619478446, + "grad_norm": 0.2872273921966553, + "learning_rate": 1.5176879741332594e-07, + "loss": 0.1831, + "step": 6268 + }, + { + "epoch": 1.6681745609366685, + "grad_norm": 0.30232909321784973, + "learning_rate": 1.517543474178276e-07, + "loss": 0.1947, + "step": 6269 + }, + { + "epoch": 1.6684406599254924, + "grad_norm": 0.35424140095710754, + "learning_rate": 1.517398959461901e-07, + "loss": 0.1962, + "step": 6270 + }, + { + "epoch": 1.668706758914316, + "grad_norm": 0.34053143858909607, + "learning_rate": 1.517254429988256e-07, + "loss": 0.1855, + "step": 6271 + }, + { + "epoch": 1.66897285790314, + "grad_norm": 0.3866744637489319, + "learning_rate": 1.5171098857614637e-07, + "loss": 0.2029, + "step": 6272 + }, + { + "epoch": 1.6692389568919639, + "grad_norm": 0.2642154097557068, + "learning_rate": 1.5169653267856464e-07, + "loss": 0.1755, + "step": 6273 + }, + { + "epoch": 1.6695050558807876, + "grad_norm": 0.3460719883441925, + "learning_rate": 1.5168207530649279e-07, + "loss": 0.1914, + "step": 6274 + }, + { + "epoch": 1.6697711548696115, + "grad_norm": 0.3668050467967987, + "learning_rate": 1.5166761646034307e-07, + "loss": 0.1761, + "step": 6275 + }, + { + "epoch": 1.6700372538584354, + "grad_norm": 0.3403479754924774, + "learning_rate": 1.5165315614052798e-07, + "loss": 0.2031, + "step": 6276 + }, + { + "epoch": 1.670303352847259, + "grad_norm": 0.2780033051967621, + "learning_rate": 1.516386943474599e-07, + "loss": 0.1895, + "step": 6277 + }, + { + "epoch": 1.6705694518360832, + "grad_norm": 0.2747201919555664, + "learning_rate": 1.5162423108155133e-07, + "loss": 0.1832, + "step": 6278 + }, + { + "epoch": 1.6708355508249069, + "grad_norm": 0.406863272190094, + "learning_rate": 1.5160976634321478e-07, + "loss": 0.2142, + "step": 6279 + }, + { + "epoch": 1.6711016498137305, + "grad_norm": 0.25116121768951416, + "learning_rate": 1.5159530013286284e-07, + "loss": 0.1771, + "step": 6280 + }, + { + "epoch": 1.6713677488025547, + "grad_norm": 0.2806302309036255, + "learning_rate": 1.5158083245090808e-07, + "loss": 0.1995, + "step": 6281 + }, + { + "epoch": 1.6716338477913784, + "grad_norm": 0.301008015871048, + "learning_rate": 1.5156636329776312e-07, + "loss": 0.1929, + "step": 6282 + }, + { + "epoch": 1.6718999467802023, + "grad_norm": 0.4226575493812561, + "learning_rate": 1.5155189267384071e-07, + "loss": 0.1923, + "step": 6283 + }, + { + "epoch": 1.6721660457690262, + "grad_norm": 0.2501852214336395, + "learning_rate": 1.5153742057955358e-07, + "loss": 0.1797, + "step": 6284 + }, + { + "epoch": 1.6724321447578498, + "grad_norm": 0.3228744566440582, + "learning_rate": 1.515229470153145e-07, + "loss": 0.1882, + "step": 6285 + }, + { + "epoch": 1.6726982437466738, + "grad_norm": 0.31537753343582153, + "learning_rate": 1.5150847198153622e-07, + "loss": 0.1942, + "step": 6286 + }, + { + "epoch": 1.6729643427354977, + "grad_norm": 0.2680068016052246, + "learning_rate": 1.514939954786317e-07, + "loss": 0.1748, + "step": 6287 + }, + { + "epoch": 1.6732304417243213, + "grad_norm": 0.2462097406387329, + "learning_rate": 1.5147951750701378e-07, + "loss": 0.1766, + "step": 6288 + }, + { + "epoch": 1.6734965407131452, + "grad_norm": 0.2604915499687195, + "learning_rate": 1.5146503806709535e-07, + "loss": 0.1712, + "step": 6289 + }, + { + "epoch": 1.6737626397019691, + "grad_norm": 0.8443220853805542, + "learning_rate": 1.5145055715928953e-07, + "loss": 0.1675, + "step": 6290 + }, + { + "epoch": 1.6740287386907928, + "grad_norm": 0.2776978611946106, + "learning_rate": 1.5143607478400923e-07, + "loss": 0.1942, + "step": 6291 + }, + { + "epoch": 1.674294837679617, + "grad_norm": 0.3179949223995209, + "learning_rate": 1.5142159094166758e-07, + "loss": 0.1776, + "step": 6292 + }, + { + "epoch": 1.6745609366684406, + "grad_norm": 0.27214759588241577, + "learning_rate": 1.514071056326776e-07, + "loss": 0.2054, + "step": 6293 + }, + { + "epoch": 1.6748270356572645, + "grad_norm": 0.26389431953430176, + "learning_rate": 1.513926188574526e-07, + "loss": 0.1771, + "step": 6294 + }, + { + "epoch": 1.6750931346460884, + "grad_norm": 0.40090760588645935, + "learning_rate": 1.513781306164056e-07, + "loss": 0.2039, + "step": 6295 + }, + { + "epoch": 1.6753592336349121, + "grad_norm": 0.28704896569252014, + "learning_rate": 1.5136364090994994e-07, + "loss": 0.1806, + "step": 6296 + }, + { + "epoch": 1.675625332623736, + "grad_norm": 0.25436049699783325, + "learning_rate": 1.5134914973849883e-07, + "loss": 0.1776, + "step": 6297 + }, + { + "epoch": 1.67589143161256, + "grad_norm": 0.27525609731674194, + "learning_rate": 1.5133465710246568e-07, + "loss": 0.1869, + "step": 6298 + }, + { + "epoch": 1.6761575306013836, + "grad_norm": 0.26348763704299927, + "learning_rate": 1.5132016300226377e-07, + "loss": 0.1785, + "step": 6299 + }, + { + "epoch": 1.6764236295902075, + "grad_norm": 0.3020409047603607, + "learning_rate": 1.5130566743830652e-07, + "loss": 0.1895, + "step": 6300 + }, + { + "epoch": 1.6766897285790314, + "grad_norm": 0.4291873276233673, + "learning_rate": 1.5129117041100737e-07, + "loss": 0.1927, + "step": 6301 + }, + { + "epoch": 1.676955827567855, + "grad_norm": 0.37860235571861267, + "learning_rate": 1.512766719207798e-07, + "loss": 0.1828, + "step": 6302 + }, + { + "epoch": 1.6772219265566792, + "grad_norm": 0.5328285098075867, + "learning_rate": 1.5126217196803735e-07, + "loss": 0.1989, + "step": 6303 + }, + { + "epoch": 1.677488025545503, + "grad_norm": 0.2983826994895935, + "learning_rate": 1.512476705531936e-07, + "loss": 0.184, + "step": 6304 + }, + { + "epoch": 1.6777541245343268, + "grad_norm": 0.2664678990840912, + "learning_rate": 1.5123316767666216e-07, + "loss": 0.1848, + "step": 6305 + }, + { + "epoch": 1.6780202235231507, + "grad_norm": 0.31602492928504944, + "learning_rate": 1.5121866333885664e-07, + "loss": 0.1862, + "step": 6306 + }, + { + "epoch": 1.6782863225119744, + "grad_norm": 0.24876751005649567, + "learning_rate": 1.5120415754019076e-07, + "loss": 0.1893, + "step": 6307 + }, + { + "epoch": 1.6785524215007983, + "grad_norm": 0.3908171057701111, + "learning_rate": 1.511896502810783e-07, + "loss": 0.2005, + "step": 6308 + }, + { + "epoch": 1.6788185204896222, + "grad_norm": 0.2726259231567383, + "learning_rate": 1.5117514156193294e-07, + "loss": 0.1811, + "step": 6309 + }, + { + "epoch": 1.679084619478446, + "grad_norm": 0.30035483837127686, + "learning_rate": 1.5116063138316856e-07, + "loss": 0.1702, + "step": 6310 + }, + { + "epoch": 1.6793507184672698, + "grad_norm": 0.34190669655799866, + "learning_rate": 1.5114611974519902e-07, + "loss": 0.1703, + "step": 6311 + }, + { + "epoch": 1.6796168174560937, + "grad_norm": 0.2785361409187317, + "learning_rate": 1.5113160664843822e-07, + "loss": 0.1789, + "step": 6312 + }, + { + "epoch": 1.6798829164449174, + "grad_norm": 0.26725855469703674, + "learning_rate": 1.5111709209330008e-07, + "loss": 0.1872, + "step": 6313 + }, + { + "epoch": 1.6801490154337415, + "grad_norm": 0.2693401873111725, + "learning_rate": 1.5110257608019859e-07, + "loss": 0.1932, + "step": 6314 + }, + { + "epoch": 1.6804151144225652, + "grad_norm": 0.31216609477996826, + "learning_rate": 1.510880586095478e-07, + "loss": 0.1851, + "step": 6315 + }, + { + "epoch": 1.6806812134113889, + "grad_norm": 0.30098557472229004, + "learning_rate": 1.5107353968176177e-07, + "loss": 0.2025, + "step": 6316 + }, + { + "epoch": 1.680947312400213, + "grad_norm": 0.32468971610069275, + "learning_rate": 1.5105901929725459e-07, + "loss": 0.2018, + "step": 6317 + }, + { + "epoch": 1.6812134113890367, + "grad_norm": 0.2575266659259796, + "learning_rate": 1.5104449745644042e-07, + "loss": 0.1577, + "step": 6318 + }, + { + "epoch": 1.6814795103778606, + "grad_norm": 0.32336166501045227, + "learning_rate": 1.5102997415973345e-07, + "loss": 0.1853, + "step": 6319 + }, + { + "epoch": 1.6817456093666845, + "grad_norm": 0.33255645632743835, + "learning_rate": 1.510154494075479e-07, + "loss": 0.1986, + "step": 6320 + }, + { + "epoch": 1.6820117083555082, + "grad_norm": 0.37392669916152954, + "learning_rate": 1.5100092320029812e-07, + "loss": 0.2056, + "step": 6321 + }, + { + "epoch": 1.682277807344332, + "grad_norm": 0.29885944724082947, + "learning_rate": 1.509863955383983e-07, + "loss": 0.1909, + "step": 6322 + }, + { + "epoch": 1.682543906333156, + "grad_norm": 0.29467448592185974, + "learning_rate": 1.5097186642226294e-07, + "loss": 0.1777, + "step": 6323 + }, + { + "epoch": 1.6828100053219797, + "grad_norm": 0.342428982257843, + "learning_rate": 1.5095733585230638e-07, + "loss": 0.1971, + "step": 6324 + }, + { + "epoch": 1.6830761043108036, + "grad_norm": 0.2806563973426819, + "learning_rate": 1.50942803828943e-07, + "loss": 0.1801, + "step": 6325 + }, + { + "epoch": 1.6833422032996275, + "grad_norm": 0.2771647274494171, + "learning_rate": 1.5092827035258732e-07, + "loss": 0.1859, + "step": 6326 + }, + { + "epoch": 1.6836083022884512, + "grad_norm": 0.2940712571144104, + "learning_rate": 1.5091373542365392e-07, + "loss": 0.1782, + "step": 6327 + }, + { + "epoch": 1.6838744012772753, + "grad_norm": 0.33710092306137085, + "learning_rate": 1.508991990425573e-07, + "loss": 0.2004, + "step": 6328 + }, + { + "epoch": 1.684140500266099, + "grad_norm": 0.2570360004901886, + "learning_rate": 1.508846612097121e-07, + "loss": 0.1768, + "step": 6329 + }, + { + "epoch": 1.6844065992549229, + "grad_norm": 0.2762267589569092, + "learning_rate": 1.5087012192553296e-07, + "loss": 0.1816, + "step": 6330 + }, + { + "epoch": 1.6846726982437468, + "grad_norm": 0.44320258498191833, + "learning_rate": 1.5085558119043458e-07, + "loss": 0.2152, + "step": 6331 + }, + { + "epoch": 1.6849387972325705, + "grad_norm": 0.3895958960056305, + "learning_rate": 1.5084103900483166e-07, + "loss": 0.1835, + "step": 6332 + }, + { + "epoch": 1.6852048962213944, + "grad_norm": 0.5044550895690918, + "learning_rate": 1.5082649536913902e-07, + "loss": 0.1942, + "step": 6333 + }, + { + "epoch": 1.6854709952102183, + "grad_norm": 0.2666778266429901, + "learning_rate": 1.5081195028377143e-07, + "loss": 0.1748, + "step": 6334 + }, + { + "epoch": 1.685737094199042, + "grad_norm": 0.3371669352054596, + "learning_rate": 1.507974037491438e-07, + "loss": 0.1847, + "step": 6335 + }, + { + "epoch": 1.6860031931878658, + "grad_norm": 0.2783258557319641, + "learning_rate": 1.5078285576567092e-07, + "loss": 0.192, + "step": 6336 + }, + { + "epoch": 1.6862692921766897, + "grad_norm": 0.31091755628585815, + "learning_rate": 1.5076830633376784e-07, + "loss": 0.1847, + "step": 6337 + }, + { + "epoch": 1.6865353911655134, + "grad_norm": 0.26488956809043884, + "learning_rate": 1.5075375545384948e-07, + "loss": 0.1803, + "step": 6338 + }, + { + "epoch": 1.6868014901543376, + "grad_norm": 0.41381198167800903, + "learning_rate": 1.507392031263309e-07, + "loss": 0.1978, + "step": 6339 + }, + { + "epoch": 1.6870675891431612, + "grad_norm": 0.3788425624370575, + "learning_rate": 1.507246493516271e-07, + "loss": 0.1767, + "step": 6340 + }, + { + "epoch": 1.6873336881319851, + "grad_norm": 0.36244428157806396, + "learning_rate": 1.5071009413015326e-07, + "loss": 0.1947, + "step": 6341 + }, + { + "epoch": 1.687599787120809, + "grad_norm": 0.2589481472969055, + "learning_rate": 1.5069553746232448e-07, + "loss": 0.1821, + "step": 6342 + }, + { + "epoch": 1.6878658861096327, + "grad_norm": 0.2790309190750122, + "learning_rate": 1.5068097934855596e-07, + "loss": 0.1986, + "step": 6343 + }, + { + "epoch": 1.6881319850984566, + "grad_norm": 0.3856641352176666, + "learning_rate": 1.506664197892629e-07, + "loss": 0.1911, + "step": 6344 + }, + { + "epoch": 1.6883980840872805, + "grad_norm": 0.2844414710998535, + "learning_rate": 1.5065185878486063e-07, + "loss": 0.1852, + "step": 6345 + }, + { + "epoch": 1.6886641830761042, + "grad_norm": 0.267564594745636, + "learning_rate": 1.5063729633576438e-07, + "loss": 0.1908, + "step": 6346 + }, + { + "epoch": 1.6889302820649281, + "grad_norm": 0.27754929661750793, + "learning_rate": 1.5062273244238955e-07, + "loss": 0.2028, + "step": 6347 + }, + { + "epoch": 1.689196381053752, + "grad_norm": 0.36692342162132263, + "learning_rate": 1.506081671051515e-07, + "loss": 0.2103, + "step": 6348 + }, + { + "epoch": 1.6894624800425757, + "grad_norm": 0.2695501446723938, + "learning_rate": 1.5059360032446572e-07, + "loss": 0.1803, + "step": 6349 + }, + { + "epoch": 1.6897285790313998, + "grad_norm": 0.2802843749523163, + "learning_rate": 1.5057903210074763e-07, + "loss": 0.1831, + "step": 6350 + }, + { + "epoch": 1.6899946780202235, + "grad_norm": 0.28950411081314087, + "learning_rate": 1.5056446243441276e-07, + "loss": 0.1973, + "step": 6351 + }, + { + "epoch": 1.6902607770090472, + "grad_norm": 0.43852025270462036, + "learning_rate": 1.5054989132587667e-07, + "loss": 0.1934, + "step": 6352 + }, + { + "epoch": 1.6905268759978713, + "grad_norm": 0.26411333680152893, + "learning_rate": 1.5053531877555497e-07, + "loss": 0.1966, + "step": 6353 + }, + { + "epoch": 1.690792974986695, + "grad_norm": 0.35139214992523193, + "learning_rate": 1.5052074478386327e-07, + "loss": 0.1894, + "step": 6354 + }, + { + "epoch": 1.691059073975519, + "grad_norm": 0.2932315468788147, + "learning_rate": 1.5050616935121728e-07, + "loss": 0.2065, + "step": 6355 + }, + { + "epoch": 1.6913251729643428, + "grad_norm": 0.30459654331207275, + "learning_rate": 1.504915924780327e-07, + "loss": 0.1671, + "step": 6356 + }, + { + "epoch": 1.6915912719531665, + "grad_norm": 0.24209865927696228, + "learning_rate": 1.5047701416472528e-07, + "loss": 0.176, + "step": 6357 + }, + { + "epoch": 1.6918573709419904, + "grad_norm": 0.3452206552028656, + "learning_rate": 1.5046243441171085e-07, + "loss": 0.201, + "step": 6358 + }, + { + "epoch": 1.6921234699308143, + "grad_norm": 0.4960210919380188, + "learning_rate": 1.5044785321940524e-07, + "loss": 0.1842, + "step": 6359 + }, + { + "epoch": 1.692389568919638, + "grad_norm": 0.24921782314777374, + "learning_rate": 1.504332705882244e-07, + "loss": 0.1645, + "step": 6360 + }, + { + "epoch": 1.6926556679084621, + "grad_norm": 0.27805083990097046, + "learning_rate": 1.5041868651858413e-07, + "loss": 0.1846, + "step": 6361 + }, + { + "epoch": 1.6929217668972858, + "grad_norm": 0.3939339220523834, + "learning_rate": 1.504041010109005e-07, + "loss": 0.1792, + "step": 6362 + }, + { + "epoch": 1.6931878658861095, + "grad_norm": 0.28845933079719543, + "learning_rate": 1.5038951406558946e-07, + "loss": 0.1742, + "step": 6363 + }, + { + "epoch": 1.6934539648749336, + "grad_norm": 0.24728354811668396, + "learning_rate": 1.5037492568306708e-07, + "loss": 0.1856, + "step": 6364 + }, + { + "epoch": 1.6937200638637573, + "grad_norm": 0.2812005579471588, + "learning_rate": 1.503603358637495e-07, + "loss": 0.1936, + "step": 6365 + }, + { + "epoch": 1.6939861628525812, + "grad_norm": 0.3414524495601654, + "learning_rate": 1.5034574460805275e-07, + "loss": 0.1947, + "step": 6366 + }, + { + "epoch": 1.694252261841405, + "grad_norm": 0.3216850757598877, + "learning_rate": 1.503311519163931e-07, + "loss": 0.1751, + "step": 6367 + }, + { + "epoch": 1.6945183608302288, + "grad_norm": 0.2650538384914398, + "learning_rate": 1.5031655778918667e-07, + "loss": 0.1951, + "step": 6368 + }, + { + "epoch": 1.6947844598190527, + "grad_norm": 0.33169153332710266, + "learning_rate": 1.5030196222684978e-07, + "loss": 0.2037, + "step": 6369 + }, + { + "epoch": 1.6950505588078766, + "grad_norm": 0.3302684724330902, + "learning_rate": 1.5028736522979872e-07, + "loss": 0.1954, + "step": 6370 + }, + { + "epoch": 1.6953166577967003, + "grad_norm": 0.2966825067996979, + "learning_rate": 1.5027276679844985e-07, + "loss": 0.1995, + "step": 6371 + }, + { + "epoch": 1.6955827567855242, + "grad_norm": 0.27990129590034485, + "learning_rate": 1.5025816693321944e-07, + "loss": 0.1934, + "step": 6372 + }, + { + "epoch": 1.695848855774348, + "grad_norm": 0.28349944949150085, + "learning_rate": 1.5024356563452402e-07, + "loss": 0.2089, + "step": 6373 + }, + { + "epoch": 1.6961149547631718, + "grad_norm": 0.26361408829689026, + "learning_rate": 1.5022896290278003e-07, + "loss": 0.187, + "step": 6374 + }, + { + "epoch": 1.6963810537519959, + "grad_norm": 0.25832313299179077, + "learning_rate": 1.5021435873840388e-07, + "loss": 0.1746, + "step": 6375 + }, + { + "epoch": 1.6966471527408196, + "grad_norm": 0.2612083852291107, + "learning_rate": 1.5019975314181223e-07, + "loss": 0.1753, + "step": 6376 + }, + { + "epoch": 1.6969132517296435, + "grad_norm": 0.30602169036865234, + "learning_rate": 1.501851461134216e-07, + "loss": 0.203, + "step": 6377 + }, + { + "epoch": 1.6971793507184674, + "grad_norm": 0.28290632367134094, + "learning_rate": 1.5017053765364862e-07, + "loss": 0.1955, + "step": 6378 + }, + { + "epoch": 1.697445449707291, + "grad_norm": 0.25979891419410706, + "learning_rate": 1.5015592776290996e-07, + "loss": 0.1863, + "step": 6379 + }, + { + "epoch": 1.697711548696115, + "grad_norm": 0.25952082872390747, + "learning_rate": 1.501413164416223e-07, + "loss": 0.1787, + "step": 6380 + }, + { + "epoch": 1.6979776476849389, + "grad_norm": 0.26967278122901917, + "learning_rate": 1.5012670369020243e-07, + "loss": 0.2061, + "step": 6381 + }, + { + "epoch": 1.6982437466737625, + "grad_norm": 0.27470213174819946, + "learning_rate": 1.501120895090671e-07, + "loss": 0.1684, + "step": 6382 + }, + { + "epoch": 1.6985098456625864, + "grad_norm": 0.34427353739738464, + "learning_rate": 1.5009747389863315e-07, + "loss": 0.2097, + "step": 6383 + }, + { + "epoch": 1.6987759446514104, + "grad_norm": 0.2576892077922821, + "learning_rate": 1.500828568593174e-07, + "loss": 0.1752, + "step": 6384 + }, + { + "epoch": 1.699042043640234, + "grad_norm": 0.3428056240081787, + "learning_rate": 1.5006823839153686e-07, + "loss": 0.1899, + "step": 6385 + }, + { + "epoch": 1.6993081426290582, + "grad_norm": 0.3272881507873535, + "learning_rate": 1.5005361849570837e-07, + "loss": 0.1931, + "step": 6386 + }, + { + "epoch": 1.6995742416178818, + "grad_norm": 0.3869066536426544, + "learning_rate": 1.5003899717224897e-07, + "loss": 0.175, + "step": 6387 + }, + { + "epoch": 1.6998403406067057, + "grad_norm": 0.2854072153568268, + "learning_rate": 1.500243744215757e-07, + "loss": 0.2001, + "step": 6388 + }, + { + "epoch": 1.7001064395955297, + "grad_norm": 0.26038169860839844, + "learning_rate": 1.5000975024410566e-07, + "loss": 0.1695, + "step": 6389 + }, + { + "epoch": 1.7003725385843533, + "grad_norm": 0.28540298342704773, + "learning_rate": 1.4999512464025586e-07, + "loss": 0.1913, + "step": 6390 + }, + { + "epoch": 1.7006386375731772, + "grad_norm": 0.29154321551322937, + "learning_rate": 1.4998049761044352e-07, + "loss": 0.1691, + "step": 6391 + }, + { + "epoch": 1.7009047365620011, + "grad_norm": 0.3620639145374298, + "learning_rate": 1.4996586915508584e-07, + "loss": 0.191, + "step": 6392 + }, + { + "epoch": 1.7011708355508248, + "grad_norm": 0.2737441658973694, + "learning_rate": 1.499512392746e-07, + "loss": 0.1905, + "step": 6393 + }, + { + "epoch": 1.7014369345396487, + "grad_norm": 0.2536166310310364, + "learning_rate": 1.4993660796940335e-07, + "loss": 0.1673, + "step": 6394 + }, + { + "epoch": 1.7017030335284726, + "grad_norm": 0.33620160818099976, + "learning_rate": 1.4992197523991313e-07, + "loss": 0.1677, + "step": 6395 + }, + { + "epoch": 1.7019691325172963, + "grad_norm": 0.2721651494503021, + "learning_rate": 1.4990734108654677e-07, + "loss": 0.1799, + "step": 6396 + }, + { + "epoch": 1.7022352315061204, + "grad_norm": 0.2704487144947052, + "learning_rate": 1.4989270550972163e-07, + "loss": 0.1947, + "step": 6397 + }, + { + "epoch": 1.7025013304949441, + "grad_norm": 0.3332071900367737, + "learning_rate": 1.4987806850985507e-07, + "loss": 0.1956, + "step": 6398 + }, + { + "epoch": 1.7027674294837678, + "grad_norm": 0.2927371859550476, + "learning_rate": 1.498634300873647e-07, + "loss": 0.1844, + "step": 6399 + }, + { + "epoch": 1.703033528472592, + "grad_norm": 0.4691876769065857, + "learning_rate": 1.49848790242668e-07, + "loss": 0.2133, + "step": 6400 + }, + { + "epoch": 1.7032996274614156, + "grad_norm": 0.30932560563087463, + "learning_rate": 1.4983414897618244e-07, + "loss": 0.175, + "step": 6401 + }, + { + "epoch": 1.7035657264502395, + "grad_norm": 0.27354246377944946, + "learning_rate": 1.498195062883257e-07, + "loss": 0.1754, + "step": 6402 + }, + { + "epoch": 1.7038318254390634, + "grad_norm": 0.3245543837547302, + "learning_rate": 1.498048621795154e-07, + "loss": 0.1968, + "step": 6403 + }, + { + "epoch": 1.704097924427887, + "grad_norm": 0.2818162441253662, + "learning_rate": 1.4979021665016926e-07, + "loss": 0.1887, + "step": 6404 + }, + { + "epoch": 1.704364023416711, + "grad_norm": 0.2802056670188904, + "learning_rate": 1.4977556970070496e-07, + "loss": 0.19, + "step": 6405 + }, + { + "epoch": 1.704630122405535, + "grad_norm": 0.2815796434879303, + "learning_rate": 1.4976092133154023e-07, + "loss": 0.2129, + "step": 6406 + }, + { + "epoch": 1.7048962213943586, + "grad_norm": 0.30661749839782715, + "learning_rate": 1.497462715430929e-07, + "loss": 0.1889, + "step": 6407 + }, + { + "epoch": 1.7051623203831825, + "grad_norm": 0.3793058395385742, + "learning_rate": 1.4973162033578086e-07, + "loss": 0.2094, + "step": 6408 + }, + { + "epoch": 1.7054284193720064, + "grad_norm": 0.27290603518486023, + "learning_rate": 1.497169677100219e-07, + "loss": 0.189, + "step": 6409 + }, + { + "epoch": 1.70569451836083, + "grad_norm": 0.2523179054260254, + "learning_rate": 1.4970231366623403e-07, + "loss": 0.1825, + "step": 6410 + }, + { + "epoch": 1.7059606173496542, + "grad_norm": 0.2529574930667877, + "learning_rate": 1.4968765820483516e-07, + "loss": 0.1737, + "step": 6411 + }, + { + "epoch": 1.7062267163384779, + "grad_norm": 0.3055025041103363, + "learning_rate": 1.4967300132624329e-07, + "loss": 0.1789, + "step": 6412 + }, + { + "epoch": 1.7064928153273018, + "grad_norm": 0.24272292852401733, + "learning_rate": 1.496583430308765e-07, + "loss": 0.1784, + "step": 6413 + }, + { + "epoch": 1.7067589143161257, + "grad_norm": 0.2478296160697937, + "learning_rate": 1.4964368331915286e-07, + "loss": 0.172, + "step": 6414 + }, + { + "epoch": 1.7070250133049494, + "grad_norm": 0.29652461409568787, + "learning_rate": 1.4962902219149048e-07, + "loss": 0.2191, + "step": 6415 + }, + { + "epoch": 1.7072911122937733, + "grad_norm": 0.3319523334503174, + "learning_rate": 1.4961435964830754e-07, + "loss": 0.1774, + "step": 6416 + }, + { + "epoch": 1.7075572112825972, + "grad_norm": 0.2824339270591736, + "learning_rate": 1.4959969569002223e-07, + "loss": 0.1977, + "step": 6417 + }, + { + "epoch": 1.7078233102714209, + "grad_norm": 0.3287297189235687, + "learning_rate": 1.4958503031705283e-07, + "loss": 0.2038, + "step": 6418 + }, + { + "epoch": 1.7080894092602448, + "grad_norm": 0.2685225307941437, + "learning_rate": 1.495703635298176e-07, + "loss": 0.1771, + "step": 6419 + }, + { + "epoch": 1.7083555082490687, + "grad_norm": 0.2549404203891754, + "learning_rate": 1.4955569532873486e-07, + "loss": 0.1597, + "step": 6420 + }, + { + "epoch": 1.7086216072378924, + "grad_norm": 0.3964279890060425, + "learning_rate": 1.4954102571422299e-07, + "loss": 0.1892, + "step": 6421 + }, + { + "epoch": 1.7088877062267165, + "grad_norm": 0.2993737757205963, + "learning_rate": 1.495263546867004e-07, + "loss": 0.1883, + "step": 6422 + }, + { + "epoch": 1.7091538052155402, + "grad_norm": 0.28066328167915344, + "learning_rate": 1.495116822465855e-07, + "loss": 0.1863, + "step": 6423 + }, + { + "epoch": 1.709419904204364, + "grad_norm": 0.2900867462158203, + "learning_rate": 1.4949700839429686e-07, + "loss": 0.1868, + "step": 6424 + }, + { + "epoch": 1.709686003193188, + "grad_norm": 0.36536654829978943, + "learning_rate": 1.4948233313025295e-07, + "loss": 0.2021, + "step": 6425 + }, + { + "epoch": 1.7099521021820117, + "grad_norm": 0.40276825428009033, + "learning_rate": 1.4946765645487233e-07, + "loss": 0.181, + "step": 6426 + }, + { + "epoch": 1.7102182011708356, + "grad_norm": 0.3193678855895996, + "learning_rate": 1.4945297836857364e-07, + "loss": 0.1955, + "step": 6427 + }, + { + "epoch": 1.7104843001596595, + "grad_norm": 0.28198128938674927, + "learning_rate": 1.4943829887177548e-07, + "loss": 0.1726, + "step": 6428 + }, + { + "epoch": 1.7107503991484831, + "grad_norm": 0.28450286388397217, + "learning_rate": 1.494236179648966e-07, + "loss": 0.204, + "step": 6429 + }, + { + "epoch": 1.711016498137307, + "grad_norm": 0.3115438222885132, + "learning_rate": 1.494089356483557e-07, + "loss": 0.1958, + "step": 6430 + }, + { + "epoch": 1.711282597126131, + "grad_norm": 0.28830623626708984, + "learning_rate": 1.493942519225715e-07, + "loss": 0.1932, + "step": 6431 + }, + { + "epoch": 1.7115486961149546, + "grad_norm": 0.31242039799690247, + "learning_rate": 1.4937956678796293e-07, + "loss": 0.1872, + "step": 6432 + }, + { + "epoch": 1.7118147951037788, + "grad_norm": 0.2674826681613922, + "learning_rate": 1.4936488024494875e-07, + "loss": 0.1721, + "step": 6433 + }, + { + "epoch": 1.7120808940926024, + "grad_norm": 0.26196184754371643, + "learning_rate": 1.4935019229394786e-07, + "loss": 0.1717, + "step": 6434 + }, + { + "epoch": 1.7123469930814261, + "grad_norm": 0.27186766266822815, + "learning_rate": 1.4933550293537923e-07, + "loss": 0.1929, + "step": 6435 + }, + { + "epoch": 1.7126130920702503, + "grad_norm": 0.6338654160499573, + "learning_rate": 1.4932081216966178e-07, + "loss": 0.1882, + "step": 6436 + }, + { + "epoch": 1.712879191059074, + "grad_norm": 0.31282833218574524, + "learning_rate": 1.4930611999721454e-07, + "loss": 0.179, + "step": 6437 + }, + { + "epoch": 1.7131452900478978, + "grad_norm": 0.2716451585292816, + "learning_rate": 1.4929142641845656e-07, + "loss": 0.185, + "step": 6438 + }, + { + "epoch": 1.7134113890367217, + "grad_norm": 0.253803551197052, + "learning_rate": 1.4927673143380694e-07, + "loss": 0.1655, + "step": 6439 + }, + { + "epoch": 1.7136774880255454, + "grad_norm": 0.2687947452068329, + "learning_rate": 1.4926203504368482e-07, + "loss": 0.1927, + "step": 6440 + }, + { + "epoch": 1.7139435870143693, + "grad_norm": 0.4129619598388672, + "learning_rate": 1.4924733724850932e-07, + "loss": 0.2048, + "step": 6441 + }, + { + "epoch": 1.7142096860031932, + "grad_norm": 0.2654252052307129, + "learning_rate": 1.4923263804869972e-07, + "loss": 0.1844, + "step": 6442 + }, + { + "epoch": 1.714475784992017, + "grad_norm": 0.29400116205215454, + "learning_rate": 1.4921793744467524e-07, + "loss": 0.1946, + "step": 6443 + }, + { + "epoch": 1.7147418839808408, + "grad_norm": 0.26324284076690674, + "learning_rate": 1.4920323543685515e-07, + "loss": 0.1839, + "step": 6444 + }, + { + "epoch": 1.7150079829696647, + "grad_norm": 0.38389402627944946, + "learning_rate": 1.491885320256588e-07, + "loss": 0.1944, + "step": 6445 + }, + { + "epoch": 1.7152740819584884, + "grad_norm": 0.2930542230606079, + "learning_rate": 1.4917382721150555e-07, + "loss": 0.1948, + "step": 6446 + }, + { + "epoch": 1.7155401809473125, + "grad_norm": 0.2928736209869385, + "learning_rate": 1.4915912099481484e-07, + "loss": 0.1979, + "step": 6447 + }, + { + "epoch": 1.7158062799361362, + "grad_norm": 0.26272282004356384, + "learning_rate": 1.491444133760061e-07, + "loss": 0.1895, + "step": 6448 + }, + { + "epoch": 1.7160723789249601, + "grad_norm": 0.26578429341316223, + "learning_rate": 1.491297043554988e-07, + "loss": 0.1941, + "step": 6449 + }, + { + "epoch": 1.716338477913784, + "grad_norm": 0.2632911503314972, + "learning_rate": 1.4911499393371255e-07, + "loss": 0.1891, + "step": 6450 + }, + { + "epoch": 1.7166045769026077, + "grad_norm": 0.3340086042881012, + "learning_rate": 1.4910028211106682e-07, + "loss": 0.1995, + "step": 6451 + }, + { + "epoch": 1.7168706758914316, + "grad_norm": 0.7505632638931274, + "learning_rate": 1.4908556888798128e-07, + "loss": 0.1953, + "step": 6452 + }, + { + "epoch": 1.7171367748802555, + "grad_norm": 0.27924591302871704, + "learning_rate": 1.4907085426487557e-07, + "loss": 0.1904, + "step": 6453 + }, + { + "epoch": 1.7174028738690792, + "grad_norm": 0.28590548038482666, + "learning_rate": 1.4905613824216937e-07, + "loss": 0.195, + "step": 6454 + }, + { + "epoch": 1.717668972857903, + "grad_norm": 0.431984007358551, + "learning_rate": 1.4904142082028246e-07, + "loss": 0.2224, + "step": 6455 + }, + { + "epoch": 1.717935071846727, + "grad_norm": 0.3534572124481201, + "learning_rate": 1.490267019996345e-07, + "loss": 0.2156, + "step": 6456 + }, + { + "epoch": 1.7182011708355507, + "grad_norm": 0.27411776781082153, + "learning_rate": 1.4901198178064544e-07, + "loss": 0.2003, + "step": 6457 + }, + { + "epoch": 1.7184672698243748, + "grad_norm": 0.4445989727973938, + "learning_rate": 1.4899726016373503e-07, + "loss": 0.1828, + "step": 6458 + }, + { + "epoch": 1.7187333688131985, + "grad_norm": 0.24420373141765594, + "learning_rate": 1.4898253714932322e-07, + "loss": 0.1779, + "step": 6459 + }, + { + "epoch": 1.7189994678020224, + "grad_norm": 0.28620705008506775, + "learning_rate": 1.4896781273782988e-07, + "loss": 0.1904, + "step": 6460 + }, + { + "epoch": 1.7192655667908463, + "grad_norm": 0.33524835109710693, + "learning_rate": 1.4895308692967503e-07, + "loss": 0.1934, + "step": 6461 + }, + { + "epoch": 1.71953166577967, + "grad_norm": 0.29929861426353455, + "learning_rate": 1.489383597252787e-07, + "loss": 0.1945, + "step": 6462 + }, + { + "epoch": 1.7197977647684939, + "grad_norm": 0.3769231140613556, + "learning_rate": 1.4892363112506084e-07, + "loss": 0.2191, + "step": 6463 + }, + { + "epoch": 1.7200638637573178, + "grad_norm": 0.28201186656951904, + "learning_rate": 1.489089011294416e-07, + "loss": 0.1861, + "step": 6464 + }, + { + "epoch": 1.7203299627461415, + "grad_norm": 0.3817945420742035, + "learning_rate": 1.4889416973884115e-07, + "loss": 0.1916, + "step": 6465 + }, + { + "epoch": 1.7205960617349654, + "grad_norm": 0.3252500891685486, + "learning_rate": 1.4887943695367962e-07, + "loss": 0.197, + "step": 6466 + }, + { + "epoch": 1.7208621607237893, + "grad_norm": 0.2867850363254547, + "learning_rate": 1.488647027743772e-07, + "loss": 0.1851, + "step": 6467 + }, + { + "epoch": 1.721128259712613, + "grad_norm": 0.2945544123649597, + "learning_rate": 1.4884996720135416e-07, + "loss": 0.2109, + "step": 6468 + }, + { + "epoch": 1.721394358701437, + "grad_norm": 0.28646236658096313, + "learning_rate": 1.488352302350308e-07, + "loss": 0.177, + "step": 6469 + }, + { + "epoch": 1.7216604576902608, + "grad_norm": 0.26970845460891724, + "learning_rate": 1.4882049187582745e-07, + "loss": 0.1877, + "step": 6470 + }, + { + "epoch": 1.7219265566790847, + "grad_norm": 0.25638318061828613, + "learning_rate": 1.4880575212416443e-07, + "loss": 0.1839, + "step": 6471 + }, + { + "epoch": 1.7221926556679086, + "grad_norm": 0.2950208783149719, + "learning_rate": 1.4879101098046222e-07, + "loss": 0.1863, + "step": 6472 + }, + { + "epoch": 1.7224587546567323, + "grad_norm": 0.4972326457500458, + "learning_rate": 1.487762684451412e-07, + "loss": 0.1705, + "step": 6473 + }, + { + "epoch": 1.7227248536455562, + "grad_norm": 0.5180774331092834, + "learning_rate": 1.4876152451862187e-07, + "loss": 0.1797, + "step": 6474 + }, + { + "epoch": 1.72299095263438, + "grad_norm": 0.2640141546726227, + "learning_rate": 1.487467792013248e-07, + "loss": 0.1857, + "step": 6475 + }, + { + "epoch": 1.7232570516232038, + "grad_norm": 0.32860326766967773, + "learning_rate": 1.4873203249367052e-07, + "loss": 0.2058, + "step": 6476 + }, + { + "epoch": 1.7235231506120277, + "grad_norm": 0.4236132502555847, + "learning_rate": 1.4871728439607964e-07, + "loss": 0.1892, + "step": 6477 + }, + { + "epoch": 1.7237892496008516, + "grad_norm": 0.2725561559200287, + "learning_rate": 1.4870253490897282e-07, + "loss": 0.1899, + "step": 6478 + }, + { + "epoch": 1.7240553485896752, + "grad_norm": 0.26223358511924744, + "learning_rate": 1.4868778403277075e-07, + "loss": 0.1793, + "step": 6479 + }, + { + "epoch": 1.7243214475784994, + "grad_norm": 0.28892093896865845, + "learning_rate": 1.4867303176789413e-07, + "loss": 0.2017, + "step": 6480 + }, + { + "epoch": 1.724587546567323, + "grad_norm": 0.30839571356773376, + "learning_rate": 1.4865827811476374e-07, + "loss": 0.1814, + "step": 6481 + }, + { + "epoch": 1.7248536455561467, + "grad_norm": 0.4162861704826355, + "learning_rate": 1.4864352307380035e-07, + "loss": 0.1981, + "step": 6482 + }, + { + "epoch": 1.7251197445449709, + "grad_norm": 0.29394710063934326, + "learning_rate": 1.4862876664542486e-07, + "loss": 0.1976, + "step": 6483 + }, + { + "epoch": 1.7253858435337945, + "grad_norm": 0.2661214768886566, + "learning_rate": 1.4861400883005813e-07, + "loss": 0.1983, + "step": 6484 + }, + { + "epoch": 1.7256519425226184, + "grad_norm": 0.27715617418289185, + "learning_rate": 1.4859924962812104e-07, + "loss": 0.1788, + "step": 6485 + }, + { + "epoch": 1.7259180415114423, + "grad_norm": 0.2909741699695587, + "learning_rate": 1.4858448904003464e-07, + "loss": 0.1985, + "step": 6486 + }, + { + "epoch": 1.726184140500266, + "grad_norm": 0.270742803812027, + "learning_rate": 1.4856972706621987e-07, + "loss": 0.1831, + "step": 6487 + }, + { + "epoch": 1.72645023948909, + "grad_norm": 0.3138522207736969, + "learning_rate": 1.485549637070978e-07, + "loss": 0.1853, + "step": 6488 + }, + { + "epoch": 1.7267163384779138, + "grad_norm": 0.3792443871498108, + "learning_rate": 1.4854019896308944e-07, + "loss": 0.2084, + "step": 6489 + }, + { + "epoch": 1.7269824374667375, + "grad_norm": 0.3491807281970978, + "learning_rate": 1.4852543283461602e-07, + "loss": 0.2001, + "step": 6490 + }, + { + "epoch": 1.7272485364555614, + "grad_norm": 0.27479004859924316, + "learning_rate": 1.4851066532209865e-07, + "loss": 0.1716, + "step": 6491 + }, + { + "epoch": 1.7275146354443853, + "grad_norm": 0.4775432050228119, + "learning_rate": 1.484958964259585e-07, + "loss": 0.1818, + "step": 6492 + }, + { + "epoch": 1.727780734433209, + "grad_norm": 0.2644430696964264, + "learning_rate": 1.4848112614661685e-07, + "loss": 0.178, + "step": 6493 + }, + { + "epoch": 1.7280468334220331, + "grad_norm": 0.29584118723869324, + "learning_rate": 1.4846635448449497e-07, + "loss": 0.2067, + "step": 6494 + }, + { + "epoch": 1.7283129324108568, + "grad_norm": 0.37181007862091064, + "learning_rate": 1.4845158144001416e-07, + "loss": 0.1859, + "step": 6495 + }, + { + "epoch": 1.7285790313996807, + "grad_norm": 0.3219048082828522, + "learning_rate": 1.484368070135958e-07, + "loss": 0.2097, + "step": 6496 + }, + { + "epoch": 1.7288451303885046, + "grad_norm": 0.29838600754737854, + "learning_rate": 1.4842203120566125e-07, + "loss": 0.2085, + "step": 6497 + }, + { + "epoch": 1.7291112293773283, + "grad_norm": 0.3510553538799286, + "learning_rate": 1.48407254016632e-07, + "loss": 0.1954, + "step": 6498 + }, + { + "epoch": 1.7293773283661522, + "grad_norm": 0.40134310722351074, + "learning_rate": 1.4839247544692947e-07, + "loss": 0.226, + "step": 6499 + }, + { + "epoch": 1.7296434273549761, + "grad_norm": 0.2935886085033417, + "learning_rate": 1.4837769549697523e-07, + "loss": 0.1805, + "step": 6500 + }, + { + "epoch": 1.7299095263437998, + "grad_norm": 0.28133127093315125, + "learning_rate": 1.483629141671908e-07, + "loss": 0.1695, + "step": 6501 + }, + { + "epoch": 1.7301756253326237, + "grad_norm": 0.3706282675266266, + "learning_rate": 1.4834813145799778e-07, + "loss": 0.1813, + "step": 6502 + }, + { + "epoch": 1.7304417243214476, + "grad_norm": 0.29652947187423706, + "learning_rate": 1.4833334736981778e-07, + "loss": 0.196, + "step": 6503 + }, + { + "epoch": 1.7307078233102713, + "grad_norm": 0.3018967807292938, + "learning_rate": 1.483185619030725e-07, + "loss": 0.1785, + "step": 6504 + }, + { + "epoch": 1.7309739222990954, + "grad_norm": 0.29694804549217224, + "learning_rate": 1.4830377505818368e-07, + "loss": 0.1989, + "step": 6505 + }, + { + "epoch": 1.731240021287919, + "grad_norm": 0.3431495130062103, + "learning_rate": 1.48288986835573e-07, + "loss": 0.1649, + "step": 6506 + }, + { + "epoch": 1.731506120276743, + "grad_norm": 0.2749384045600891, + "learning_rate": 1.482741972356623e-07, + "loss": 0.1792, + "step": 6507 + }, + { + "epoch": 1.731772219265567, + "grad_norm": 0.30712929368019104, + "learning_rate": 1.482594062588734e-07, + "loss": 0.185, + "step": 6508 + }, + { + "epoch": 1.7320383182543906, + "grad_norm": 0.2482057362794876, + "learning_rate": 1.4824461390562819e-07, + "loss": 0.1704, + "step": 6509 + }, + { + "epoch": 1.7323044172432145, + "grad_norm": 0.26380249857902527, + "learning_rate": 1.4822982017634853e-07, + "loss": 0.1861, + "step": 6510 + }, + { + "epoch": 1.7325705162320384, + "grad_norm": 0.2571692168712616, + "learning_rate": 1.4821502507145638e-07, + "loss": 0.1859, + "step": 6511 + }, + { + "epoch": 1.732836615220862, + "grad_norm": 0.2930099368095398, + "learning_rate": 1.4820022859137376e-07, + "loss": 0.1935, + "step": 6512 + }, + { + "epoch": 1.733102714209686, + "grad_norm": 0.27632489800453186, + "learning_rate": 1.4818543073652265e-07, + "loss": 0.1822, + "step": 6513 + }, + { + "epoch": 1.7333688131985099, + "grad_norm": 0.4252636432647705, + "learning_rate": 1.4817063150732514e-07, + "loss": 0.1957, + "step": 6514 + }, + { + "epoch": 1.7336349121873336, + "grad_norm": 0.31241777539253235, + "learning_rate": 1.4815583090420337e-07, + "loss": 0.2005, + "step": 6515 + }, + { + "epoch": 1.7339010111761577, + "grad_norm": 0.27236616611480713, + "learning_rate": 1.481410289275794e-07, + "loss": 0.1881, + "step": 6516 + }, + { + "epoch": 1.7341671101649814, + "grad_norm": 0.25658631324768066, + "learning_rate": 1.4812622557787548e-07, + "loss": 0.1698, + "step": 6517 + }, + { + "epoch": 1.734433209153805, + "grad_norm": 0.3331086337566376, + "learning_rate": 1.481114208555138e-07, + "loss": 0.1818, + "step": 6518 + }, + { + "epoch": 1.7346993081426292, + "grad_norm": 0.26684093475341797, + "learning_rate": 1.4809661476091665e-07, + "loss": 0.2, + "step": 6519 + }, + { + "epoch": 1.7349654071314529, + "grad_norm": 0.29676952958106995, + "learning_rate": 1.480818072945063e-07, + "loss": 0.2076, + "step": 6520 + }, + { + "epoch": 1.7352315061202768, + "grad_norm": 0.28780949115753174, + "learning_rate": 1.4806699845670507e-07, + "loss": 0.195, + "step": 6521 + }, + { + "epoch": 1.7354976051091007, + "grad_norm": 0.38892847299575806, + "learning_rate": 1.4805218824793536e-07, + "loss": 0.2093, + "step": 6522 + }, + { + "epoch": 1.7357637040979244, + "grad_norm": 0.33447393774986267, + "learning_rate": 1.4803737666861965e-07, + "loss": 0.183, + "step": 6523 + }, + { + "epoch": 1.7360298030867483, + "grad_norm": 0.3302581012248993, + "learning_rate": 1.4802256371918032e-07, + "loss": 0.1935, + "step": 6524 + }, + { + "epoch": 1.7362959020755722, + "grad_norm": 0.4511553645133972, + "learning_rate": 1.4800774940003987e-07, + "loss": 0.2002, + "step": 6525 + }, + { + "epoch": 1.7365620010643958, + "grad_norm": 0.2810763418674469, + "learning_rate": 1.4799293371162085e-07, + "loss": 0.1902, + "step": 6526 + }, + { + "epoch": 1.7368281000532197, + "grad_norm": 0.2601986527442932, + "learning_rate": 1.4797811665434584e-07, + "loss": 0.1819, + "step": 6527 + }, + { + "epoch": 1.7370941990420437, + "grad_norm": 0.4720827341079712, + "learning_rate": 1.479632982286375e-07, + "loss": 0.2196, + "step": 6528 + }, + { + "epoch": 1.7373602980308673, + "grad_norm": 0.8594781160354614, + "learning_rate": 1.4794847843491837e-07, + "loss": 0.1725, + "step": 6529 + }, + { + "epoch": 1.7376263970196915, + "grad_norm": 0.367341548204422, + "learning_rate": 1.479336572736112e-07, + "loss": 0.2237, + "step": 6530 + }, + { + "epoch": 1.7378924960085151, + "grad_norm": 0.2670416533946991, + "learning_rate": 1.4791883474513874e-07, + "loss": 0.1937, + "step": 6531 + }, + { + "epoch": 1.738158594997339, + "grad_norm": 0.3487668037414551, + "learning_rate": 1.479040108499237e-07, + "loss": 0.175, + "step": 6532 + }, + { + "epoch": 1.738424693986163, + "grad_norm": 0.2601897716522217, + "learning_rate": 1.4788918558838897e-07, + "loss": 0.1872, + "step": 6533 + }, + { + "epoch": 1.7386907929749866, + "grad_norm": 0.3532499372959137, + "learning_rate": 1.4787435896095734e-07, + "loss": 0.189, + "step": 6534 + }, + { + "epoch": 1.7389568919638105, + "grad_norm": 0.33449095487594604, + "learning_rate": 1.478595309680517e-07, + "loss": 0.1964, + "step": 6535 + }, + { + "epoch": 1.7392229909526344, + "grad_norm": 0.33952370285987854, + "learning_rate": 1.4784470161009497e-07, + "loss": 0.1903, + "step": 6536 + }, + { + "epoch": 1.7394890899414581, + "grad_norm": 0.42873015999794006, + "learning_rate": 1.4782987088751014e-07, + "loss": 0.2216, + "step": 6537 + }, + { + "epoch": 1.739755188930282, + "grad_norm": 0.3897952735424042, + "learning_rate": 1.478150388007202e-07, + "loss": 0.1901, + "step": 6538 + }, + { + "epoch": 1.740021287919106, + "grad_norm": 0.2696506679058075, + "learning_rate": 1.4780020535014816e-07, + "loss": 0.2012, + "step": 6539 + }, + { + "epoch": 1.7402873869079296, + "grad_norm": 0.3262718915939331, + "learning_rate": 1.4778537053621716e-07, + "loss": 0.1846, + "step": 6540 + }, + { + "epoch": 1.7405534858967537, + "grad_norm": 0.4226069748401642, + "learning_rate": 1.4777053435935027e-07, + "loss": 0.2011, + "step": 6541 + }, + { + "epoch": 1.7408195848855774, + "grad_norm": 0.2686389684677124, + "learning_rate": 1.477556968199707e-07, + "loss": 0.1704, + "step": 6542 + }, + { + "epoch": 1.7410856838744013, + "grad_norm": 0.2751832604408264, + "learning_rate": 1.4774085791850155e-07, + "loss": 0.1898, + "step": 6543 + }, + { + "epoch": 1.7413517828632252, + "grad_norm": 0.3289005756378174, + "learning_rate": 1.4772601765536616e-07, + "loss": 0.2016, + "step": 6544 + }, + { + "epoch": 1.741617881852049, + "grad_norm": 0.29387667775154114, + "learning_rate": 1.4771117603098777e-07, + "loss": 0.1684, + "step": 6545 + }, + { + "epoch": 1.7418839808408728, + "grad_norm": 0.2802862524986267, + "learning_rate": 1.476963330457897e-07, + "loss": 0.1922, + "step": 6546 + }, + { + "epoch": 1.7421500798296967, + "grad_norm": 0.2967197299003601, + "learning_rate": 1.4768148870019524e-07, + "loss": 0.206, + "step": 6547 + }, + { + "epoch": 1.7424161788185204, + "grad_norm": 0.35070762038230896, + "learning_rate": 1.4766664299462785e-07, + "loss": 0.1818, + "step": 6548 + }, + { + "epoch": 1.7426822778073443, + "grad_norm": 0.4278595745563507, + "learning_rate": 1.4765179592951093e-07, + "loss": 0.1867, + "step": 6549 + }, + { + "epoch": 1.7429483767961682, + "grad_norm": 0.2908295691013336, + "learning_rate": 1.4763694750526797e-07, + "loss": 0.1902, + "step": 6550 + }, + { + "epoch": 1.743214475784992, + "grad_norm": 0.2829039990901947, + "learning_rate": 1.4762209772232246e-07, + "loss": 0.1885, + "step": 6551 + }, + { + "epoch": 1.743480574773816, + "grad_norm": 0.2634366452693939, + "learning_rate": 1.4760724658109798e-07, + "loss": 0.1914, + "step": 6552 + }, + { + "epoch": 1.7437466737626397, + "grad_norm": 0.3223978281021118, + "learning_rate": 1.4759239408201806e-07, + "loss": 0.1794, + "step": 6553 + }, + { + "epoch": 1.7440127727514634, + "grad_norm": 0.2600601017475128, + "learning_rate": 1.4757754022550632e-07, + "loss": 0.188, + "step": 6554 + }, + { + "epoch": 1.7442788717402875, + "grad_norm": 0.34591948986053467, + "learning_rate": 1.4756268501198647e-07, + "loss": 0.222, + "step": 6555 + }, + { + "epoch": 1.7445449707291112, + "grad_norm": 0.2784506380558014, + "learning_rate": 1.4754782844188222e-07, + "loss": 0.1913, + "step": 6556 + }, + { + "epoch": 1.744811069717935, + "grad_norm": 0.2705960273742676, + "learning_rate": 1.4753297051561725e-07, + "loss": 0.1951, + "step": 6557 + }, + { + "epoch": 1.745077168706759, + "grad_norm": 0.2953375279903412, + "learning_rate": 1.4751811123361536e-07, + "loss": 0.1849, + "step": 6558 + }, + { + "epoch": 1.7453432676955827, + "grad_norm": 0.2682315707206726, + "learning_rate": 1.475032505963004e-07, + "loss": 0.166, + "step": 6559 + }, + { + "epoch": 1.7456093666844066, + "grad_norm": 0.28446999192237854, + "learning_rate": 1.4748838860409618e-07, + "loss": 0.1927, + "step": 6560 + }, + { + "epoch": 1.7458754656732305, + "grad_norm": 0.4641436040401459, + "learning_rate": 1.474735252574266e-07, + "loss": 0.235, + "step": 6561 + }, + { + "epoch": 1.7461415646620542, + "grad_norm": 0.26314106583595276, + "learning_rate": 1.4745866055671567e-07, + "loss": 0.1725, + "step": 6562 + }, + { + "epoch": 1.746407663650878, + "grad_norm": 0.48643654584884644, + "learning_rate": 1.4744379450238729e-07, + "loss": 0.1951, + "step": 6563 + }, + { + "epoch": 1.746673762639702, + "grad_norm": 0.36449411511421204, + "learning_rate": 1.4742892709486546e-07, + "loss": 0.2003, + "step": 6564 + }, + { + "epoch": 1.7469398616285257, + "grad_norm": 0.379000723361969, + "learning_rate": 1.4741405833457423e-07, + "loss": 0.1981, + "step": 6565 + }, + { + "epoch": 1.7472059606173498, + "grad_norm": 0.3180731236934662, + "learning_rate": 1.4739918822193772e-07, + "loss": 0.1825, + "step": 6566 + }, + { + "epoch": 1.7474720596061735, + "grad_norm": 0.39213669300079346, + "learning_rate": 1.4738431675738007e-07, + "loss": 0.1885, + "step": 6567 + }, + { + "epoch": 1.7477381585949974, + "grad_norm": 0.4120500385761261, + "learning_rate": 1.473694439413254e-07, + "loss": 0.2206, + "step": 6568 + }, + { + "epoch": 1.7480042575838213, + "grad_norm": 0.26274192333221436, + "learning_rate": 1.4735456977419792e-07, + "loss": 0.1748, + "step": 6569 + }, + { + "epoch": 1.748270356572645, + "grad_norm": 0.385774701833725, + "learning_rate": 1.4733969425642188e-07, + "loss": 0.193, + "step": 6570 + }, + { + "epoch": 1.7485364555614689, + "grad_norm": 0.38263529539108276, + "learning_rate": 1.4732481738842157e-07, + "loss": 0.1944, + "step": 6571 + }, + { + "epoch": 1.7488025545502928, + "grad_norm": 0.33118608593940735, + "learning_rate": 1.4730993917062127e-07, + "loss": 0.1868, + "step": 6572 + }, + { + "epoch": 1.7490686535391164, + "grad_norm": 0.2767537534236908, + "learning_rate": 1.472950596034454e-07, + "loss": 0.2043, + "step": 6573 + }, + { + "epoch": 1.7493347525279404, + "grad_norm": 0.26870197057724, + "learning_rate": 1.4728017868731834e-07, + "loss": 0.1742, + "step": 6574 + }, + { + "epoch": 1.7496008515167643, + "grad_norm": 0.4336867332458496, + "learning_rate": 1.4726529642266444e-07, + "loss": 0.2147, + "step": 6575 + }, + { + "epoch": 1.749866950505588, + "grad_norm": 0.3129862844944, + "learning_rate": 1.4725041280990827e-07, + "loss": 0.1902, + "step": 6576 + }, + { + "epoch": 1.750133049494412, + "grad_norm": 0.28309670090675354, + "learning_rate": 1.4723552784947432e-07, + "loss": 0.197, + "step": 6577 + }, + { + "epoch": 1.7503991484832357, + "grad_norm": 0.3819194734096527, + "learning_rate": 1.4722064154178714e-07, + "loss": 0.1914, + "step": 6578 + }, + { + "epoch": 1.7506652474720596, + "grad_norm": 0.36833426356315613, + "learning_rate": 1.472057538872713e-07, + "loss": 0.2114, + "step": 6579 + }, + { + "epoch": 1.7509313464608836, + "grad_norm": 0.2664751410484314, + "learning_rate": 1.471908648863514e-07, + "loss": 0.1853, + "step": 6580 + }, + { + "epoch": 1.7511974454497072, + "grad_norm": 0.4369317889213562, + "learning_rate": 1.4717597453945215e-07, + "loss": 0.1988, + "step": 6581 + }, + { + "epoch": 1.7514635444385311, + "grad_norm": 0.3204042911529541, + "learning_rate": 1.4716108284699828e-07, + "loss": 0.1838, + "step": 6582 + }, + { + "epoch": 1.751729643427355, + "grad_norm": 0.27393779158592224, + "learning_rate": 1.4714618980941445e-07, + "loss": 0.179, + "step": 6583 + }, + { + "epoch": 1.7519957424161787, + "grad_norm": 0.3336947560310364, + "learning_rate": 1.471312954271255e-07, + "loss": 0.1976, + "step": 6584 + }, + { + "epoch": 1.7522618414050026, + "grad_norm": 0.25890135765075684, + "learning_rate": 1.4711639970055622e-07, + "loss": 0.1862, + "step": 6585 + }, + { + "epoch": 1.7525279403938265, + "grad_norm": 0.27527135610580444, + "learning_rate": 1.471015026301315e-07, + "loss": 0.184, + "step": 6586 + }, + { + "epoch": 1.7527940393826502, + "grad_norm": 0.3076099753379822, + "learning_rate": 1.4708660421627617e-07, + "loss": 0.2015, + "step": 6587 + }, + { + "epoch": 1.7530601383714743, + "grad_norm": 0.2550155520439148, + "learning_rate": 1.4707170445941523e-07, + "loss": 0.1873, + "step": 6588 + }, + { + "epoch": 1.753326237360298, + "grad_norm": 0.4823455512523651, + "learning_rate": 1.4705680335997361e-07, + "loss": 0.1864, + "step": 6589 + }, + { + "epoch": 1.753592336349122, + "grad_norm": 0.3215424120426178, + "learning_rate": 1.4704190091837634e-07, + "loss": 0.1924, + "step": 6590 + }, + { + "epoch": 1.7538584353379458, + "grad_norm": 0.9199762344360352, + "learning_rate": 1.4702699713504847e-07, + "loss": 0.1824, + "step": 6591 + }, + { + "epoch": 1.7541245343267695, + "grad_norm": 0.25311025977134705, + "learning_rate": 1.4701209201041507e-07, + "loss": 0.1763, + "step": 6592 + }, + { + "epoch": 1.7543906333155934, + "grad_norm": 0.27077797055244446, + "learning_rate": 1.4699718554490128e-07, + "loss": 0.1984, + "step": 6593 + }, + { + "epoch": 1.7546567323044173, + "grad_norm": 0.25177958607673645, + "learning_rate": 1.469822777389322e-07, + "loss": 0.1789, + "step": 6594 + }, + { + "epoch": 1.754922831293241, + "grad_norm": 0.2705676257610321, + "learning_rate": 1.4696736859293315e-07, + "loss": 0.1779, + "step": 6595 + }, + { + "epoch": 1.755188930282065, + "grad_norm": 0.31384414434432983, + "learning_rate": 1.469524581073293e-07, + "loss": 0.1764, + "step": 6596 + }, + { + "epoch": 1.7554550292708888, + "grad_norm": 0.2743685245513916, + "learning_rate": 1.4693754628254593e-07, + "loss": 0.1929, + "step": 6597 + }, + { + "epoch": 1.7557211282597125, + "grad_norm": 0.3139224946498871, + "learning_rate": 1.4692263311900833e-07, + "loss": 0.1916, + "step": 6598 + }, + { + "epoch": 1.7559872272485366, + "grad_norm": 0.2694171071052551, + "learning_rate": 1.469077186171419e-07, + "loss": 0.1837, + "step": 6599 + }, + { + "epoch": 1.7562533262373603, + "grad_norm": 0.28658825159072876, + "learning_rate": 1.4689280277737204e-07, + "loss": 0.2117, + "step": 6600 + }, + { + "epoch": 1.756519425226184, + "grad_norm": 0.26935875415802, + "learning_rate": 1.4687788560012412e-07, + "loss": 0.1905, + "step": 6601 + }, + { + "epoch": 1.756785524215008, + "grad_norm": 0.47827181220054626, + "learning_rate": 1.4686296708582368e-07, + "loss": 0.2062, + "step": 6602 + }, + { + "epoch": 1.7570516232038318, + "grad_norm": 0.29131531715393066, + "learning_rate": 1.4684804723489614e-07, + "loss": 0.1837, + "step": 6603 + }, + { + "epoch": 1.7573177221926557, + "grad_norm": 0.2559327483177185, + "learning_rate": 1.4683312604776714e-07, + "loss": 0.1627, + "step": 6604 + }, + { + "epoch": 1.7575838211814796, + "grad_norm": 0.34736624360084534, + "learning_rate": 1.4681820352486217e-07, + "loss": 0.1853, + "step": 6605 + }, + { + "epoch": 1.7578499201703033, + "grad_norm": 0.2683543860912323, + "learning_rate": 1.4680327966660694e-07, + "loss": 0.1839, + "step": 6606 + }, + { + "epoch": 1.7581160191591272, + "grad_norm": 0.280998170375824, + "learning_rate": 1.4678835447342703e-07, + "loss": 0.1696, + "step": 6607 + }, + { + "epoch": 1.758382118147951, + "grad_norm": 0.2645224630832672, + "learning_rate": 1.4677342794574815e-07, + "loss": 0.1931, + "step": 6608 + }, + { + "epoch": 1.7586482171367748, + "grad_norm": 0.35598355531692505, + "learning_rate": 1.4675850008399614e-07, + "loss": 0.1832, + "step": 6609 + }, + { + "epoch": 1.7589143161255987, + "grad_norm": 0.2625458836555481, + "learning_rate": 1.4674357088859662e-07, + "loss": 0.1855, + "step": 6610 + }, + { + "epoch": 1.7591804151144226, + "grad_norm": 0.2932438552379608, + "learning_rate": 1.467286403599755e-07, + "loss": 0.1972, + "step": 6611 + }, + { + "epoch": 1.7594465141032463, + "grad_norm": 0.2841866910457611, + "learning_rate": 1.4671370849855856e-07, + "loss": 0.1927, + "step": 6612 + }, + { + "epoch": 1.7597126130920704, + "grad_norm": 0.37058135867118835, + "learning_rate": 1.4669877530477176e-07, + "loss": 0.1824, + "step": 6613 + }, + { + "epoch": 1.759978712080894, + "grad_norm": 0.3943903148174286, + "learning_rate": 1.4668384077904102e-07, + "loss": 0.1814, + "step": 6614 + }, + { + "epoch": 1.760244811069718, + "grad_norm": 0.26963093876838684, + "learning_rate": 1.4666890492179223e-07, + "loss": 0.1865, + "step": 6615 + }, + { + "epoch": 1.7605109100585419, + "grad_norm": 0.2799406945705414, + "learning_rate": 1.4665396773345144e-07, + "loss": 0.1758, + "step": 6616 + }, + { + "epoch": 1.7607770090473656, + "grad_norm": 0.3411124646663666, + "learning_rate": 1.466390292144447e-07, + "loss": 0.195, + "step": 6617 + }, + { + "epoch": 1.7610431080361895, + "grad_norm": 0.2600868046283722, + "learning_rate": 1.4662408936519808e-07, + "loss": 0.183, + "step": 6618 + }, + { + "epoch": 1.7613092070250134, + "grad_norm": 0.39670529961586, + "learning_rate": 1.4660914818613765e-07, + "loss": 0.1993, + "step": 6619 + }, + { + "epoch": 1.761575306013837, + "grad_norm": 0.2692319452762604, + "learning_rate": 1.4659420567768965e-07, + "loss": 0.1951, + "step": 6620 + }, + { + "epoch": 1.761841405002661, + "grad_norm": 0.28405481576919556, + "learning_rate": 1.465792618402802e-07, + "loss": 0.1917, + "step": 6621 + }, + { + "epoch": 1.7621075039914849, + "grad_norm": 0.23974347114562988, + "learning_rate": 1.4656431667433555e-07, + "loss": 0.163, + "step": 6622 + }, + { + "epoch": 1.7623736029803085, + "grad_norm": 0.27200227975845337, + "learning_rate": 1.4654937018028193e-07, + "loss": 0.1866, + "step": 6623 + }, + { + "epoch": 1.7626397019691327, + "grad_norm": 0.2797248959541321, + "learning_rate": 1.4653442235854571e-07, + "loss": 0.1736, + "step": 6624 + }, + { + "epoch": 1.7629058009579563, + "grad_norm": 0.2677887976169586, + "learning_rate": 1.465194732095532e-07, + "loss": 0.1948, + "step": 6625 + }, + { + "epoch": 1.7631718999467803, + "grad_norm": 0.275398850440979, + "learning_rate": 1.465045227337308e-07, + "loss": 0.1753, + "step": 6626 + }, + { + "epoch": 1.7634379989356042, + "grad_norm": 0.3696049153804779, + "learning_rate": 1.4648957093150488e-07, + "loss": 0.1988, + "step": 6627 + }, + { + "epoch": 1.7637040979244278, + "grad_norm": 0.2779494524002075, + "learning_rate": 1.4647461780330194e-07, + "loss": 0.209, + "step": 6628 + }, + { + "epoch": 1.7639701969132517, + "grad_norm": 0.2984350621700287, + "learning_rate": 1.4645966334954847e-07, + "loss": 0.1858, + "step": 6629 + }, + { + "epoch": 1.7642362959020756, + "grad_norm": 0.3335179090499878, + "learning_rate": 1.4644470757067096e-07, + "loss": 0.1839, + "step": 6630 + }, + { + "epoch": 1.7645023948908993, + "grad_norm": 0.5267853736877441, + "learning_rate": 1.4642975046709602e-07, + "loss": 0.2145, + "step": 6631 + }, + { + "epoch": 1.7647684938797232, + "grad_norm": 0.2618173360824585, + "learning_rate": 1.4641479203925025e-07, + "loss": 0.1781, + "step": 6632 + }, + { + "epoch": 1.7650345928685471, + "grad_norm": 0.35260099172592163, + "learning_rate": 1.463998322875603e-07, + "loss": 0.1955, + "step": 6633 + }, + { + "epoch": 1.7653006918573708, + "grad_norm": 0.36177971959114075, + "learning_rate": 1.463848712124528e-07, + "loss": 0.1912, + "step": 6634 + }, + { + "epoch": 1.765566790846195, + "grad_norm": 0.2496425211429596, + "learning_rate": 1.463699088143546e-07, + "loss": 0.1812, + "step": 6635 + }, + { + "epoch": 1.7658328898350186, + "grad_norm": 0.5132352113723755, + "learning_rate": 1.4635494509369233e-07, + "loss": 0.198, + "step": 6636 + }, + { + "epoch": 1.7660989888238423, + "grad_norm": 0.3702837824821472, + "learning_rate": 1.4633998005089284e-07, + "loss": 0.2072, + "step": 6637 + }, + { + "epoch": 1.7663650878126664, + "grad_norm": 0.3243215084075928, + "learning_rate": 1.463250136863829e-07, + "loss": 0.1806, + "step": 6638 + }, + { + "epoch": 1.7666311868014901, + "grad_norm": 0.2767656743526459, + "learning_rate": 1.4631004600058948e-07, + "loss": 0.1748, + "step": 6639 + }, + { + "epoch": 1.766897285790314, + "grad_norm": 0.28326064348220825, + "learning_rate": 1.4629507699393944e-07, + "loss": 0.1771, + "step": 6640 + }, + { + "epoch": 1.767163384779138, + "grad_norm": 0.3221464157104492, + "learning_rate": 1.462801066668597e-07, + "loss": 0.1983, + "step": 6641 + }, + { + "epoch": 1.7674294837679616, + "grad_norm": 0.27444130182266235, + "learning_rate": 1.462651350197773e-07, + "loss": 0.1809, + "step": 6642 + }, + { + "epoch": 1.7676955827567855, + "grad_norm": 0.2635551691055298, + "learning_rate": 1.4625016205311922e-07, + "loss": 0.1757, + "step": 6643 + }, + { + "epoch": 1.7679616817456094, + "grad_norm": 0.33674880862236023, + "learning_rate": 1.4623518776731256e-07, + "loss": 0.2062, + "step": 6644 + }, + { + "epoch": 1.768227780734433, + "grad_norm": 0.26229459047317505, + "learning_rate": 1.4622021216278435e-07, + "loss": 0.1654, + "step": 6645 + }, + { + "epoch": 1.768493879723257, + "grad_norm": 0.26150205731391907, + "learning_rate": 1.462052352399618e-07, + "loss": 0.1793, + "step": 6646 + }, + { + "epoch": 1.768759978712081, + "grad_norm": 0.24726566672325134, + "learning_rate": 1.4619025699927202e-07, + "loss": 0.1612, + "step": 6647 + }, + { + "epoch": 1.7690260777009046, + "grad_norm": 0.3249672055244446, + "learning_rate": 1.4617527744114227e-07, + "loss": 0.188, + "step": 6648 + }, + { + "epoch": 1.7692921766897287, + "grad_norm": 0.28735899925231934, + "learning_rate": 1.4616029656599972e-07, + "loss": 0.2001, + "step": 6649 + }, + { + "epoch": 1.7695582756785524, + "grad_norm": 0.32060903310775757, + "learning_rate": 1.4614531437427177e-07, + "loss": 0.1846, + "step": 6650 + }, + { + "epoch": 1.7698243746673763, + "grad_norm": 0.3001398742198944, + "learning_rate": 1.4613033086638567e-07, + "loss": 0.1893, + "step": 6651 + }, + { + "epoch": 1.7700904736562002, + "grad_norm": 0.25612351298332214, + "learning_rate": 1.4611534604276878e-07, + "loss": 0.1722, + "step": 6652 + }, + { + "epoch": 1.7703565726450239, + "grad_norm": 0.3124397397041321, + "learning_rate": 1.4610035990384853e-07, + "loss": 0.1996, + "step": 6653 + }, + { + "epoch": 1.7706226716338478, + "grad_norm": 0.4175869822502136, + "learning_rate": 1.4608537245005232e-07, + "loss": 0.2056, + "step": 6654 + }, + { + "epoch": 1.7708887706226717, + "grad_norm": 0.3613967299461365, + "learning_rate": 1.4607038368180765e-07, + "loss": 0.1919, + "step": 6655 + }, + { + "epoch": 1.7711548696114954, + "grad_norm": 0.29747307300567627, + "learning_rate": 1.46055393599542e-07, + "loss": 0.1904, + "step": 6656 + }, + { + "epoch": 1.7714209686003193, + "grad_norm": 0.2719172239303589, + "learning_rate": 1.4604040220368295e-07, + "loss": 0.1639, + "step": 6657 + }, + { + "epoch": 1.7716870675891432, + "grad_norm": 0.2724403440952301, + "learning_rate": 1.4602540949465807e-07, + "loss": 0.1956, + "step": 6658 + }, + { + "epoch": 1.7719531665779669, + "grad_norm": 0.354735791683197, + "learning_rate": 1.4601041547289497e-07, + "loss": 0.2185, + "step": 6659 + }, + { + "epoch": 1.772219265566791, + "grad_norm": 0.3903568983078003, + "learning_rate": 1.4599542013882133e-07, + "loss": 0.1946, + "step": 6660 + }, + { + "epoch": 1.7724853645556147, + "grad_norm": 0.42717376351356506, + "learning_rate": 1.4598042349286485e-07, + "loss": 0.1979, + "step": 6661 + }, + { + "epoch": 1.7727514635444386, + "grad_norm": 0.31366658210754395, + "learning_rate": 1.4596542553545326e-07, + "loss": 0.1989, + "step": 6662 + }, + { + "epoch": 1.7730175625332625, + "grad_norm": 0.3306035101413727, + "learning_rate": 1.459504262670143e-07, + "loss": 0.1778, + "step": 6663 + }, + { + "epoch": 1.7732836615220862, + "grad_norm": 0.2529279589653015, + "learning_rate": 1.459354256879758e-07, + "loss": 0.1563, + "step": 6664 + }, + { + "epoch": 1.77354976051091, + "grad_norm": 0.3830258250236511, + "learning_rate": 1.4592042379876565e-07, + "loss": 0.2006, + "step": 6665 + }, + { + "epoch": 1.773815859499734, + "grad_norm": 0.37390679121017456, + "learning_rate": 1.459054205998117e-07, + "loss": 0.1954, + "step": 6666 + }, + { + "epoch": 1.7740819584885577, + "grad_norm": 0.3775559961795807, + "learning_rate": 1.4589041609154183e-07, + "loss": 0.2019, + "step": 6667 + }, + { + "epoch": 1.7743480574773816, + "grad_norm": 0.3115694522857666, + "learning_rate": 1.4587541027438405e-07, + "loss": 0.1911, + "step": 6668 + }, + { + "epoch": 1.7746141564662055, + "grad_norm": 0.2553480565547943, + "learning_rate": 1.4586040314876637e-07, + "loss": 0.1811, + "step": 6669 + }, + { + "epoch": 1.7748802554550291, + "grad_norm": 0.49981993436813354, + "learning_rate": 1.4584539471511677e-07, + "loss": 0.195, + "step": 6670 + }, + { + "epoch": 1.7751463544438533, + "grad_norm": 0.3575712740421295, + "learning_rate": 1.4583038497386337e-07, + "loss": 0.1865, + "step": 6671 + }, + { + "epoch": 1.775412453432677, + "grad_norm": 0.4988846480846405, + "learning_rate": 1.4581537392543428e-07, + "loss": 0.1864, + "step": 6672 + }, + { + "epoch": 1.7756785524215006, + "grad_norm": 0.3993239402770996, + "learning_rate": 1.458003615702576e-07, + "loss": 0.2358, + "step": 6673 + }, + { + "epoch": 1.7759446514103248, + "grad_norm": 0.3062998950481415, + "learning_rate": 1.4578534790876153e-07, + "loss": 0.1895, + "step": 6674 + }, + { + "epoch": 1.7762107503991484, + "grad_norm": 0.3320925831794739, + "learning_rate": 1.457703329413743e-07, + "loss": 0.1926, + "step": 6675 + }, + { + "epoch": 1.7764768493879723, + "grad_norm": 0.5152183175086975, + "learning_rate": 1.4575531666852417e-07, + "loss": 0.1983, + "step": 6676 + }, + { + "epoch": 1.7767429483767962, + "grad_norm": 0.26760998368263245, + "learning_rate": 1.4574029909063944e-07, + "loss": 0.1738, + "step": 6677 + }, + { + "epoch": 1.77700904736562, + "grad_norm": 0.29532596468925476, + "learning_rate": 1.4572528020814842e-07, + "loss": 0.2045, + "step": 6678 + }, + { + "epoch": 1.7772751463544438, + "grad_norm": 0.41477736830711365, + "learning_rate": 1.457102600214795e-07, + "loss": 0.2143, + "step": 6679 + }, + { + "epoch": 1.7775412453432677, + "grad_norm": 0.27220967411994934, + "learning_rate": 1.4569523853106107e-07, + "loss": 0.1732, + "step": 6680 + }, + { + "epoch": 1.7778073443320914, + "grad_norm": 0.29109129309654236, + "learning_rate": 1.4568021573732156e-07, + "loss": 0.1926, + "step": 6681 + }, + { + "epoch": 1.7780734433209153, + "grad_norm": 0.30330485105514526, + "learning_rate": 1.456651916406895e-07, + "loss": 0.1858, + "step": 6682 + }, + { + "epoch": 1.7783395423097392, + "grad_norm": 0.2979039251804352, + "learning_rate": 1.456501662415934e-07, + "loss": 0.2082, + "step": 6683 + }, + { + "epoch": 1.778605641298563, + "grad_norm": 0.3564295172691345, + "learning_rate": 1.4563513954046173e-07, + "loss": 0.1898, + "step": 6684 + }, + { + "epoch": 1.778871740287387, + "grad_norm": 0.2700052857398987, + "learning_rate": 1.4562011153772318e-07, + "loss": 0.1861, + "step": 6685 + }, + { + "epoch": 1.7791378392762107, + "grad_norm": 0.24499459564685822, + "learning_rate": 1.456050822338063e-07, + "loss": 0.1681, + "step": 6686 + }, + { + "epoch": 1.7794039382650346, + "grad_norm": 0.3588707447052002, + "learning_rate": 1.4559005162913986e-07, + "loss": 0.1839, + "step": 6687 + }, + { + "epoch": 1.7796700372538585, + "grad_norm": 0.3667908310890198, + "learning_rate": 1.455750197241525e-07, + "loss": 0.1652, + "step": 6688 + }, + { + "epoch": 1.7799361362426822, + "grad_norm": 0.2942008078098297, + "learning_rate": 1.4555998651927296e-07, + "loss": 0.1834, + "step": 6689 + }, + { + "epoch": 1.7802022352315061, + "grad_norm": 0.28781789541244507, + "learning_rate": 1.4554495201493e-07, + "loss": 0.199, + "step": 6690 + }, + { + "epoch": 1.78046833422033, + "grad_norm": 0.27220961451530457, + "learning_rate": 1.4552991621155245e-07, + "loss": 0.1918, + "step": 6691 + }, + { + "epoch": 1.7807344332091537, + "grad_norm": 0.3081338703632355, + "learning_rate": 1.455148791095692e-07, + "loss": 0.169, + "step": 6692 + }, + { + "epoch": 1.7810005321979776, + "grad_norm": 0.35641586780548096, + "learning_rate": 1.454998407094091e-07, + "loss": 0.1966, + "step": 6693 + }, + { + "epoch": 1.7812666311868015, + "grad_norm": 0.3830374479293823, + "learning_rate": 1.4548480101150107e-07, + "loss": 0.2032, + "step": 6694 + }, + { + "epoch": 1.7815327301756252, + "grad_norm": 0.32930490374565125, + "learning_rate": 1.454697600162741e-07, + "loss": 0.1666, + "step": 6695 + }, + { + "epoch": 1.7817988291644493, + "grad_norm": 0.3319058120250702, + "learning_rate": 1.4545471772415717e-07, + "loss": 0.1733, + "step": 6696 + }, + { + "epoch": 1.782064928153273, + "grad_norm": 0.2845511734485626, + "learning_rate": 1.4543967413557932e-07, + "loss": 0.1816, + "step": 6697 + }, + { + "epoch": 1.782331027142097, + "grad_norm": 0.34192442893981934, + "learning_rate": 1.4542462925096964e-07, + "loss": 0.1755, + "step": 6698 + }, + { + "epoch": 1.7825971261309208, + "grad_norm": 0.24751584231853485, + "learning_rate": 1.454095830707572e-07, + "loss": 0.1766, + "step": 6699 + }, + { + "epoch": 1.7828632251197445, + "grad_norm": 0.5070513486862183, + "learning_rate": 1.453945355953712e-07, + "loss": 0.1925, + "step": 6700 + }, + { + "epoch": 1.7831293241085684, + "grad_norm": 0.3225368559360504, + "learning_rate": 1.4537948682524076e-07, + "loss": 0.2006, + "step": 6701 + }, + { + "epoch": 1.7833954230973923, + "grad_norm": 0.26242807507514954, + "learning_rate": 1.4536443676079517e-07, + "loss": 0.1758, + "step": 6702 + }, + { + "epoch": 1.783661522086216, + "grad_norm": 0.2777569591999054, + "learning_rate": 1.4534938540246366e-07, + "loss": 0.1948, + "step": 6703 + }, + { + "epoch": 1.7839276210750399, + "grad_norm": 0.32016855478286743, + "learning_rate": 1.453343327506755e-07, + "loss": 0.1854, + "step": 6704 + }, + { + "epoch": 1.7841937200638638, + "grad_norm": 0.2691032886505127, + "learning_rate": 1.453192788058601e-07, + "loss": 0.1762, + "step": 6705 + }, + { + "epoch": 1.7844598190526875, + "grad_norm": 0.2690059244632721, + "learning_rate": 1.4530422356844673e-07, + "loss": 0.1881, + "step": 6706 + }, + { + "epoch": 1.7847259180415116, + "grad_norm": 0.2999396026134491, + "learning_rate": 1.4528916703886483e-07, + "loss": 0.1914, + "step": 6707 + }, + { + "epoch": 1.7849920170303353, + "grad_norm": 0.24789397418498993, + "learning_rate": 1.4527410921754387e-07, + "loss": 0.1754, + "step": 6708 + }, + { + "epoch": 1.7852581160191592, + "grad_norm": 0.2872558832168579, + "learning_rate": 1.4525905010491333e-07, + "loss": 0.1939, + "step": 6709 + }, + { + "epoch": 1.785524215007983, + "grad_norm": 0.2952738106250763, + "learning_rate": 1.452439897014027e-07, + "loss": 0.2041, + "step": 6710 + }, + { + "epoch": 1.7857903139968068, + "grad_norm": 0.2523152232170105, + "learning_rate": 1.4522892800744154e-07, + "loss": 0.1656, + "step": 6711 + }, + { + "epoch": 1.7860564129856307, + "grad_norm": 0.2745237648487091, + "learning_rate": 1.4521386502345946e-07, + "loss": 0.187, + "step": 6712 + }, + { + "epoch": 1.7863225119744546, + "grad_norm": 0.2655170261859894, + "learning_rate": 1.4519880074988607e-07, + "loss": 0.1836, + "step": 6713 + }, + { + "epoch": 1.7865886109632783, + "grad_norm": 0.35862547159194946, + "learning_rate": 1.4518373518715098e-07, + "loss": 0.1914, + "step": 6714 + }, + { + "epoch": 1.7868547099521022, + "grad_norm": 0.28318628668785095, + "learning_rate": 1.45168668335684e-07, + "loss": 0.1908, + "step": 6715 + }, + { + "epoch": 1.787120808940926, + "grad_norm": 0.26008903980255127, + "learning_rate": 1.451536001959148e-07, + "loss": 0.178, + "step": 6716 + }, + { + "epoch": 1.7873869079297497, + "grad_norm": 0.26387888193130493, + "learning_rate": 1.4513853076827315e-07, + "loss": 0.1835, + "step": 6717 + }, + { + "epoch": 1.7876530069185739, + "grad_norm": 0.24933458864688873, + "learning_rate": 1.451234600531889e-07, + "loss": 0.1792, + "step": 6718 + }, + { + "epoch": 1.7879191059073976, + "grad_norm": 0.27063265442848206, + "learning_rate": 1.4510838805109187e-07, + "loss": 0.186, + "step": 6719 + }, + { + "epoch": 1.7881852048962212, + "grad_norm": 0.3364933729171753, + "learning_rate": 1.4509331476241196e-07, + "loss": 0.1601, + "step": 6720 + }, + { + "epoch": 1.7884513038850454, + "grad_norm": 0.5911145806312561, + "learning_rate": 1.4507824018757904e-07, + "loss": 0.201, + "step": 6721 + }, + { + "epoch": 1.788717402873869, + "grad_norm": 0.2837567627429962, + "learning_rate": 1.4506316432702313e-07, + "loss": 0.1994, + "step": 6722 + }, + { + "epoch": 1.788983501862693, + "grad_norm": 0.28728267550468445, + "learning_rate": 1.450480871811742e-07, + "loss": 0.1821, + "step": 6723 + }, + { + "epoch": 1.7892496008515169, + "grad_norm": 0.4070538580417633, + "learning_rate": 1.4503300875046227e-07, + "loss": 0.1686, + "step": 6724 + }, + { + "epoch": 1.7895156998403405, + "grad_norm": 0.36447757482528687, + "learning_rate": 1.4501792903531743e-07, + "loss": 0.1822, + "step": 6725 + }, + { + "epoch": 1.7897817988291644, + "grad_norm": 0.2514243423938751, + "learning_rate": 1.4500284803616976e-07, + "loss": 0.1646, + "step": 6726 + }, + { + "epoch": 1.7900478978179883, + "grad_norm": 0.2725420594215393, + "learning_rate": 1.4498776575344943e-07, + "loss": 0.1812, + "step": 6727 + }, + { + "epoch": 1.790313996806812, + "grad_norm": 0.3251517713069916, + "learning_rate": 1.4497268218758659e-07, + "loss": 0.2017, + "step": 6728 + }, + { + "epoch": 1.790580095795636, + "grad_norm": 0.34265318512916565, + "learning_rate": 1.4495759733901148e-07, + "loss": 0.2051, + "step": 6729 + }, + { + "epoch": 1.7908461947844598, + "grad_norm": 0.2632901668548584, + "learning_rate": 1.4494251120815433e-07, + "loss": 0.1743, + "step": 6730 + }, + { + "epoch": 1.7911122937732835, + "grad_norm": 0.3951264023780823, + "learning_rate": 1.4492742379544542e-07, + "loss": 0.1948, + "step": 6731 + }, + { + "epoch": 1.7913783927621076, + "grad_norm": 0.26985493302345276, + "learning_rate": 1.449123351013151e-07, + "loss": 0.1806, + "step": 6732 + }, + { + "epoch": 1.7916444917509313, + "grad_norm": 0.2936021685600281, + "learning_rate": 1.4489724512619371e-07, + "loss": 0.1911, + "step": 6733 + }, + { + "epoch": 1.7919105907397552, + "grad_norm": 0.32856693863868713, + "learning_rate": 1.4488215387051168e-07, + "loss": 0.1867, + "step": 6734 + }, + { + "epoch": 1.7921766897285791, + "grad_norm": 0.40430018305778503, + "learning_rate": 1.4486706133469936e-07, + "loss": 0.2108, + "step": 6735 + }, + { + "epoch": 1.7924427887174028, + "grad_norm": 0.3537963926792145, + "learning_rate": 1.4485196751918733e-07, + "loss": 0.1975, + "step": 6736 + }, + { + "epoch": 1.7927088877062267, + "grad_norm": 0.4880559742450714, + "learning_rate": 1.4483687242440604e-07, + "loss": 0.2082, + "step": 6737 + }, + { + "epoch": 1.7929749866950506, + "grad_norm": 0.2790803611278534, + "learning_rate": 1.4482177605078604e-07, + "loss": 0.1788, + "step": 6738 + }, + { + "epoch": 1.7932410856838743, + "grad_norm": 0.26696228981018066, + "learning_rate": 1.4480667839875784e-07, + "loss": 0.1796, + "step": 6739 + }, + { + "epoch": 1.7935071846726982, + "grad_norm": 0.2924599349498749, + "learning_rate": 1.4479157946875216e-07, + "loss": 0.2084, + "step": 6740 + }, + { + "epoch": 1.7937732836615221, + "grad_norm": 0.2810971140861511, + "learning_rate": 1.4477647926119962e-07, + "loss": 0.1642, + "step": 6741 + }, + { + "epoch": 1.7940393826503458, + "grad_norm": 0.27385181188583374, + "learning_rate": 1.447613777765309e-07, + "loss": 0.1878, + "step": 6742 + }, + { + "epoch": 1.79430548163917, + "grad_norm": 0.32169800996780396, + "learning_rate": 1.4474627501517672e-07, + "loss": 0.1924, + "step": 6743 + }, + { + "epoch": 1.7945715806279936, + "grad_norm": 0.2768717110157013, + "learning_rate": 1.4473117097756785e-07, + "loss": 0.2007, + "step": 6744 + }, + { + "epoch": 1.7948376796168175, + "grad_norm": 0.2516356110572815, + "learning_rate": 1.447160656641351e-07, + "loss": 0.1605, + "step": 6745 + }, + { + "epoch": 1.7951037786056414, + "grad_norm": 0.26475441455841064, + "learning_rate": 1.4470095907530925e-07, + "loss": 0.1831, + "step": 6746 + }, + { + "epoch": 1.795369877594465, + "grad_norm": 0.3622029423713684, + "learning_rate": 1.4468585121152127e-07, + "loss": 0.1908, + "step": 6747 + }, + { + "epoch": 1.795635976583289, + "grad_norm": 0.31068867444992065, + "learning_rate": 1.4467074207320197e-07, + "loss": 0.2009, + "step": 6748 + }, + { + "epoch": 1.795902075572113, + "grad_norm": 0.2808581292629242, + "learning_rate": 1.4465563166078235e-07, + "loss": 0.192, + "step": 6749 + }, + { + "epoch": 1.7961681745609366, + "grad_norm": 0.3945479393005371, + "learning_rate": 1.4464051997469334e-07, + "loss": 0.1933, + "step": 6750 + }, + { + "epoch": 1.7964342735497605, + "grad_norm": 0.2886165678501129, + "learning_rate": 1.44625407015366e-07, + "loss": 0.1919, + "step": 6751 + }, + { + "epoch": 1.7967003725385844, + "grad_norm": 0.3888639211654663, + "learning_rate": 1.4461029278323138e-07, + "loss": 0.2017, + "step": 6752 + }, + { + "epoch": 1.796966471527408, + "grad_norm": 0.5568135380744934, + "learning_rate": 1.4459517727872052e-07, + "loss": 0.1746, + "step": 6753 + }, + { + "epoch": 1.7972325705162322, + "grad_norm": 0.4898662567138672, + "learning_rate": 1.4458006050226464e-07, + "loss": 0.2193, + "step": 6754 + }, + { + "epoch": 1.7974986695050559, + "grad_norm": 0.27916067838668823, + "learning_rate": 1.4456494245429482e-07, + "loss": 0.1914, + "step": 6755 + }, + { + "epoch": 1.7977647684938796, + "grad_norm": 0.3918978273868561, + "learning_rate": 1.4454982313524226e-07, + "loss": 0.1899, + "step": 6756 + }, + { + "epoch": 1.7980308674827037, + "grad_norm": 0.3459922671318054, + "learning_rate": 1.4453470254553823e-07, + "loss": 0.192, + "step": 6757 + }, + { + "epoch": 1.7982969664715274, + "grad_norm": 0.26022714376449585, + "learning_rate": 1.44519580685614e-07, + "loss": 0.1757, + "step": 6758 + }, + { + "epoch": 1.7985630654603513, + "grad_norm": 0.26924094557762146, + "learning_rate": 1.445044575559008e-07, + "loss": 0.1831, + "step": 6759 + }, + { + "epoch": 1.7988291644491752, + "grad_norm": 0.5758342742919922, + "learning_rate": 1.4448933315683012e-07, + "loss": 0.1875, + "step": 6760 + }, + { + "epoch": 1.7990952634379989, + "grad_norm": 0.3818381726741791, + "learning_rate": 1.4447420748883318e-07, + "loss": 0.1957, + "step": 6761 + }, + { + "epoch": 1.7993613624268228, + "grad_norm": 0.3095133304595947, + "learning_rate": 1.4445908055234154e-07, + "loss": 0.2101, + "step": 6762 + }, + { + "epoch": 1.7996274614156467, + "grad_norm": 0.280236154794693, + "learning_rate": 1.4444395234778652e-07, + "loss": 0.206, + "step": 6763 + }, + { + "epoch": 1.7998935604044703, + "grad_norm": 0.2709401249885559, + "learning_rate": 1.4442882287559968e-07, + "loss": 0.1807, + "step": 6764 + }, + { + "epoch": 1.8001596593932943, + "grad_norm": 0.3613347113132477, + "learning_rate": 1.444136921362125e-07, + "loss": 0.2114, + "step": 6765 + }, + { + "epoch": 1.8004257583821182, + "grad_norm": 0.38656166195869446, + "learning_rate": 1.4439856013005663e-07, + "loss": 0.2168, + "step": 6766 + }, + { + "epoch": 1.8006918573709418, + "grad_norm": 0.3775942921638489, + "learning_rate": 1.4438342685756357e-07, + "loss": 0.1984, + "step": 6767 + }, + { + "epoch": 1.800957956359766, + "grad_norm": 0.3474915027618408, + "learning_rate": 1.4436829231916497e-07, + "loss": 0.1953, + "step": 6768 + }, + { + "epoch": 1.8012240553485896, + "grad_norm": 0.34879758954048157, + "learning_rate": 1.4435315651529253e-07, + "loss": 0.1807, + "step": 6769 + }, + { + "epoch": 1.8014901543374136, + "grad_norm": 0.2838355600833893, + "learning_rate": 1.4433801944637791e-07, + "loss": 0.1773, + "step": 6770 + }, + { + "epoch": 1.8017562533262375, + "grad_norm": 0.26404139399528503, + "learning_rate": 1.443228811128529e-07, + "loss": 0.1726, + "step": 6771 + }, + { + "epoch": 1.8020223523150611, + "grad_norm": 0.33506739139556885, + "learning_rate": 1.4430774151514925e-07, + "loss": 0.2004, + "step": 6772 + }, + { + "epoch": 1.802288451303885, + "grad_norm": 0.29671555757522583, + "learning_rate": 1.442926006536988e-07, + "loss": 0.1815, + "step": 6773 + }, + { + "epoch": 1.802554550292709, + "grad_norm": 0.3456249535083771, + "learning_rate": 1.4427745852893333e-07, + "loss": 0.2036, + "step": 6774 + }, + { + "epoch": 1.8028206492815326, + "grad_norm": 0.2685635983943939, + "learning_rate": 1.4426231514128478e-07, + "loss": 0.1839, + "step": 6775 + }, + { + "epoch": 1.8030867482703565, + "grad_norm": 0.5977779030799866, + "learning_rate": 1.4424717049118508e-07, + "loss": 0.2009, + "step": 6776 + }, + { + "epoch": 1.8033528472591804, + "grad_norm": 0.2844739258289337, + "learning_rate": 1.442320245790661e-07, + "loss": 0.195, + "step": 6777 + }, + { + "epoch": 1.8036189462480041, + "grad_norm": 0.2938966155052185, + "learning_rate": 1.4421687740535996e-07, + "loss": 0.1909, + "step": 6778 + }, + { + "epoch": 1.8038850452368282, + "grad_norm": 0.3353230953216553, + "learning_rate": 1.442017289704986e-07, + "loss": 0.1903, + "step": 6779 + }, + { + "epoch": 1.804151144225652, + "grad_norm": 0.25453513860702515, + "learning_rate": 1.441865792749141e-07, + "loss": 0.1718, + "step": 6780 + }, + { + "epoch": 1.8044172432144758, + "grad_norm": 0.29312801361083984, + "learning_rate": 1.4417142831903858e-07, + "loss": 0.1961, + "step": 6781 + }, + { + "epoch": 1.8046833422032997, + "grad_norm": 0.26960891485214233, + "learning_rate": 1.4415627610330417e-07, + "loss": 0.1886, + "step": 6782 + }, + { + "epoch": 1.8049494411921234, + "grad_norm": 0.27885696291923523, + "learning_rate": 1.44141122628143e-07, + "loss": 0.1942, + "step": 6783 + }, + { + "epoch": 1.8052155401809473, + "grad_norm": 0.2640572786331177, + "learning_rate": 1.4412596789398736e-07, + "loss": 0.1861, + "step": 6784 + }, + { + "epoch": 1.8054816391697712, + "grad_norm": 0.3863358199596405, + "learning_rate": 1.441108119012694e-07, + "loss": 0.1879, + "step": 6785 + }, + { + "epoch": 1.805747738158595, + "grad_norm": 0.28835374116897583, + "learning_rate": 1.4409565465042147e-07, + "loss": 0.1873, + "step": 6786 + }, + { + "epoch": 1.8060138371474188, + "grad_norm": 0.40086960792541504, + "learning_rate": 1.440804961418759e-07, + "loss": 0.1988, + "step": 6787 + }, + { + "epoch": 1.8062799361362427, + "grad_norm": 0.257276713848114, + "learning_rate": 1.4406533637606496e-07, + "loss": 0.1799, + "step": 6788 + }, + { + "epoch": 1.8065460351250664, + "grad_norm": 0.33527353405952454, + "learning_rate": 1.440501753534211e-07, + "loss": 0.1862, + "step": 6789 + }, + { + "epoch": 1.8068121341138905, + "grad_norm": 0.2559032738208771, + "learning_rate": 1.4403501307437672e-07, + "loss": 0.1763, + "step": 6790 + }, + { + "epoch": 1.8070782331027142, + "grad_norm": 0.39955195784568787, + "learning_rate": 1.440198495393643e-07, + "loss": 0.1917, + "step": 6791 + }, + { + "epoch": 1.8073443320915379, + "grad_norm": 0.369741290807724, + "learning_rate": 1.440046847488163e-07, + "loss": 0.1793, + "step": 6792 + }, + { + "epoch": 1.807610431080362, + "grad_norm": 0.25419551134109497, + "learning_rate": 1.4398951870316523e-07, + "loss": 0.1697, + "step": 6793 + }, + { + "epoch": 1.8078765300691857, + "grad_norm": 0.3391941785812378, + "learning_rate": 1.4397435140284375e-07, + "loss": 0.1846, + "step": 6794 + }, + { + "epoch": 1.8081426290580096, + "grad_norm": 0.48397964239120483, + "learning_rate": 1.4395918284828436e-07, + "loss": 0.1858, + "step": 6795 + }, + { + "epoch": 1.8084087280468335, + "grad_norm": 0.2953970730304718, + "learning_rate": 1.439440130399198e-07, + "loss": 0.2056, + "step": 6796 + }, + { + "epoch": 1.8086748270356572, + "grad_norm": 0.2845960557460785, + "learning_rate": 1.4392884197818266e-07, + "loss": 0.1951, + "step": 6797 + }, + { + "epoch": 1.808940926024481, + "grad_norm": 0.2825565040111542, + "learning_rate": 1.439136696635057e-07, + "loss": 0.1755, + "step": 6798 + }, + { + "epoch": 1.809207025013305, + "grad_norm": 0.31135454773902893, + "learning_rate": 1.438984960963216e-07, + "loss": 0.2077, + "step": 6799 + }, + { + "epoch": 1.8094731240021287, + "grad_norm": 0.27008458971977234, + "learning_rate": 1.4388332127706325e-07, + "loss": 0.1863, + "step": 6800 + }, + { + "epoch": 1.8097392229909528, + "grad_norm": 0.2893833518028259, + "learning_rate": 1.4386814520616334e-07, + "loss": 0.1955, + "step": 6801 + }, + { + "epoch": 1.8100053219797765, + "grad_norm": 0.6172904372215271, + "learning_rate": 1.4385296788405482e-07, + "loss": 0.1666, + "step": 6802 + }, + { + "epoch": 1.8102714209686002, + "grad_norm": 0.30946168303489685, + "learning_rate": 1.4383778931117055e-07, + "loss": 0.2169, + "step": 6803 + }, + { + "epoch": 1.8105375199574243, + "grad_norm": 0.26585692167282104, + "learning_rate": 1.4382260948794343e-07, + "loss": 0.2071, + "step": 6804 + }, + { + "epoch": 1.810803618946248, + "grad_norm": 0.3119787871837616, + "learning_rate": 1.4380742841480644e-07, + "loss": 0.1755, + "step": 6805 + }, + { + "epoch": 1.8110697179350719, + "grad_norm": 0.32410797476768494, + "learning_rate": 1.4379224609219256e-07, + "loss": 0.1945, + "step": 6806 + }, + { + "epoch": 1.8113358169238958, + "grad_norm": 0.2837260365486145, + "learning_rate": 1.4377706252053485e-07, + "loss": 0.165, + "step": 6807 + }, + { + "epoch": 1.8116019159127195, + "grad_norm": 0.29710233211517334, + "learning_rate": 1.4376187770026632e-07, + "loss": 0.195, + "step": 6808 + }, + { + "epoch": 1.8118680149015434, + "grad_norm": 0.4956790804862976, + "learning_rate": 1.4374669163182015e-07, + "loss": 0.1907, + "step": 6809 + }, + { + "epoch": 1.8121341138903673, + "grad_norm": 0.4067981541156769, + "learning_rate": 1.4373150431562944e-07, + "loss": 0.2017, + "step": 6810 + }, + { + "epoch": 1.812400212879191, + "grad_norm": 0.26300108432769775, + "learning_rate": 1.4371631575212732e-07, + "loss": 0.1893, + "step": 6811 + }, + { + "epoch": 1.8126663118680149, + "grad_norm": 0.2729385197162628, + "learning_rate": 1.4370112594174706e-07, + "loss": 0.2052, + "step": 6812 + }, + { + "epoch": 1.8129324108568388, + "grad_norm": 0.41724833846092224, + "learning_rate": 1.436859348849219e-07, + "loss": 0.1832, + "step": 6813 + }, + { + "epoch": 1.8131985098456624, + "grad_norm": 0.2593103349208832, + "learning_rate": 1.4367074258208507e-07, + "loss": 0.175, + "step": 6814 + }, + { + "epoch": 1.8134646088344866, + "grad_norm": 0.27738335728645325, + "learning_rate": 1.4365554903366993e-07, + "loss": 0.1988, + "step": 6815 + }, + { + "epoch": 1.8137307078233103, + "grad_norm": 0.3232709467411041, + "learning_rate": 1.4364035424010983e-07, + "loss": 0.194, + "step": 6816 + }, + { + "epoch": 1.8139968068121342, + "grad_norm": 0.2731611430644989, + "learning_rate": 1.4362515820183814e-07, + "loss": 0.1746, + "step": 6817 + }, + { + "epoch": 1.814262905800958, + "grad_norm": 0.2914995551109314, + "learning_rate": 1.4360996091928829e-07, + "loss": 0.2088, + "step": 6818 + }, + { + "epoch": 1.8145290047897817, + "grad_norm": 0.24736511707305908, + "learning_rate": 1.4359476239289374e-07, + "loss": 0.1706, + "step": 6819 + }, + { + "epoch": 1.8147951037786056, + "grad_norm": 0.3569827973842621, + "learning_rate": 1.4357956262308796e-07, + "loss": 0.1804, + "step": 6820 + }, + { + "epoch": 1.8150612027674295, + "grad_norm": 0.2715674936771393, + "learning_rate": 1.435643616103045e-07, + "loss": 0.1896, + "step": 6821 + }, + { + "epoch": 1.8153273017562532, + "grad_norm": 0.3336048424243927, + "learning_rate": 1.4354915935497695e-07, + "loss": 0.1894, + "step": 6822 + }, + { + "epoch": 1.8155934007450771, + "grad_norm": 0.36628276109695435, + "learning_rate": 1.4353395585753884e-07, + "loss": 0.1974, + "step": 6823 + }, + { + "epoch": 1.815859499733901, + "grad_norm": 0.36510804295539856, + "learning_rate": 1.435187511184239e-07, + "loss": 0.1937, + "step": 6824 + }, + { + "epoch": 1.8161255987227247, + "grad_norm": 0.27474358677864075, + "learning_rate": 1.4350354513806569e-07, + "loss": 0.1798, + "step": 6825 + }, + { + "epoch": 1.8163916977115488, + "grad_norm": 0.2548660337924957, + "learning_rate": 1.4348833791689797e-07, + "loss": 0.1794, + "step": 6826 + }, + { + "epoch": 1.8166577967003725, + "grad_norm": 0.31037116050720215, + "learning_rate": 1.434731294553545e-07, + "loss": 0.1947, + "step": 6827 + }, + { + "epoch": 1.8169238956891964, + "grad_norm": 0.2812161445617676, + "learning_rate": 1.4345791975386905e-07, + "loss": 0.1746, + "step": 6828 + }, + { + "epoch": 1.8171899946780203, + "grad_norm": 0.3236081898212433, + "learning_rate": 1.4344270881287536e-07, + "loss": 0.2054, + "step": 6829 + }, + { + "epoch": 1.817456093666844, + "grad_norm": 0.26743069291114807, + "learning_rate": 1.4342749663280739e-07, + "loss": 0.1872, + "step": 6830 + }, + { + "epoch": 1.817722192655668, + "grad_norm": 0.28069034218788147, + "learning_rate": 1.4341228321409897e-07, + "loss": 0.2048, + "step": 6831 + }, + { + "epoch": 1.8179882916444918, + "grad_norm": 0.2635621130466461, + "learning_rate": 1.4339706855718397e-07, + "loss": 0.1742, + "step": 6832 + }, + { + "epoch": 1.8182543906333155, + "grad_norm": 0.2808069586753845, + "learning_rate": 1.4338185266249642e-07, + "loss": 0.1874, + "step": 6833 + }, + { + "epoch": 1.8185204896221394, + "grad_norm": 0.40334993600845337, + "learning_rate": 1.4336663553047025e-07, + "loss": 0.2141, + "step": 6834 + }, + { + "epoch": 1.8187865886109633, + "grad_norm": 0.27139443159103394, + "learning_rate": 1.4335141716153953e-07, + "loss": 0.1822, + "step": 6835 + }, + { + "epoch": 1.819052687599787, + "grad_norm": 0.2651747763156891, + "learning_rate": 1.433361975561383e-07, + "loss": 0.1896, + "step": 6836 + }, + { + "epoch": 1.8193187865886111, + "grad_norm": 0.289449006319046, + "learning_rate": 1.4332097671470065e-07, + "loss": 0.2023, + "step": 6837 + }, + { + "epoch": 1.8195848855774348, + "grad_norm": 0.2616565227508545, + "learning_rate": 1.433057546376607e-07, + "loss": 0.1926, + "step": 6838 + }, + { + "epoch": 1.8198509845662585, + "grad_norm": 0.26833927631378174, + "learning_rate": 1.4329053132545267e-07, + "loss": 0.201, + "step": 6839 + }, + { + "epoch": 1.8201170835550826, + "grad_norm": 0.27753159403800964, + "learning_rate": 1.4327530677851067e-07, + "loss": 0.1978, + "step": 6840 + }, + { + "epoch": 1.8203831825439063, + "grad_norm": 0.2777571678161621, + "learning_rate": 1.4326008099726898e-07, + "loss": 0.1989, + "step": 6841 + }, + { + "epoch": 1.8206492815327302, + "grad_norm": 0.33565789461135864, + "learning_rate": 1.432448539821619e-07, + "loss": 0.1912, + "step": 6842 + }, + { + "epoch": 1.820915380521554, + "grad_norm": 0.2582360506057739, + "learning_rate": 1.432296257336237e-07, + "loss": 0.1779, + "step": 6843 + }, + { + "epoch": 1.8211814795103778, + "grad_norm": 0.2891208231449127, + "learning_rate": 1.4321439625208869e-07, + "loss": 0.1775, + "step": 6844 + }, + { + "epoch": 1.8214475784992017, + "grad_norm": 0.2738055884838104, + "learning_rate": 1.431991655379913e-07, + "loss": 0.1684, + "step": 6845 + }, + { + "epoch": 1.8217136774880256, + "grad_norm": 0.25771188735961914, + "learning_rate": 1.4318393359176598e-07, + "loss": 0.1704, + "step": 6846 + }, + { + "epoch": 1.8219797764768493, + "grad_norm": 0.2818349301815033, + "learning_rate": 1.4316870041384706e-07, + "loss": 0.1953, + "step": 6847 + }, + { + "epoch": 1.8222458754656732, + "grad_norm": 0.36276406049728394, + "learning_rate": 1.431534660046691e-07, + "loss": 0.1861, + "step": 6848 + }, + { + "epoch": 1.822511974454497, + "grad_norm": 0.3193511366844177, + "learning_rate": 1.431382303646666e-07, + "loss": 0.1793, + "step": 6849 + }, + { + "epoch": 1.8227780734433208, + "grad_norm": 0.37148693203926086, + "learning_rate": 1.431229934942741e-07, + "loss": 0.214, + "step": 6850 + }, + { + "epoch": 1.823044172432145, + "grad_norm": 0.34686970710754395, + "learning_rate": 1.431077553939262e-07, + "loss": 0.169, + "step": 6851 + }, + { + "epoch": 1.8233102714209686, + "grad_norm": 0.26319029927253723, + "learning_rate": 1.430925160640575e-07, + "loss": 0.198, + "step": 6852 + }, + { + "epoch": 1.8235763704097925, + "grad_norm": 0.2811274528503418, + "learning_rate": 1.4307727550510266e-07, + "loss": 0.1962, + "step": 6853 + }, + { + "epoch": 1.8238424693986164, + "grad_norm": 0.2748839259147644, + "learning_rate": 1.4306203371749644e-07, + "loss": 0.1872, + "step": 6854 + }, + { + "epoch": 1.82410856838744, + "grad_norm": 0.41081148386001587, + "learning_rate": 1.4304679070167347e-07, + "loss": 0.2076, + "step": 6855 + }, + { + "epoch": 1.824374667376264, + "grad_norm": 0.2743383049964905, + "learning_rate": 1.430315464580686e-07, + "loss": 0.1733, + "step": 6856 + }, + { + "epoch": 1.8246407663650879, + "grad_norm": 0.2824670672416687, + "learning_rate": 1.4301630098711654e-07, + "loss": 0.1895, + "step": 6857 + }, + { + "epoch": 1.8249068653539116, + "grad_norm": 0.2911315858364105, + "learning_rate": 1.4300105428925218e-07, + "loss": 0.2002, + "step": 6858 + }, + { + "epoch": 1.8251729643427355, + "grad_norm": 0.34878015518188477, + "learning_rate": 1.4298580636491036e-07, + "loss": 0.1809, + "step": 6859 + }, + { + "epoch": 1.8254390633315594, + "grad_norm": 0.2833704352378845, + "learning_rate": 1.4297055721452597e-07, + "loss": 0.1966, + "step": 6860 + }, + { + "epoch": 1.825705162320383, + "grad_norm": 0.2704705595970154, + "learning_rate": 1.4295530683853403e-07, + "loss": 0.1999, + "step": 6861 + }, + { + "epoch": 1.8259712613092072, + "grad_norm": 0.26068323850631714, + "learning_rate": 1.4294005523736939e-07, + "loss": 0.1904, + "step": 6862 + }, + { + "epoch": 1.8262373602980309, + "grad_norm": 0.25572124123573303, + "learning_rate": 1.4292480241146713e-07, + "loss": 0.1641, + "step": 6863 + }, + { + "epoch": 1.8265034592868548, + "grad_norm": 0.2662104070186615, + "learning_rate": 1.429095483612623e-07, + "loss": 0.1904, + "step": 6864 + }, + { + "epoch": 1.8267695582756787, + "grad_norm": 0.2542107105255127, + "learning_rate": 1.4289429308718993e-07, + "loss": 0.1828, + "step": 6865 + }, + { + "epoch": 1.8270356572645023, + "grad_norm": 0.37685903906822205, + "learning_rate": 1.4287903658968516e-07, + "loss": 0.2016, + "step": 6866 + }, + { + "epoch": 1.8273017562533262, + "grad_norm": 0.266279011964798, + "learning_rate": 1.4286377886918314e-07, + "loss": 0.1718, + "step": 6867 + }, + { + "epoch": 1.8275678552421502, + "grad_norm": 0.26272842288017273, + "learning_rate": 1.4284851992611905e-07, + "loss": 0.1849, + "step": 6868 + }, + { + "epoch": 1.8278339542309738, + "grad_norm": 0.25654712319374084, + "learning_rate": 1.4283325976092808e-07, + "loss": 0.1774, + "step": 6869 + }, + { + "epoch": 1.8281000532197977, + "grad_norm": 0.25164639949798584, + "learning_rate": 1.428179983740455e-07, + "loss": 0.1891, + "step": 6870 + }, + { + "epoch": 1.8283661522086216, + "grad_norm": 0.2642848491668701, + "learning_rate": 1.4280273576590662e-07, + "loss": 0.1898, + "step": 6871 + }, + { + "epoch": 1.8286322511974453, + "grad_norm": 0.3800680637359619, + "learning_rate": 1.4278747193694675e-07, + "loss": 0.2212, + "step": 6872 + }, + { + "epoch": 1.8288983501862695, + "grad_norm": 0.2721620202064514, + "learning_rate": 1.4277220688760118e-07, + "loss": 0.1878, + "step": 6873 + }, + { + "epoch": 1.8291644491750931, + "grad_norm": 0.25141769647598267, + "learning_rate": 1.427569406183054e-07, + "loss": 0.1682, + "step": 6874 + }, + { + "epoch": 1.8294305481639168, + "grad_norm": 0.2897357940673828, + "learning_rate": 1.4274167312949474e-07, + "loss": 0.2037, + "step": 6875 + }, + { + "epoch": 1.829696647152741, + "grad_norm": 0.2578985393047333, + "learning_rate": 1.4272640442160476e-07, + "loss": 0.1734, + "step": 6876 + }, + { + "epoch": 1.8299627461415646, + "grad_norm": 0.2600593864917755, + "learning_rate": 1.4271113449507085e-07, + "loss": 0.188, + "step": 6877 + }, + { + "epoch": 1.8302288451303885, + "grad_norm": 0.29616954922676086, + "learning_rate": 1.4269586335032862e-07, + "loss": 0.189, + "step": 6878 + }, + { + "epoch": 1.8304949441192124, + "grad_norm": 0.2638096511363983, + "learning_rate": 1.426805909878136e-07, + "loss": 0.1703, + "step": 6879 + }, + { + "epoch": 1.8307610431080361, + "grad_norm": 0.41731172800064087, + "learning_rate": 1.426653174079614e-07, + "loss": 0.1929, + "step": 6880 + }, + { + "epoch": 1.83102714209686, + "grad_norm": 0.35357052087783813, + "learning_rate": 1.4265004261120764e-07, + "loss": 0.1929, + "step": 6881 + }, + { + "epoch": 1.831293241085684, + "grad_norm": 0.3067391812801361, + "learning_rate": 1.42634766597988e-07, + "loss": 0.1916, + "step": 6882 + }, + { + "epoch": 1.8315593400745076, + "grad_norm": 0.34317272901535034, + "learning_rate": 1.4261948936873818e-07, + "loss": 0.2022, + "step": 6883 + }, + { + "epoch": 1.8318254390633315, + "grad_norm": 0.2712608277797699, + "learning_rate": 1.426042109238939e-07, + "loss": 0.1713, + "step": 6884 + }, + { + "epoch": 1.8320915380521554, + "grad_norm": 0.29191941022872925, + "learning_rate": 1.4258893126389097e-07, + "loss": 0.1915, + "step": 6885 + }, + { + "epoch": 1.832357637040979, + "grad_norm": 0.2750273048877716, + "learning_rate": 1.4257365038916518e-07, + "loss": 0.2012, + "step": 6886 + }, + { + "epoch": 1.8326237360298032, + "grad_norm": 0.3280467987060547, + "learning_rate": 1.4255836830015233e-07, + "loss": 0.2036, + "step": 6887 + }, + { + "epoch": 1.832889835018627, + "grad_norm": 0.2984772324562073, + "learning_rate": 1.4254308499728835e-07, + "loss": 0.2037, + "step": 6888 + }, + { + "epoch": 1.8331559340074508, + "grad_norm": 0.3280640244483948, + "learning_rate": 1.4252780048100913e-07, + "loss": 0.1913, + "step": 6889 + }, + { + "epoch": 1.8334220329962747, + "grad_norm": 0.2907763123512268, + "learning_rate": 1.4251251475175065e-07, + "loss": 0.2032, + "step": 6890 + }, + { + "epoch": 1.8336881319850984, + "grad_norm": 0.3408104181289673, + "learning_rate": 1.4249722780994885e-07, + "loss": 0.1951, + "step": 6891 + }, + { + "epoch": 1.8339542309739223, + "grad_norm": 0.2685275375843048, + "learning_rate": 1.424819396560397e-07, + "loss": 0.1758, + "step": 6892 + }, + { + "epoch": 1.8342203299627462, + "grad_norm": 0.491514652967453, + "learning_rate": 1.4246665029045938e-07, + "loss": 0.1738, + "step": 6893 + }, + { + "epoch": 1.8344864289515699, + "grad_norm": 0.2802923619747162, + "learning_rate": 1.4245135971364388e-07, + "loss": 0.1972, + "step": 6894 + }, + { + "epoch": 1.8347525279403938, + "grad_norm": 0.282671719789505, + "learning_rate": 1.424360679260293e-07, + "loss": 0.199, + "step": 6895 + }, + { + "epoch": 1.8350186269292177, + "grad_norm": 0.2620566487312317, + "learning_rate": 1.4242077492805184e-07, + "loss": 0.1781, + "step": 6896 + }, + { + "epoch": 1.8352847259180414, + "grad_norm": 0.3358902335166931, + "learning_rate": 1.424054807201477e-07, + "loss": 0.1903, + "step": 6897 + }, + { + "epoch": 1.8355508249068655, + "grad_norm": 0.32941919565200806, + "learning_rate": 1.4239018530275308e-07, + "loss": 0.1901, + "step": 6898 + }, + { + "epoch": 1.8358169238956892, + "grad_norm": 0.2613202929496765, + "learning_rate": 1.423748886763042e-07, + "loss": 0.174, + "step": 6899 + }, + { + "epoch": 1.836083022884513, + "grad_norm": 0.3066559433937073, + "learning_rate": 1.423595908412374e-07, + "loss": 0.2017, + "step": 6900 + }, + { + "epoch": 1.836349121873337, + "grad_norm": 0.26753556728363037, + "learning_rate": 1.4234429179798902e-07, + "loss": 0.1935, + "step": 6901 + }, + { + "epoch": 1.8366152208621607, + "grad_norm": 0.3627117872238159, + "learning_rate": 1.4232899154699535e-07, + "loss": 0.1912, + "step": 6902 + }, + { + "epoch": 1.8368813198509846, + "grad_norm": 0.2632262110710144, + "learning_rate": 1.4231369008869285e-07, + "loss": 0.1945, + "step": 6903 + }, + { + "epoch": 1.8371474188398085, + "grad_norm": 0.27140381932258606, + "learning_rate": 1.4229838742351793e-07, + "loss": 0.1813, + "step": 6904 + }, + { + "epoch": 1.8374135178286322, + "grad_norm": 0.26243627071380615, + "learning_rate": 1.42283083551907e-07, + "loss": 0.1702, + "step": 6905 + }, + { + "epoch": 1.837679616817456, + "grad_norm": 0.2660468518733978, + "learning_rate": 1.422677784742966e-07, + "loss": 0.1848, + "step": 6906 + }, + { + "epoch": 1.83794571580628, + "grad_norm": 0.2419682741165161, + "learning_rate": 1.422524721911233e-07, + "loss": 0.1712, + "step": 6907 + }, + { + "epoch": 1.8382118147951036, + "grad_norm": 0.27238360047340393, + "learning_rate": 1.4223716470282362e-07, + "loss": 0.2099, + "step": 6908 + }, + { + "epoch": 1.8384779137839278, + "grad_norm": 0.3632107973098755, + "learning_rate": 1.422218560098342e-07, + "loss": 0.195, + "step": 6909 + }, + { + "epoch": 1.8387440127727515, + "grad_norm": 0.24846667051315308, + "learning_rate": 1.4220654611259163e-07, + "loss": 0.1668, + "step": 6910 + }, + { + "epoch": 1.8390101117615754, + "grad_norm": 0.2577223777770996, + "learning_rate": 1.421912350115326e-07, + "loss": 0.1847, + "step": 6911 + }, + { + "epoch": 1.8392762107503993, + "grad_norm": 0.2514362037181854, + "learning_rate": 1.421759227070938e-07, + "loss": 0.1785, + "step": 6912 + }, + { + "epoch": 1.839542309739223, + "grad_norm": 0.280470609664917, + "learning_rate": 1.4216060919971198e-07, + "loss": 0.1923, + "step": 6913 + }, + { + "epoch": 1.8398084087280469, + "grad_norm": 0.36058130860328674, + "learning_rate": 1.4214529448982392e-07, + "loss": 0.2046, + "step": 6914 + }, + { + "epoch": 1.8400745077168708, + "grad_norm": 0.2734798491001129, + "learning_rate": 1.421299785778664e-07, + "loss": 0.1853, + "step": 6915 + }, + { + "epoch": 1.8403406067056944, + "grad_norm": 0.27058809995651245, + "learning_rate": 1.4211466146427632e-07, + "loss": 0.1923, + "step": 6916 + }, + { + "epoch": 1.8406067056945183, + "grad_norm": 0.37759461998939514, + "learning_rate": 1.4209934314949047e-07, + "loss": 0.2071, + "step": 6917 + }, + { + "epoch": 1.8408728046833422, + "grad_norm": 0.25268420577049255, + "learning_rate": 1.4208402363394584e-07, + "loss": 0.1755, + "step": 6918 + }, + { + "epoch": 1.841138903672166, + "grad_norm": 0.27495303750038147, + "learning_rate": 1.4206870291807932e-07, + "loss": 0.1826, + "step": 6919 + }, + { + "epoch": 1.84140500266099, + "grad_norm": 0.2822524309158325, + "learning_rate": 1.420533810023279e-07, + "loss": 0.1906, + "step": 6920 + }, + { + "epoch": 1.8416711016498137, + "grad_norm": 0.3182586431503296, + "learning_rate": 1.420380578871286e-07, + "loss": 0.1993, + "step": 6921 + }, + { + "epoch": 1.8419372006386374, + "grad_norm": 0.28458619117736816, + "learning_rate": 1.4202273357291849e-07, + "loss": 0.1801, + "step": 6922 + }, + { + "epoch": 1.8422032996274615, + "grad_norm": 0.4576607048511505, + "learning_rate": 1.4200740806013455e-07, + "loss": 0.2216, + "step": 6923 + }, + { + "epoch": 1.8424693986162852, + "grad_norm": 0.2733632028102875, + "learning_rate": 1.4199208134921402e-07, + "loss": 0.1792, + "step": 6924 + }, + { + "epoch": 1.8427354976051091, + "grad_norm": 0.27807947993278503, + "learning_rate": 1.41976753440594e-07, + "loss": 0.2102, + "step": 6925 + }, + { + "epoch": 1.843001596593933, + "grad_norm": 0.28248804807662964, + "learning_rate": 1.4196142433471167e-07, + "loss": 0.202, + "step": 6926 + }, + { + "epoch": 1.8432676955827567, + "grad_norm": 0.2772712707519531, + "learning_rate": 1.4194609403200426e-07, + "loss": 0.1774, + "step": 6927 + }, + { + "epoch": 1.8435337945715806, + "grad_norm": 0.2690514922142029, + "learning_rate": 1.4193076253290896e-07, + "loss": 0.1684, + "step": 6928 + }, + { + "epoch": 1.8437998935604045, + "grad_norm": 0.2945125699043274, + "learning_rate": 1.4191542983786316e-07, + "loss": 0.1946, + "step": 6929 + }, + { + "epoch": 1.8440659925492282, + "grad_norm": 0.3763456344604492, + "learning_rate": 1.419000959473041e-07, + "loss": 0.2095, + "step": 6930 + }, + { + "epoch": 1.844332091538052, + "grad_norm": 0.2774907648563385, + "learning_rate": 1.4188476086166915e-07, + "loss": 0.2028, + "step": 6931 + }, + { + "epoch": 1.844598190526876, + "grad_norm": 0.40695545077323914, + "learning_rate": 1.4186942458139575e-07, + "loss": 0.2123, + "step": 6932 + }, + { + "epoch": 1.8448642895156997, + "grad_norm": 0.34119507670402527, + "learning_rate": 1.4185408710692123e-07, + "loss": 0.1968, + "step": 6933 + }, + { + "epoch": 1.8451303885045238, + "grad_norm": 0.2803584933280945, + "learning_rate": 1.4183874843868314e-07, + "loss": 0.1876, + "step": 6934 + }, + { + "epoch": 1.8453964874933475, + "grad_norm": 0.28085756301879883, + "learning_rate": 1.4182340857711887e-07, + "loss": 0.1833, + "step": 6935 + }, + { + "epoch": 1.8456625864821714, + "grad_norm": 0.2685031294822693, + "learning_rate": 1.4180806752266602e-07, + "loss": 0.1629, + "step": 6936 + }, + { + "epoch": 1.8459286854709953, + "grad_norm": 0.36491093039512634, + "learning_rate": 1.4179272527576215e-07, + "loss": 0.2139, + "step": 6937 + }, + { + "epoch": 1.846194784459819, + "grad_norm": 0.26566222310066223, + "learning_rate": 1.417773818368448e-07, + "loss": 0.1752, + "step": 6938 + }, + { + "epoch": 1.846460883448643, + "grad_norm": 0.2446349859237671, + "learning_rate": 1.4176203720635162e-07, + "loss": 0.1846, + "step": 6939 + }, + { + "epoch": 1.8467269824374668, + "grad_norm": 0.27393728494644165, + "learning_rate": 1.417466913847203e-07, + "loss": 0.1872, + "step": 6940 + }, + { + "epoch": 1.8469930814262905, + "grad_norm": 0.2823004126548767, + "learning_rate": 1.4173134437238847e-07, + "loss": 0.1771, + "step": 6941 + }, + { + "epoch": 1.8472591804151144, + "grad_norm": 0.40050458908081055, + "learning_rate": 1.4171599616979393e-07, + "loss": 0.1868, + "step": 6942 + }, + { + "epoch": 1.8475252794039383, + "grad_norm": 0.5270482301712036, + "learning_rate": 1.4170064677737438e-07, + "loss": 0.2127, + "step": 6943 + }, + { + "epoch": 1.847791378392762, + "grad_norm": 0.3951832056045532, + "learning_rate": 1.4168529619556768e-07, + "loss": 0.1996, + "step": 6944 + }, + { + "epoch": 1.848057477381586, + "grad_norm": 0.3975665867328644, + "learning_rate": 1.4166994442481162e-07, + "loss": 0.2101, + "step": 6945 + }, + { + "epoch": 1.8483235763704098, + "grad_norm": 0.2975090742111206, + "learning_rate": 1.4165459146554403e-07, + "loss": 0.1765, + "step": 6946 + }, + { + "epoch": 1.8485896753592337, + "grad_norm": 0.27796557545661926, + "learning_rate": 1.4163923731820288e-07, + "loss": 0.189, + "step": 6947 + }, + { + "epoch": 1.8488557743480576, + "grad_norm": 0.3113378584384918, + "learning_rate": 1.4162388198322607e-07, + "loss": 0.1782, + "step": 6948 + }, + { + "epoch": 1.8491218733368813, + "grad_norm": 0.2820933759212494, + "learning_rate": 1.4160852546105153e-07, + "loss": 0.186, + "step": 6949 + }, + { + "epoch": 1.8493879723257052, + "grad_norm": 0.3459379971027374, + "learning_rate": 1.415931677521173e-07, + "loss": 0.1782, + "step": 6950 + }, + { + "epoch": 1.849654071314529, + "grad_norm": 0.2688221037387848, + "learning_rate": 1.4157780885686143e-07, + "loss": 0.1854, + "step": 6951 + }, + { + "epoch": 1.8499201703033528, + "grad_norm": 0.3153390884399414, + "learning_rate": 1.4156244877572198e-07, + "loss": 0.2084, + "step": 6952 + }, + { + "epoch": 1.8501862692921767, + "grad_norm": 0.39003562927246094, + "learning_rate": 1.41547087509137e-07, + "loss": 0.1928, + "step": 6953 + }, + { + "epoch": 1.8504523682810006, + "grad_norm": 0.3680359721183777, + "learning_rate": 1.4153172505754462e-07, + "loss": 0.1759, + "step": 6954 + }, + { + "epoch": 1.8507184672698243, + "grad_norm": 0.35826870799064636, + "learning_rate": 1.415163614213831e-07, + "loss": 0.2053, + "step": 6955 + }, + { + "epoch": 1.8509845662586484, + "grad_norm": 0.28297170996665955, + "learning_rate": 1.4150099660109053e-07, + "loss": 0.1789, + "step": 6956 + }, + { + "epoch": 1.851250665247472, + "grad_norm": 0.36723458766937256, + "learning_rate": 1.4148563059710527e-07, + "loss": 0.1821, + "step": 6957 + }, + { + "epoch": 1.8515167642362957, + "grad_norm": 0.25469323992729187, + "learning_rate": 1.414702634098655e-07, + "loss": 0.1702, + "step": 6958 + }, + { + "epoch": 1.8517828632251199, + "grad_norm": 0.2969895601272583, + "learning_rate": 1.4145489503980955e-07, + "loss": 0.2063, + "step": 6959 + }, + { + "epoch": 1.8520489622139436, + "grad_norm": 0.30460992455482483, + "learning_rate": 1.4143952548737572e-07, + "loss": 0.1969, + "step": 6960 + }, + { + "epoch": 1.8523150612027675, + "grad_norm": 0.30726873874664307, + "learning_rate": 1.4142415475300243e-07, + "loss": 0.1819, + "step": 6961 + }, + { + "epoch": 1.8525811601915914, + "grad_norm": 0.33815324306488037, + "learning_rate": 1.4140878283712808e-07, + "loss": 0.1917, + "step": 6962 + }, + { + "epoch": 1.852847259180415, + "grad_norm": 0.34099718928337097, + "learning_rate": 1.4139340974019107e-07, + "loss": 0.199, + "step": 6963 + }, + { + "epoch": 1.853113358169239, + "grad_norm": 0.280060350894928, + "learning_rate": 1.4137803546262992e-07, + "loss": 0.1923, + "step": 6964 + }, + { + "epoch": 1.8533794571580628, + "grad_norm": 0.30529963970184326, + "learning_rate": 1.4136266000488312e-07, + "loss": 0.2057, + "step": 6965 + }, + { + "epoch": 1.8536455561468865, + "grad_norm": 0.2674589455127716, + "learning_rate": 1.413472833673892e-07, + "loss": 0.1853, + "step": 6966 + }, + { + "epoch": 1.8539116551357104, + "grad_norm": 0.2593076825141907, + "learning_rate": 1.4133190555058673e-07, + "loss": 0.1798, + "step": 6967 + }, + { + "epoch": 1.8541777541245343, + "grad_norm": 0.27046316862106323, + "learning_rate": 1.4131652655491433e-07, + "loss": 0.1904, + "step": 6968 + }, + { + "epoch": 1.854443853113358, + "grad_norm": 0.325303852558136, + "learning_rate": 1.4130114638081064e-07, + "loss": 0.1703, + "step": 6969 + }, + { + "epoch": 1.8547099521021821, + "grad_norm": 0.2580420672893524, + "learning_rate": 1.4128576502871432e-07, + "loss": 0.1821, + "step": 6970 + }, + { + "epoch": 1.8549760510910058, + "grad_norm": 0.9150041341781616, + "learning_rate": 1.4127038249906406e-07, + "loss": 0.1837, + "step": 6971 + }, + { + "epoch": 1.8552421500798297, + "grad_norm": 0.26160791516304016, + "learning_rate": 1.4125499879229865e-07, + "loss": 0.1823, + "step": 6972 + }, + { + "epoch": 1.8555082490686536, + "grad_norm": 0.34491345286369324, + "learning_rate": 1.4123961390885686e-07, + "loss": 0.1896, + "step": 6973 + }, + { + "epoch": 1.8557743480574773, + "grad_norm": 0.26470786333084106, + "learning_rate": 1.4122422784917745e-07, + "loss": 0.1883, + "step": 6974 + }, + { + "epoch": 1.8560404470463012, + "grad_norm": 0.24900029599666595, + "learning_rate": 1.4120884061369928e-07, + "loss": 0.1814, + "step": 6975 + }, + { + "epoch": 1.8563065460351251, + "grad_norm": 0.2664289176464081, + "learning_rate": 1.4119345220286128e-07, + "loss": 0.1851, + "step": 6976 + }, + { + "epoch": 1.8565726450239488, + "grad_norm": 0.3896709978580475, + "learning_rate": 1.411780626171023e-07, + "loss": 0.2024, + "step": 6977 + }, + { + "epoch": 1.8568387440127727, + "grad_norm": 0.3771144151687622, + "learning_rate": 1.4116267185686126e-07, + "loss": 0.1884, + "step": 6978 + }, + { + "epoch": 1.8571048430015966, + "grad_norm": 0.2826792001724243, + "learning_rate": 1.411472799225772e-07, + "loss": 0.1815, + "step": 6979 + }, + { + "epoch": 1.8573709419904203, + "grad_norm": 0.33807381987571716, + "learning_rate": 1.4113188681468908e-07, + "loss": 0.1873, + "step": 6980 + }, + { + "epoch": 1.8576370409792444, + "grad_norm": 0.29848551750183105, + "learning_rate": 1.41116492533636e-07, + "loss": 0.1963, + "step": 6981 + }, + { + "epoch": 1.857903139968068, + "grad_norm": 0.28014886379241943, + "learning_rate": 1.4110109707985696e-07, + "loss": 0.1723, + "step": 6982 + }, + { + "epoch": 1.858169238956892, + "grad_norm": 0.26150789856910706, + "learning_rate": 1.4108570045379115e-07, + "loss": 0.1862, + "step": 6983 + }, + { + "epoch": 1.858435337945716, + "grad_norm": 0.3357656002044678, + "learning_rate": 1.4107030265587764e-07, + "loss": 0.199, + "step": 6984 + }, + { + "epoch": 1.8587014369345396, + "grad_norm": 0.27458077669143677, + "learning_rate": 1.4105490368655565e-07, + "loss": 0.1882, + "step": 6985 + }, + { + "epoch": 1.8589675359233635, + "grad_norm": 0.29558178782463074, + "learning_rate": 1.4103950354626438e-07, + "loss": 0.2188, + "step": 6986 + }, + { + "epoch": 1.8592336349121874, + "grad_norm": 0.2624601125717163, + "learning_rate": 1.4102410223544308e-07, + "loss": 0.2032, + "step": 6987 + }, + { + "epoch": 1.859499733901011, + "grad_norm": 0.39500585198402405, + "learning_rate": 1.41008699754531e-07, + "loss": 0.1908, + "step": 6988 + }, + { + "epoch": 1.859765832889835, + "grad_norm": 0.25122785568237305, + "learning_rate": 1.4099329610396748e-07, + "loss": 0.1805, + "step": 6989 + }, + { + "epoch": 1.860031931878659, + "grad_norm": 0.4421968460083008, + "learning_rate": 1.4097789128419187e-07, + "loss": 0.1926, + "step": 6990 + }, + { + "epoch": 1.8602980308674826, + "grad_norm": 0.30071142315864563, + "learning_rate": 1.409624852956435e-07, + "loss": 0.1918, + "step": 6991 + }, + { + "epoch": 1.8605641298563067, + "grad_norm": 0.29825788736343384, + "learning_rate": 1.4094707813876183e-07, + "loss": 0.2, + "step": 6992 + }, + { + "epoch": 1.8608302288451304, + "grad_norm": 0.26372241973876953, + "learning_rate": 1.4093166981398627e-07, + "loss": 0.1801, + "step": 6993 + }, + { + "epoch": 1.861096327833954, + "grad_norm": 0.3766748905181885, + "learning_rate": 1.4091626032175635e-07, + "loss": 0.1826, + "step": 6994 + }, + { + "epoch": 1.8613624268227782, + "grad_norm": 0.26373469829559326, + "learning_rate": 1.409008496625115e-07, + "loss": 0.1803, + "step": 6995 + }, + { + "epoch": 1.8616285258116019, + "grad_norm": 0.2535863518714905, + "learning_rate": 1.408854378366913e-07, + "loss": 0.1669, + "step": 6996 + }, + { + "epoch": 1.8618946248004258, + "grad_norm": 0.3259177803993225, + "learning_rate": 1.4087002484473532e-07, + "loss": 0.1933, + "step": 6997 + }, + { + "epoch": 1.8621607237892497, + "grad_norm": 0.26109370589256287, + "learning_rate": 1.408546106870832e-07, + "loss": 0.1984, + "step": 6998 + }, + { + "epoch": 1.8624268227780734, + "grad_norm": 0.27139830589294434, + "learning_rate": 1.408391953641746e-07, + "loss": 0.1883, + "step": 6999 + }, + { + "epoch": 1.8626929217668973, + "grad_norm": 0.45556095242500305, + "learning_rate": 1.4082377887644908e-07, + "loss": 0.171, + "step": 7000 + }, + { + "epoch": 1.8629590207557212, + "grad_norm": 0.3394666016101837, + "learning_rate": 1.4080836122434648e-07, + "loss": 0.1744, + "step": 7001 + }, + { + "epoch": 1.8632251197445449, + "grad_norm": 0.2842285931110382, + "learning_rate": 1.4079294240830647e-07, + "loss": 0.1708, + "step": 7002 + }, + { + "epoch": 1.8634912187333688, + "grad_norm": 0.2553044855594635, + "learning_rate": 1.4077752242876888e-07, + "loss": 0.1828, + "step": 7003 + }, + { + "epoch": 1.8637573177221927, + "grad_norm": 0.37936970591545105, + "learning_rate": 1.4076210128617344e-07, + "loss": 0.1927, + "step": 7004 + }, + { + "epoch": 1.8640234167110163, + "grad_norm": 0.3687392473220825, + "learning_rate": 1.4074667898096009e-07, + "loss": 0.1914, + "step": 7005 + }, + { + "epoch": 1.8642895156998405, + "grad_norm": 0.4419589936733246, + "learning_rate": 1.4073125551356864e-07, + "loss": 0.1988, + "step": 7006 + }, + { + "epoch": 1.8645556146886642, + "grad_norm": 0.28122347593307495, + "learning_rate": 1.40715830884439e-07, + "loss": 0.1795, + "step": 7007 + }, + { + "epoch": 1.864821713677488, + "grad_norm": 0.2584910988807678, + "learning_rate": 1.407004050940111e-07, + "loss": 0.1683, + "step": 7008 + }, + { + "epoch": 1.865087812666312, + "grad_norm": 0.2837640643119812, + "learning_rate": 1.4068497814272496e-07, + "loss": 0.2044, + "step": 7009 + }, + { + "epoch": 1.8653539116551356, + "grad_norm": 0.37612366676330566, + "learning_rate": 1.406695500310206e-07, + "loss": 0.2247, + "step": 7010 + }, + { + "epoch": 1.8656200106439595, + "grad_norm": 0.4336882531642914, + "learning_rate": 1.40654120759338e-07, + "loss": 0.2031, + "step": 7011 + }, + { + "epoch": 1.8658861096327835, + "grad_norm": 0.24201461672782898, + "learning_rate": 1.4063869032811725e-07, + "loss": 0.1615, + "step": 7012 + }, + { + "epoch": 1.8661522086216071, + "grad_norm": 0.26289889216423035, + "learning_rate": 1.406232587377985e-07, + "loss": 0.1817, + "step": 7013 + }, + { + "epoch": 1.866418307610431, + "grad_norm": 0.329010009765625, + "learning_rate": 1.4060782598882183e-07, + "loss": 0.211, + "step": 7014 + }, + { + "epoch": 1.866684406599255, + "grad_norm": 0.2924025058746338, + "learning_rate": 1.4059239208162747e-07, + "loss": 0.1709, + "step": 7015 + }, + { + "epoch": 1.8669505055880786, + "grad_norm": 0.28124353289604187, + "learning_rate": 1.4057695701665558e-07, + "loss": 0.1842, + "step": 7016 + }, + { + "epoch": 1.8672166045769027, + "grad_norm": 0.2599050998687744, + "learning_rate": 1.4056152079434643e-07, + "loss": 0.1606, + "step": 7017 + }, + { + "epoch": 1.8674827035657264, + "grad_norm": 0.26678863167762756, + "learning_rate": 1.4054608341514027e-07, + "loss": 0.1747, + "step": 7018 + }, + { + "epoch": 1.8677488025545503, + "grad_norm": 0.29014158248901367, + "learning_rate": 1.4053064487947744e-07, + "loss": 0.1911, + "step": 7019 + }, + { + "epoch": 1.8680149015433742, + "grad_norm": 0.2640722095966339, + "learning_rate": 1.4051520518779826e-07, + "loss": 0.186, + "step": 7020 + }, + { + "epoch": 1.868281000532198, + "grad_norm": 0.2899402976036072, + "learning_rate": 1.4049976434054307e-07, + "loss": 0.1994, + "step": 7021 + }, + { + "epoch": 1.8685470995210218, + "grad_norm": 0.240824356675148, + "learning_rate": 1.4048432233815231e-07, + "loss": 0.1658, + "step": 7022 + }, + { + "epoch": 1.8688131985098457, + "grad_norm": 0.32742762565612793, + "learning_rate": 1.404688791810664e-07, + "loss": 0.1819, + "step": 7023 + }, + { + "epoch": 1.8690792974986694, + "grad_norm": 0.2908945679664612, + "learning_rate": 1.4045343486972584e-07, + "loss": 0.2025, + "step": 7024 + }, + { + "epoch": 1.8693453964874933, + "grad_norm": 0.4530985653400421, + "learning_rate": 1.404379894045711e-07, + "loss": 0.1838, + "step": 7025 + }, + { + "epoch": 1.8696114954763172, + "grad_norm": 0.3518587350845337, + "learning_rate": 1.4042254278604273e-07, + "loss": 0.1731, + "step": 7026 + }, + { + "epoch": 1.869877594465141, + "grad_norm": 0.4279211461544037, + "learning_rate": 1.404070950145813e-07, + "loss": 0.1812, + "step": 7027 + }, + { + "epoch": 1.870143693453965, + "grad_norm": 0.38394132256507874, + "learning_rate": 1.403916460906274e-07, + "loss": 0.1918, + "step": 7028 + }, + { + "epoch": 1.8704097924427887, + "grad_norm": 0.37392762303352356, + "learning_rate": 1.4037619601462168e-07, + "loss": 0.2045, + "step": 7029 + }, + { + "epoch": 1.8706758914316126, + "grad_norm": 0.38032466173171997, + "learning_rate": 1.4036074478700483e-07, + "loss": 0.1903, + "step": 7030 + }, + { + "epoch": 1.8709419904204365, + "grad_norm": 0.29945042729377747, + "learning_rate": 1.4034529240821747e-07, + "loss": 0.2085, + "step": 7031 + }, + { + "epoch": 1.8712080894092602, + "grad_norm": 0.419143408536911, + "learning_rate": 1.403298388787004e-07, + "loss": 0.1919, + "step": 7032 + }, + { + "epoch": 1.871474188398084, + "grad_norm": 0.4495593011379242, + "learning_rate": 1.4031438419889436e-07, + "loss": 0.197, + "step": 7033 + }, + { + "epoch": 1.871740287386908, + "grad_norm": 0.36824727058410645, + "learning_rate": 1.402989283692402e-07, + "loss": 0.1957, + "step": 7034 + }, + { + "epoch": 1.8720063863757317, + "grad_norm": 0.4359104335308075, + "learning_rate": 1.402834713901787e-07, + "loss": 0.2147, + "step": 7035 + }, + { + "epoch": 1.8722724853645556, + "grad_norm": 0.25794005393981934, + "learning_rate": 1.4026801326215073e-07, + "loss": 0.178, + "step": 7036 + }, + { + "epoch": 1.8725385843533795, + "grad_norm": 0.2586528956890106, + "learning_rate": 1.4025255398559716e-07, + "loss": 0.1807, + "step": 7037 + }, + { + "epoch": 1.8728046833422032, + "grad_norm": 0.2803901731967926, + "learning_rate": 1.40237093560959e-07, + "loss": 0.1816, + "step": 7038 + }, + { + "epoch": 1.8730707823310273, + "grad_norm": 0.35418257117271423, + "learning_rate": 1.4022163198867717e-07, + "loss": 0.1916, + "step": 7039 + }, + { + "epoch": 1.873336881319851, + "grad_norm": 0.2625042796134949, + "learning_rate": 1.402061692691926e-07, + "loss": 0.179, + "step": 7040 + }, + { + "epoch": 1.8736029803086747, + "grad_norm": 0.3453710675239563, + "learning_rate": 1.4019070540294644e-07, + "loss": 0.186, + "step": 7041 + }, + { + "epoch": 1.8738690792974988, + "grad_norm": 0.32115253806114197, + "learning_rate": 1.4017524039037966e-07, + "loss": 0.2206, + "step": 7042 + }, + { + "epoch": 1.8741351782863225, + "grad_norm": 0.288189172744751, + "learning_rate": 1.401597742319334e-07, + "loss": 0.1971, + "step": 7043 + }, + { + "epoch": 1.8744012772751464, + "grad_norm": 0.27079007029533386, + "learning_rate": 1.4014430692804878e-07, + "loss": 0.179, + "step": 7044 + }, + { + "epoch": 1.8746673762639703, + "grad_norm": 0.3213193118572235, + "learning_rate": 1.4012883847916693e-07, + "loss": 0.2009, + "step": 7045 + }, + { + "epoch": 1.874933475252794, + "grad_norm": 0.30238720774650574, + "learning_rate": 1.4011336888572906e-07, + "loss": 0.1895, + "step": 7046 + }, + { + "epoch": 1.8751995742416179, + "grad_norm": 0.2637619972229004, + "learning_rate": 1.400978981481764e-07, + "loss": 0.1736, + "step": 7047 + }, + { + "epoch": 1.8754656732304418, + "grad_norm": 0.35747769474983215, + "learning_rate": 1.4008242626695022e-07, + "loss": 0.2036, + "step": 7048 + }, + { + "epoch": 1.8757317722192655, + "grad_norm": 0.2702627182006836, + "learning_rate": 1.4006695324249176e-07, + "loss": 0.1986, + "step": 7049 + }, + { + "epoch": 1.8759978712080894, + "grad_norm": 0.2635583281517029, + "learning_rate": 1.400514790752424e-07, + "loss": 0.1694, + "step": 7050 + }, + { + "epoch": 1.8762639701969133, + "grad_norm": 0.24549567699432373, + "learning_rate": 1.400360037656434e-07, + "loss": 0.1855, + "step": 7051 + }, + { + "epoch": 1.876530069185737, + "grad_norm": 0.2672322988510132, + "learning_rate": 1.400205273141363e-07, + "loss": 0.1709, + "step": 7052 + }, + { + "epoch": 1.876796168174561, + "grad_norm": 0.2962803244590759, + "learning_rate": 1.400050497211624e-07, + "loss": 0.1958, + "step": 7053 + }, + { + "epoch": 1.8770622671633848, + "grad_norm": 0.3227631747722626, + "learning_rate": 1.3998957098716322e-07, + "loss": 0.185, + "step": 7054 + }, + { + "epoch": 1.8773283661522087, + "grad_norm": 0.26715901494026184, + "learning_rate": 1.3997409111258017e-07, + "loss": 0.1674, + "step": 7055 + }, + { + "epoch": 1.8775944651410326, + "grad_norm": 0.29415163397789, + "learning_rate": 1.3995861009785484e-07, + "loss": 0.1844, + "step": 7056 + }, + { + "epoch": 1.8778605641298562, + "grad_norm": 0.29664021730422974, + "learning_rate": 1.3994312794342875e-07, + "loss": 0.2023, + "step": 7057 + }, + { + "epoch": 1.8781266631186802, + "grad_norm": 0.28843235969543457, + "learning_rate": 1.3992764464974345e-07, + "loss": 0.1753, + "step": 7058 + }, + { + "epoch": 1.878392762107504, + "grad_norm": 0.27084022760391235, + "learning_rate": 1.3991216021724062e-07, + "loss": 0.1864, + "step": 7059 + }, + { + "epoch": 1.8786588610963277, + "grad_norm": 0.3017547130584717, + "learning_rate": 1.3989667464636185e-07, + "loss": 0.2, + "step": 7060 + }, + { + "epoch": 1.8789249600851516, + "grad_norm": 0.2564936578273773, + "learning_rate": 1.3988118793754887e-07, + "loss": 0.1732, + "step": 7061 + }, + { + "epoch": 1.8791910590739755, + "grad_norm": 0.25450992584228516, + "learning_rate": 1.3986570009124336e-07, + "loss": 0.173, + "step": 7062 + }, + { + "epoch": 1.8794571580627992, + "grad_norm": 0.2579646408557892, + "learning_rate": 1.3985021110788706e-07, + "loss": 0.169, + "step": 7063 + }, + { + "epoch": 1.8797232570516234, + "grad_norm": 0.31343311071395874, + "learning_rate": 1.3983472098792176e-07, + "loss": 0.2054, + "step": 7064 + }, + { + "epoch": 1.879989356040447, + "grad_norm": 0.2728286385536194, + "learning_rate": 1.3981922973178927e-07, + "loss": 0.2031, + "step": 7065 + }, + { + "epoch": 1.880255455029271, + "grad_norm": 0.32699212431907654, + "learning_rate": 1.3980373733993145e-07, + "loss": 0.1908, + "step": 7066 + }, + { + "epoch": 1.8805215540180948, + "grad_norm": 0.25535887479782104, + "learning_rate": 1.3978824381279015e-07, + "loss": 0.1718, + "step": 7067 + }, + { + "epoch": 1.8807876530069185, + "grad_norm": 0.8432353138923645, + "learning_rate": 1.3977274915080731e-07, + "loss": 0.2112, + "step": 7068 + }, + { + "epoch": 1.8810537519957424, + "grad_norm": 0.5912530422210693, + "learning_rate": 1.3975725335442478e-07, + "loss": 0.1832, + "step": 7069 + }, + { + "epoch": 1.8813198509845663, + "grad_norm": 0.27983036637306213, + "learning_rate": 1.3974175642408462e-07, + "loss": 0.1848, + "step": 7070 + }, + { + "epoch": 1.88158594997339, + "grad_norm": 0.4973524510860443, + "learning_rate": 1.3972625836022882e-07, + "loss": 0.208, + "step": 7071 + }, + { + "epoch": 1.881852048962214, + "grad_norm": 0.40717819333076477, + "learning_rate": 1.397107591632994e-07, + "loss": 0.1892, + "step": 7072 + }, + { + "epoch": 1.8821181479510378, + "grad_norm": 0.2999970316886902, + "learning_rate": 1.396952588337384e-07, + "loss": 0.1808, + "step": 7073 + }, + { + "epoch": 1.8823842469398615, + "grad_norm": 0.25017356872558594, + "learning_rate": 1.39679757371988e-07, + "loss": 0.179, + "step": 7074 + }, + { + "epoch": 1.8826503459286856, + "grad_norm": 0.2876442074775696, + "learning_rate": 1.3966425477849027e-07, + "loss": 0.1924, + "step": 7075 + }, + { + "epoch": 1.8829164449175093, + "grad_norm": 0.4081307351589203, + "learning_rate": 1.396487510536874e-07, + "loss": 0.1888, + "step": 7076 + }, + { + "epoch": 1.883182543906333, + "grad_norm": 0.39502155780792236, + "learning_rate": 1.3963324619802154e-07, + "loss": 0.2068, + "step": 7077 + }, + { + "epoch": 1.8834486428951571, + "grad_norm": 0.28289100527763367, + "learning_rate": 1.3961774021193499e-07, + "loss": 0.1847, + "step": 7078 + }, + { + "epoch": 1.8837147418839808, + "grad_norm": 0.37855783104896545, + "learning_rate": 1.3960223309587e-07, + "loss": 0.2212, + "step": 7079 + }, + { + "epoch": 1.8839808408728047, + "grad_norm": 0.34185320138931274, + "learning_rate": 1.3958672485026882e-07, + "loss": 0.1971, + "step": 7080 + }, + { + "epoch": 1.8842469398616286, + "grad_norm": 0.2917858064174652, + "learning_rate": 1.3957121547557383e-07, + "loss": 0.1797, + "step": 7081 + }, + { + "epoch": 1.8845130388504523, + "grad_norm": 0.8845049738883972, + "learning_rate": 1.3955570497222735e-07, + "loss": 0.2023, + "step": 7082 + }, + { + "epoch": 1.8847791378392762, + "grad_norm": 0.2620966136455536, + "learning_rate": 1.3954019334067176e-07, + "loss": 0.1655, + "step": 7083 + }, + { + "epoch": 1.8850452368281, + "grad_norm": 0.27802836894989014, + "learning_rate": 1.3952468058134953e-07, + "loss": 0.1972, + "step": 7084 + }, + { + "epoch": 1.8853113358169238, + "grad_norm": 0.27901533246040344, + "learning_rate": 1.395091666947031e-07, + "loss": 0.1881, + "step": 7085 + }, + { + "epoch": 1.8855774348057477, + "grad_norm": 0.28419601917266846, + "learning_rate": 1.3949365168117492e-07, + "loss": 0.1848, + "step": 7086 + }, + { + "epoch": 1.8858435337945716, + "grad_norm": 0.44798851013183594, + "learning_rate": 1.3947813554120756e-07, + "loss": 0.2034, + "step": 7087 + }, + { + "epoch": 1.8861096327833953, + "grad_norm": 0.37121596932411194, + "learning_rate": 1.3946261827524352e-07, + "loss": 0.2087, + "step": 7088 + }, + { + "epoch": 1.8863757317722194, + "grad_norm": 0.295939564704895, + "learning_rate": 1.3944709988372544e-07, + "loss": 0.1855, + "step": 7089 + }, + { + "epoch": 1.886641830761043, + "grad_norm": 0.32633480429649353, + "learning_rate": 1.394315803670959e-07, + "loss": 0.1819, + "step": 7090 + }, + { + "epoch": 1.886907929749867, + "grad_norm": 0.26840248703956604, + "learning_rate": 1.3941605972579758e-07, + "loss": 0.1727, + "step": 7091 + }, + { + "epoch": 1.887174028738691, + "grad_norm": 0.2522781193256378, + "learning_rate": 1.3940053796027313e-07, + "loss": 0.1767, + "step": 7092 + }, + { + "epoch": 1.8874401277275146, + "grad_norm": 0.25592827796936035, + "learning_rate": 1.3938501507096527e-07, + "loss": 0.1782, + "step": 7093 + }, + { + "epoch": 1.8877062267163385, + "grad_norm": 0.2665444016456604, + "learning_rate": 1.3936949105831678e-07, + "loss": 0.1904, + "step": 7094 + }, + { + "epoch": 1.8879723257051624, + "grad_norm": 0.3209782540798187, + "learning_rate": 1.3935396592277037e-07, + "loss": 0.1981, + "step": 7095 + }, + { + "epoch": 1.888238424693986, + "grad_norm": 0.3292695879936218, + "learning_rate": 1.3933843966476885e-07, + "loss": 0.1898, + "step": 7096 + }, + { + "epoch": 1.88850452368281, + "grad_norm": 0.34875163435935974, + "learning_rate": 1.3932291228475515e-07, + "loss": 0.2125, + "step": 7097 + }, + { + "epoch": 1.8887706226716339, + "grad_norm": 0.2832868695259094, + "learning_rate": 1.3930738378317201e-07, + "loss": 0.187, + "step": 7098 + }, + { + "epoch": 1.8890367216604576, + "grad_norm": 0.3139856159687042, + "learning_rate": 1.3929185416046248e-07, + "loss": 0.1929, + "step": 7099 + }, + { + "epoch": 1.8893028206492817, + "grad_norm": 0.2876126170158386, + "learning_rate": 1.392763234170694e-07, + "loss": 0.197, + "step": 7100 + }, + { + "epoch": 1.8895689196381054, + "grad_norm": 0.28322917222976685, + "learning_rate": 1.3926079155343577e-07, + "loss": 0.1731, + "step": 7101 + }, + { + "epoch": 1.8898350186269293, + "grad_norm": 0.2813510596752167, + "learning_rate": 1.3924525857000456e-07, + "loss": 0.1836, + "step": 7102 + }, + { + "epoch": 1.8901011176157532, + "grad_norm": 0.27716904878616333, + "learning_rate": 1.3922972446721884e-07, + "loss": 0.2099, + "step": 7103 + }, + { + "epoch": 1.8903672166045768, + "grad_norm": 0.43013519048690796, + "learning_rate": 1.3921418924552167e-07, + "loss": 0.2149, + "step": 7104 + }, + { + "epoch": 1.8906333155934008, + "grad_norm": 0.2529437243938446, + "learning_rate": 1.391986529053561e-07, + "loss": 0.1677, + "step": 7105 + }, + { + "epoch": 1.8908994145822247, + "grad_norm": 0.2714265286922455, + "learning_rate": 1.391831154471653e-07, + "loss": 0.197, + "step": 7106 + }, + { + "epoch": 1.8911655135710483, + "grad_norm": 0.2838635742664337, + "learning_rate": 1.3916757687139247e-07, + "loss": 0.19, + "step": 7107 + }, + { + "epoch": 1.8914316125598722, + "grad_norm": 0.3242475688457489, + "learning_rate": 1.3915203717848072e-07, + "loss": 0.2031, + "step": 7108 + }, + { + "epoch": 1.8916977115486961, + "grad_norm": 0.36101821064949036, + "learning_rate": 1.391364963688733e-07, + "loss": 0.1916, + "step": 7109 + }, + { + "epoch": 1.8919638105375198, + "grad_norm": 0.6392698884010315, + "learning_rate": 1.3912095444301352e-07, + "loss": 0.1915, + "step": 7110 + }, + { + "epoch": 1.892229909526344, + "grad_norm": 0.2770518362522125, + "learning_rate": 1.3910541140134457e-07, + "loss": 0.2081, + "step": 7111 + }, + { + "epoch": 1.8924960085151676, + "grad_norm": 0.3213501274585724, + "learning_rate": 1.3908986724430985e-07, + "loss": 0.203, + "step": 7112 + }, + { + "epoch": 1.8927621075039913, + "grad_norm": 0.33283692598342896, + "learning_rate": 1.3907432197235268e-07, + "loss": 0.1873, + "step": 7113 + }, + { + "epoch": 1.8930282064928154, + "grad_norm": 0.2744004726409912, + "learning_rate": 1.3905877558591644e-07, + "loss": 0.1823, + "step": 7114 + }, + { + "epoch": 1.8932943054816391, + "grad_norm": 0.2787860929965973, + "learning_rate": 1.3904322808544454e-07, + "loss": 0.1915, + "step": 7115 + }, + { + "epoch": 1.893560404470463, + "grad_norm": 0.3596310019493103, + "learning_rate": 1.3902767947138045e-07, + "loss": 0.1961, + "step": 7116 + }, + { + "epoch": 1.893826503459287, + "grad_norm": 0.2934364676475525, + "learning_rate": 1.3901212974416762e-07, + "loss": 0.1797, + "step": 7117 + }, + { + "epoch": 1.8940926024481106, + "grad_norm": 0.27329909801483154, + "learning_rate": 1.389965789042496e-07, + "loss": 0.188, + "step": 7118 + }, + { + "epoch": 1.8943587014369345, + "grad_norm": 0.2766727805137634, + "learning_rate": 1.3898102695206988e-07, + "loss": 0.1875, + "step": 7119 + }, + { + "epoch": 1.8946248004257584, + "grad_norm": 0.26995086669921875, + "learning_rate": 1.3896547388807206e-07, + "loss": 0.1737, + "step": 7120 + }, + { + "epoch": 1.894890899414582, + "grad_norm": 0.269390732049942, + "learning_rate": 1.3894991971269975e-07, + "loss": 0.1631, + "step": 7121 + }, + { + "epoch": 1.895156998403406, + "grad_norm": 0.26374727487564087, + "learning_rate": 1.3893436442639656e-07, + "loss": 0.1719, + "step": 7122 + }, + { + "epoch": 1.89542309739223, + "grad_norm": 0.32823804020881653, + "learning_rate": 1.3891880802960618e-07, + "loss": 0.1809, + "step": 7123 + }, + { + "epoch": 1.8956891963810536, + "grad_norm": 0.32811951637268066, + "learning_rate": 1.3890325052277231e-07, + "loss": 0.1947, + "step": 7124 + }, + { + "epoch": 1.8959552953698777, + "grad_norm": 0.26691102981567383, + "learning_rate": 1.388876919063387e-07, + "loss": 0.1863, + "step": 7125 + }, + { + "epoch": 1.8962213943587014, + "grad_norm": 0.3000461757183075, + "learning_rate": 1.3887213218074908e-07, + "loss": 0.2047, + "step": 7126 + }, + { + "epoch": 1.8964874933475253, + "grad_norm": 0.2976861596107483, + "learning_rate": 1.3885657134644726e-07, + "loss": 0.1709, + "step": 7127 + }, + { + "epoch": 1.8967535923363492, + "grad_norm": 0.324886679649353, + "learning_rate": 1.388410094038771e-07, + "loss": 0.1808, + "step": 7128 + }, + { + "epoch": 1.897019691325173, + "grad_norm": 0.3307180106639862, + "learning_rate": 1.3882544635348236e-07, + "loss": 0.2021, + "step": 7129 + }, + { + "epoch": 1.8972857903139968, + "grad_norm": 0.25517287850379944, + "learning_rate": 1.3880988219570707e-07, + "loss": 0.1807, + "step": 7130 + }, + { + "epoch": 1.8975518893028207, + "grad_norm": 0.2596918046474457, + "learning_rate": 1.3879431693099498e-07, + "loss": 0.1749, + "step": 7131 + }, + { + "epoch": 1.8978179882916444, + "grad_norm": 0.2558184564113617, + "learning_rate": 1.387787505597902e-07, + "loss": 0.1855, + "step": 7132 + }, + { + "epoch": 1.8980840872804683, + "grad_norm": 0.2881656289100647, + "learning_rate": 1.3876318308253665e-07, + "loss": 0.1856, + "step": 7133 + }, + { + "epoch": 1.8983501862692922, + "grad_norm": 0.2643626630306244, + "learning_rate": 1.3874761449967834e-07, + "loss": 0.1806, + "step": 7134 + }, + { + "epoch": 1.8986162852581159, + "grad_norm": 0.31765031814575195, + "learning_rate": 1.387320448116593e-07, + "loss": 0.1948, + "step": 7135 + }, + { + "epoch": 1.89888238424694, + "grad_norm": 0.36502841114997864, + "learning_rate": 1.3871647401892368e-07, + "loss": 0.1792, + "step": 7136 + }, + { + "epoch": 1.8991484832357637, + "grad_norm": 0.2656823992729187, + "learning_rate": 1.3870090212191552e-07, + "loss": 0.1799, + "step": 7137 + }, + { + "epoch": 1.8994145822245876, + "grad_norm": 0.5303717255592346, + "learning_rate": 1.3868532912107897e-07, + "loss": 0.1764, + "step": 7138 + }, + { + "epoch": 1.8996806812134115, + "grad_norm": 0.24815653264522552, + "learning_rate": 1.3866975501685825e-07, + "loss": 0.1547, + "step": 7139 + }, + { + "epoch": 1.8999467802022352, + "grad_norm": 0.2864212989807129, + "learning_rate": 1.3865417980969754e-07, + "loss": 0.1872, + "step": 7140 + }, + { + "epoch": 1.900212879191059, + "grad_norm": 0.24650566279888153, + "learning_rate": 1.3863860350004105e-07, + "loss": 0.1864, + "step": 7141 + }, + { + "epoch": 1.900478978179883, + "grad_norm": 0.41423436999320984, + "learning_rate": 1.3862302608833304e-07, + "loss": 0.2079, + "step": 7142 + }, + { + "epoch": 1.9007450771687067, + "grad_norm": 0.2698238492012024, + "learning_rate": 1.386074475750179e-07, + "loss": 0.1936, + "step": 7143 + }, + { + "epoch": 1.9010111761575306, + "grad_norm": 0.34611770510673523, + "learning_rate": 1.3859186796053988e-07, + "loss": 0.2128, + "step": 7144 + }, + { + "epoch": 1.9012772751463545, + "grad_norm": 0.3306783139705658, + "learning_rate": 1.3857628724534338e-07, + "loss": 0.1958, + "step": 7145 + }, + { + "epoch": 1.9015433741351782, + "grad_norm": 0.3453763425350189, + "learning_rate": 1.3856070542987275e-07, + "loss": 0.1765, + "step": 7146 + }, + { + "epoch": 1.9018094731240023, + "grad_norm": 0.33456507325172424, + "learning_rate": 1.3854512251457246e-07, + "loss": 0.1859, + "step": 7147 + }, + { + "epoch": 1.902075572112826, + "grad_norm": 0.29376333951950073, + "learning_rate": 1.3852953849988692e-07, + "loss": 0.1672, + "step": 7148 + }, + { + "epoch": 1.9023416711016499, + "grad_norm": 0.3008614480495453, + "learning_rate": 1.385139533862607e-07, + "loss": 0.1821, + "step": 7149 + }, + { + "epoch": 1.9026077700904738, + "grad_norm": 0.3091281056404114, + "learning_rate": 1.384983671741382e-07, + "loss": 0.1881, + "step": 7150 + }, + { + "epoch": 1.9028738690792975, + "grad_norm": 0.31446659564971924, + "learning_rate": 1.384827798639641e-07, + "loss": 0.1933, + "step": 7151 + }, + { + "epoch": 1.9031399680681214, + "grad_norm": 0.2661346197128296, + "learning_rate": 1.3846719145618288e-07, + "loss": 0.19, + "step": 7152 + }, + { + "epoch": 1.9034060670569453, + "grad_norm": 0.2527194917201996, + "learning_rate": 1.3845160195123919e-07, + "loss": 0.1721, + "step": 7153 + }, + { + "epoch": 1.903672166045769, + "grad_norm": 0.27800002694129944, + "learning_rate": 1.384360113495777e-07, + "loss": 0.1917, + "step": 7154 + }, + { + "epoch": 1.9039382650345928, + "grad_norm": 0.2855764329433441, + "learning_rate": 1.3842041965164304e-07, + "loss": 0.1867, + "step": 7155 + }, + { + "epoch": 1.9042043640234168, + "grad_norm": 0.3757893145084381, + "learning_rate": 1.3840482685787992e-07, + "loss": 0.2015, + "step": 7156 + }, + { + "epoch": 1.9044704630122404, + "grad_norm": 0.2898770868778229, + "learning_rate": 1.3838923296873314e-07, + "loss": 0.1887, + "step": 7157 + }, + { + "epoch": 1.9047365620010646, + "grad_norm": 0.2688502371311188, + "learning_rate": 1.383736379846474e-07, + "loss": 0.2007, + "step": 7158 + }, + { + "epoch": 1.9050026609898882, + "grad_norm": 0.257817804813385, + "learning_rate": 1.383580419060675e-07, + "loss": 0.1671, + "step": 7159 + }, + { + "epoch": 1.905268759978712, + "grad_norm": 0.2623639404773712, + "learning_rate": 1.3834244473343834e-07, + "loss": 0.1802, + "step": 7160 + }, + { + "epoch": 1.905534858967536, + "grad_norm": 0.28160691261291504, + "learning_rate": 1.3832684646720473e-07, + "loss": 0.1694, + "step": 7161 + }, + { + "epoch": 1.9058009579563597, + "grad_norm": 0.382341206073761, + "learning_rate": 1.3831124710781157e-07, + "loss": 0.1834, + "step": 7162 + }, + { + "epoch": 1.9060670569451836, + "grad_norm": 0.4236741364002228, + "learning_rate": 1.382956466557038e-07, + "loss": 0.1935, + "step": 7163 + }, + { + "epoch": 1.9063331559340075, + "grad_norm": 0.2771351933479309, + "learning_rate": 1.382800451113263e-07, + "loss": 0.1888, + "step": 7164 + }, + { + "epoch": 1.9065992549228312, + "grad_norm": 0.5043976902961731, + "learning_rate": 1.3826444247512422e-07, + "loss": 0.2115, + "step": 7165 + }, + { + "epoch": 1.9068653539116551, + "grad_norm": 0.3219095468521118, + "learning_rate": 1.3824883874754245e-07, + "loss": 0.1916, + "step": 7166 + }, + { + "epoch": 1.907131452900479, + "grad_norm": 0.3483000695705414, + "learning_rate": 1.3823323392902607e-07, + "loss": 0.1779, + "step": 7167 + }, + { + "epoch": 1.9073975518893027, + "grad_norm": 0.296566903591156, + "learning_rate": 1.3821762802002015e-07, + "loss": 0.1849, + "step": 7168 + }, + { + "epoch": 1.9076636508781266, + "grad_norm": 0.3224335014820099, + "learning_rate": 1.3820202102096982e-07, + "loss": 0.2032, + "step": 7169 + }, + { + "epoch": 1.9079297498669505, + "grad_norm": 0.2641737759113312, + "learning_rate": 1.3818641293232025e-07, + "loss": 0.1833, + "step": 7170 + }, + { + "epoch": 1.9081958488557742, + "grad_norm": 0.28131070733070374, + "learning_rate": 1.3817080375451654e-07, + "loss": 0.1809, + "step": 7171 + }, + { + "epoch": 1.9084619478445983, + "grad_norm": 0.33066070079803467, + "learning_rate": 1.38155193488004e-07, + "loss": 0.2046, + "step": 7172 + }, + { + "epoch": 1.908728046833422, + "grad_norm": 0.4029066264629364, + "learning_rate": 1.3813958213322776e-07, + "loss": 0.1957, + "step": 7173 + }, + { + "epoch": 1.908994145822246, + "grad_norm": 0.3341729938983917, + "learning_rate": 1.3812396969063312e-07, + "loss": 0.1987, + "step": 7174 + }, + { + "epoch": 1.9092602448110698, + "grad_norm": 0.26086002588272095, + "learning_rate": 1.3810835616066545e-07, + "loss": 0.1809, + "step": 7175 + }, + { + "epoch": 1.9095263437998935, + "grad_norm": 0.33547577261924744, + "learning_rate": 1.3809274154376998e-07, + "loss": 0.1999, + "step": 7176 + }, + { + "epoch": 1.9097924427887174, + "grad_norm": 0.3507625460624695, + "learning_rate": 1.3807712584039216e-07, + "loss": 0.2127, + "step": 7177 + }, + { + "epoch": 1.9100585417775413, + "grad_norm": 0.4298947751522064, + "learning_rate": 1.3806150905097728e-07, + "loss": 0.2117, + "step": 7178 + }, + { + "epoch": 1.910324640766365, + "grad_norm": 0.26981163024902344, + "learning_rate": 1.380458911759708e-07, + "loss": 0.1743, + "step": 7179 + }, + { + "epoch": 1.910590739755189, + "grad_norm": 0.29798123240470886, + "learning_rate": 1.3803027221581823e-07, + "loss": 0.1844, + "step": 7180 + }, + { + "epoch": 1.9108568387440128, + "grad_norm": 0.2790277600288391, + "learning_rate": 1.3801465217096504e-07, + "loss": 0.1856, + "step": 7181 + }, + { + "epoch": 1.9111229377328365, + "grad_norm": 0.33141717314720154, + "learning_rate": 1.3799903104185666e-07, + "loss": 0.1819, + "step": 7182 + }, + { + "epoch": 1.9113890367216606, + "grad_norm": 0.2809681296348572, + "learning_rate": 1.3798340882893876e-07, + "loss": 0.1957, + "step": 7183 + }, + { + "epoch": 1.9116551357104843, + "grad_norm": 0.4802674949169159, + "learning_rate": 1.3796778553265682e-07, + "loss": 0.1926, + "step": 7184 + }, + { + "epoch": 1.9119212346993082, + "grad_norm": 0.2856309413909912, + "learning_rate": 1.3795216115345647e-07, + "loss": 0.1647, + "step": 7185 + }, + { + "epoch": 1.912187333688132, + "grad_norm": 0.2635312080383301, + "learning_rate": 1.3793653569178338e-07, + "loss": 0.1665, + "step": 7186 + }, + { + "epoch": 1.9124534326769558, + "grad_norm": 0.32331758737564087, + "learning_rate": 1.3792090914808322e-07, + "loss": 0.2133, + "step": 7187 + }, + { + "epoch": 1.9127195316657797, + "grad_norm": 0.35327425599098206, + "learning_rate": 1.3790528152280166e-07, + "loss": 0.1941, + "step": 7188 + }, + { + "epoch": 1.9129856306546036, + "grad_norm": 0.2606704831123352, + "learning_rate": 1.3788965281638442e-07, + "loss": 0.1807, + "step": 7189 + }, + { + "epoch": 1.9132517296434273, + "grad_norm": 0.3628596365451813, + "learning_rate": 1.3787402302927728e-07, + "loss": 0.196, + "step": 7190 + }, + { + "epoch": 1.9135178286322512, + "grad_norm": 0.2814478874206543, + "learning_rate": 1.3785839216192608e-07, + "loss": 0.192, + "step": 7191 + }, + { + "epoch": 1.913783927621075, + "grad_norm": 0.3547958731651306, + "learning_rate": 1.3784276021477656e-07, + "loss": 0.2124, + "step": 7192 + }, + { + "epoch": 1.9140500266098988, + "grad_norm": 0.28159722685813904, + "learning_rate": 1.3782712718827466e-07, + "loss": 0.1927, + "step": 7193 + }, + { + "epoch": 1.9143161255987229, + "grad_norm": 0.33437809348106384, + "learning_rate": 1.3781149308286622e-07, + "loss": 0.1903, + "step": 7194 + }, + { + "epoch": 1.9145822245875466, + "grad_norm": 0.2912193536758423, + "learning_rate": 1.3779585789899716e-07, + "loss": 0.1813, + "step": 7195 + }, + { + "epoch": 1.9148483235763702, + "grad_norm": 0.26767730712890625, + "learning_rate": 1.377802216371134e-07, + "loss": 0.1883, + "step": 7196 + }, + { + "epoch": 1.9151144225651944, + "grad_norm": 0.3313637971878052, + "learning_rate": 1.3776458429766095e-07, + "loss": 0.2169, + "step": 7197 + }, + { + "epoch": 1.915380521554018, + "grad_norm": 0.28908786177635193, + "learning_rate": 1.3774894588108583e-07, + "loss": 0.199, + "step": 7198 + }, + { + "epoch": 1.915646620542842, + "grad_norm": 1.2421095371246338, + "learning_rate": 1.3773330638783408e-07, + "loss": 0.2001, + "step": 7199 + }, + { + "epoch": 1.9159127195316659, + "grad_norm": 0.286159485578537, + "learning_rate": 1.377176658183517e-07, + "loss": 0.2017, + "step": 7200 + }, + { + "epoch": 1.9161788185204895, + "grad_norm": 0.2505638897418976, + "learning_rate": 1.377020241730849e-07, + "loss": 0.1817, + "step": 7201 + }, + { + "epoch": 1.9164449175093135, + "grad_norm": 0.2846984267234802, + "learning_rate": 1.3768638145247972e-07, + "loss": 0.1824, + "step": 7202 + }, + { + "epoch": 1.9167110164981374, + "grad_norm": 0.3729579448699951, + "learning_rate": 1.3767073765698238e-07, + "loss": 0.1761, + "step": 7203 + }, + { + "epoch": 1.916977115486961, + "grad_norm": 0.4076097011566162, + "learning_rate": 1.3765509278703905e-07, + "loss": 0.201, + "step": 7204 + }, + { + "epoch": 1.917243214475785, + "grad_norm": 0.23078204691410065, + "learning_rate": 1.3763944684309595e-07, + "loss": 0.1574, + "step": 7205 + }, + { + "epoch": 1.9175093134646088, + "grad_norm": 0.26535600423812866, + "learning_rate": 1.3762379982559936e-07, + "loss": 0.1799, + "step": 7206 + }, + { + "epoch": 1.9177754124534325, + "grad_norm": 0.33882376551628113, + "learning_rate": 1.376081517349955e-07, + "loss": 0.2029, + "step": 7207 + }, + { + "epoch": 1.9180415114422567, + "grad_norm": 0.29505956172943115, + "learning_rate": 1.3759250257173075e-07, + "loss": 0.1919, + "step": 7208 + }, + { + "epoch": 1.9183076104310803, + "grad_norm": 0.29106244444847107, + "learning_rate": 1.3757685233625144e-07, + "loss": 0.1969, + "step": 7209 + }, + { + "epoch": 1.9185737094199042, + "grad_norm": 0.2940415143966675, + "learning_rate": 1.3756120102900393e-07, + "loss": 0.1857, + "step": 7210 + }, + { + "epoch": 1.9188398084087281, + "grad_norm": 0.2569175064563751, + "learning_rate": 1.3754554865043465e-07, + "loss": 0.1599, + "step": 7211 + }, + { + "epoch": 1.9191059073975518, + "grad_norm": 0.23594720661640167, + "learning_rate": 1.3752989520099004e-07, + "loss": 0.1555, + "step": 7212 + }, + { + "epoch": 1.9193720063863757, + "grad_norm": 0.2583211362361908, + "learning_rate": 1.3751424068111655e-07, + "loss": 0.1778, + "step": 7213 + }, + { + "epoch": 1.9196381053751996, + "grad_norm": 0.32446834444999695, + "learning_rate": 1.3749858509126064e-07, + "loss": 0.2025, + "step": 7214 + }, + { + "epoch": 1.9199042043640233, + "grad_norm": 0.2538609802722931, + "learning_rate": 1.3748292843186892e-07, + "loss": 0.1737, + "step": 7215 + }, + { + "epoch": 1.9201703033528472, + "grad_norm": 0.2719126343727112, + "learning_rate": 1.374672707033879e-07, + "loss": 0.1797, + "step": 7216 + }, + { + "epoch": 1.9204364023416711, + "grad_norm": 0.26930221915245056, + "learning_rate": 1.3745161190626419e-07, + "loss": 0.1756, + "step": 7217 + }, + { + "epoch": 1.9207025013304948, + "grad_norm": 0.30220749974250793, + "learning_rate": 1.374359520409444e-07, + "loss": 0.1843, + "step": 7218 + }, + { + "epoch": 1.920968600319319, + "grad_norm": 0.25195935368537903, + "learning_rate": 1.374202911078752e-07, + "loss": 0.1808, + "step": 7219 + }, + { + "epoch": 1.9212346993081426, + "grad_norm": 0.2574748694896698, + "learning_rate": 1.3740462910750323e-07, + "loss": 0.1887, + "step": 7220 + }, + { + "epoch": 1.9215007982969665, + "grad_norm": 0.28572702407836914, + "learning_rate": 1.3738896604027525e-07, + "loss": 0.1919, + "step": 7221 + }, + { + "epoch": 1.9217668972857904, + "grad_norm": 0.26198112964630127, + "learning_rate": 1.3737330190663797e-07, + "loss": 0.1862, + "step": 7222 + }, + { + "epoch": 1.922032996274614, + "grad_norm": 0.26000258326530457, + "learning_rate": 1.3735763670703816e-07, + "loss": 0.1765, + "step": 7223 + }, + { + "epoch": 1.922299095263438, + "grad_norm": 0.24478933215141296, + "learning_rate": 1.3734197044192267e-07, + "loss": 0.1604, + "step": 7224 + }, + { + "epoch": 1.922565194252262, + "grad_norm": 0.33937129378318787, + "learning_rate": 1.3732630311173824e-07, + "loss": 0.2089, + "step": 7225 + }, + { + "epoch": 1.9228312932410856, + "grad_norm": 0.2528870403766632, + "learning_rate": 1.3731063471693186e-07, + "loss": 0.1776, + "step": 7226 + }, + { + "epoch": 1.9230973922299095, + "grad_norm": 0.5407139658927917, + "learning_rate": 1.3729496525795035e-07, + "loss": 0.2037, + "step": 7227 + }, + { + "epoch": 1.9233634912187334, + "grad_norm": 0.3214116096496582, + "learning_rate": 1.3727929473524061e-07, + "loss": 0.1932, + "step": 7228 + }, + { + "epoch": 1.923629590207557, + "grad_norm": 0.4577626883983612, + "learning_rate": 1.3726362314924965e-07, + "loss": 0.2002, + "step": 7229 + }, + { + "epoch": 1.9238956891963812, + "grad_norm": 0.28645703196525574, + "learning_rate": 1.3724795050042443e-07, + "loss": 0.1889, + "step": 7230 + }, + { + "epoch": 1.924161788185205, + "grad_norm": 0.414087176322937, + "learning_rate": 1.3723227678921198e-07, + "loss": 0.1635, + "step": 7231 + }, + { + "epoch": 1.9244278871740286, + "grad_norm": 0.27442044019699097, + "learning_rate": 1.372166020160593e-07, + "loss": 0.1797, + "step": 7232 + }, + { + "epoch": 1.9246939861628527, + "grad_norm": 0.4431156814098358, + "learning_rate": 1.3720092618141355e-07, + "loss": 0.1795, + "step": 7233 + }, + { + "epoch": 1.9249600851516764, + "grad_norm": 0.396013468503952, + "learning_rate": 1.3718524928572176e-07, + "loss": 0.1953, + "step": 7234 + }, + { + "epoch": 1.9252261841405003, + "grad_norm": 0.26655125617980957, + "learning_rate": 1.3716957132943113e-07, + "loss": 0.186, + "step": 7235 + }, + { + "epoch": 1.9254922831293242, + "grad_norm": 0.2624537944793701, + "learning_rate": 1.3715389231298875e-07, + "loss": 0.1724, + "step": 7236 + }, + { + "epoch": 1.9257583821181479, + "grad_norm": 0.24968929588794708, + "learning_rate": 1.3713821223684187e-07, + "loss": 0.1844, + "step": 7237 + }, + { + "epoch": 1.9260244811069718, + "grad_norm": 0.44455721974372864, + "learning_rate": 1.3712253110143775e-07, + "loss": 0.1932, + "step": 7238 + }, + { + "epoch": 1.9262905800957957, + "grad_norm": 0.35079991817474365, + "learning_rate": 1.3710684890722357e-07, + "loss": 0.2011, + "step": 7239 + }, + { + "epoch": 1.9265566790846194, + "grad_norm": 0.41984397172927856, + "learning_rate": 1.3709116565464665e-07, + "loss": 0.1998, + "step": 7240 + }, + { + "epoch": 1.9268227780734435, + "grad_norm": 0.27187302708625793, + "learning_rate": 1.3707548134415433e-07, + "loss": 0.1904, + "step": 7241 + }, + { + "epoch": 1.9270888770622672, + "grad_norm": 0.27479103207588196, + "learning_rate": 1.370597959761939e-07, + "loss": 0.185, + "step": 7242 + }, + { + "epoch": 1.9273549760510909, + "grad_norm": 0.34072479605674744, + "learning_rate": 1.3704410955121285e-07, + "loss": 0.1981, + "step": 7243 + }, + { + "epoch": 1.927621075039915, + "grad_norm": 0.3696329891681671, + "learning_rate": 1.3702842206965848e-07, + "loss": 0.182, + "step": 7244 + }, + { + "epoch": 1.9278871740287387, + "grad_norm": 0.34480705857276917, + "learning_rate": 1.3701273353197827e-07, + "loss": 0.1953, + "step": 7245 + }, + { + "epoch": 1.9281532730175626, + "grad_norm": 0.3091569244861603, + "learning_rate": 1.369970439386197e-07, + "loss": 0.1929, + "step": 7246 + }, + { + "epoch": 1.9284193720063865, + "grad_norm": 0.38071107864379883, + "learning_rate": 1.3698135329003023e-07, + "loss": 0.2019, + "step": 7247 + }, + { + "epoch": 1.9286854709952101, + "grad_norm": 0.2745354473590851, + "learning_rate": 1.369656615866574e-07, + "loss": 0.1665, + "step": 7248 + }, + { + "epoch": 1.928951569984034, + "grad_norm": 0.3152831196784973, + "learning_rate": 1.3694996882894886e-07, + "loss": 0.1827, + "step": 7249 + }, + { + "epoch": 1.929217668972858, + "grad_norm": 0.3668093979358673, + "learning_rate": 1.3693427501735204e-07, + "loss": 0.2004, + "step": 7250 + }, + { + "epoch": 1.9294837679616816, + "grad_norm": 0.2685328423976898, + "learning_rate": 1.3691858015231468e-07, + "loss": 0.1802, + "step": 7251 + }, + { + "epoch": 1.9297498669505055, + "grad_norm": 0.26547709107398987, + "learning_rate": 1.3690288423428438e-07, + "loss": 0.1756, + "step": 7252 + }, + { + "epoch": 1.9300159659393294, + "grad_norm": 0.2433982938528061, + "learning_rate": 1.3688718726370882e-07, + "loss": 0.1658, + "step": 7253 + }, + { + "epoch": 1.9302820649281531, + "grad_norm": 0.2592124938964844, + "learning_rate": 1.3687148924103573e-07, + "loss": 0.1759, + "step": 7254 + }, + { + "epoch": 1.9305481639169773, + "grad_norm": 0.36879774928092957, + "learning_rate": 1.3685579016671286e-07, + "loss": 0.2072, + "step": 7255 + }, + { + "epoch": 1.930814262905801, + "grad_norm": 0.32547152042388916, + "learning_rate": 1.3684009004118797e-07, + "loss": 0.2084, + "step": 7256 + }, + { + "epoch": 1.9310803618946248, + "grad_norm": 0.2885810136795044, + "learning_rate": 1.3682438886490882e-07, + "loss": 0.179, + "step": 7257 + }, + { + "epoch": 1.9313464608834487, + "grad_norm": 0.4151362180709839, + "learning_rate": 1.3680868663832328e-07, + "loss": 0.1603, + "step": 7258 + }, + { + "epoch": 1.9316125598722724, + "grad_norm": 0.35503101348876953, + "learning_rate": 1.367929833618792e-07, + "loss": 0.228, + "step": 7259 + }, + { + "epoch": 1.9318786588610963, + "grad_norm": 0.336296409368515, + "learning_rate": 1.3677727903602448e-07, + "loss": 0.1797, + "step": 7260 + }, + { + "epoch": 1.9321447578499202, + "grad_norm": 0.2764435112476349, + "learning_rate": 1.3676157366120697e-07, + "loss": 0.1822, + "step": 7261 + }, + { + "epoch": 1.932410856838744, + "grad_norm": 0.27899178862571716, + "learning_rate": 1.3674586723787474e-07, + "loss": 0.2036, + "step": 7262 + }, + { + "epoch": 1.9326769558275678, + "grad_norm": 0.26200342178344727, + "learning_rate": 1.367301597664757e-07, + "loss": 0.1884, + "step": 7263 + }, + { + "epoch": 1.9329430548163917, + "grad_norm": 0.32066991925239563, + "learning_rate": 1.3671445124745783e-07, + "loss": 0.1931, + "step": 7264 + }, + { + "epoch": 1.9332091538052154, + "grad_norm": 0.30293771624565125, + "learning_rate": 1.366987416812692e-07, + "loss": 0.1893, + "step": 7265 + }, + { + "epoch": 1.9334752527940395, + "grad_norm": 0.42818549275398254, + "learning_rate": 1.3668303106835793e-07, + "loss": 0.2184, + "step": 7266 + }, + { + "epoch": 1.9337413517828632, + "grad_norm": 0.3056122362613678, + "learning_rate": 1.3666731940917203e-07, + "loss": 0.1873, + "step": 7267 + }, + { + "epoch": 1.9340074507716871, + "grad_norm": 0.26730287075042725, + "learning_rate": 1.3665160670415966e-07, + "loss": 0.1997, + "step": 7268 + }, + { + "epoch": 1.934273549760511, + "grad_norm": 0.3647919297218323, + "learning_rate": 1.3663589295376904e-07, + "loss": 0.1866, + "step": 7269 + }, + { + "epoch": 1.9345396487493347, + "grad_norm": 0.2867494225502014, + "learning_rate": 1.3662017815844823e-07, + "loss": 0.1839, + "step": 7270 + }, + { + "epoch": 1.9348057477381586, + "grad_norm": 0.2879403233528137, + "learning_rate": 1.3660446231864557e-07, + "loss": 0.1985, + "step": 7271 + }, + { + "epoch": 1.9350718467269825, + "grad_norm": 0.3885837495326996, + "learning_rate": 1.3658874543480928e-07, + "loss": 0.2043, + "step": 7272 + }, + { + "epoch": 1.9353379457158062, + "grad_norm": 0.4940871298313141, + "learning_rate": 1.3657302750738757e-07, + "loss": 0.205, + "step": 7273 + }, + { + "epoch": 1.93560404470463, + "grad_norm": 0.28900325298309326, + "learning_rate": 1.3655730853682884e-07, + "loss": 0.1698, + "step": 7274 + }, + { + "epoch": 1.935870143693454, + "grad_norm": 0.26527658104896545, + "learning_rate": 1.3654158852358134e-07, + "loss": 0.1812, + "step": 7275 + }, + { + "epoch": 1.9361362426822777, + "grad_norm": 0.2771199345588684, + "learning_rate": 1.3652586746809352e-07, + "loss": 0.1898, + "step": 7276 + }, + { + "epoch": 1.9364023416711018, + "grad_norm": 0.2724747657775879, + "learning_rate": 1.365101453708137e-07, + "loss": 0.1843, + "step": 7277 + }, + { + "epoch": 1.9366684406599255, + "grad_norm": 0.25465837121009827, + "learning_rate": 1.364944222321904e-07, + "loss": 0.177, + "step": 7278 + }, + { + "epoch": 1.9369345396487492, + "grad_norm": 0.23847495019435883, + "learning_rate": 1.3647869805267198e-07, + "loss": 0.162, + "step": 7279 + }, + { + "epoch": 1.9372006386375733, + "grad_norm": 0.26157087087631226, + "learning_rate": 1.3646297283270695e-07, + "loss": 0.1895, + "step": 7280 + }, + { + "epoch": 1.937466737626397, + "grad_norm": 0.26027703285217285, + "learning_rate": 1.3644724657274385e-07, + "loss": 0.1793, + "step": 7281 + }, + { + "epoch": 1.9377328366152209, + "grad_norm": 0.2913430631160736, + "learning_rate": 1.3643151927323122e-07, + "loss": 0.1844, + "step": 7282 + }, + { + "epoch": 1.9379989356040448, + "grad_norm": 0.26844489574432373, + "learning_rate": 1.3641579093461758e-07, + "loss": 0.169, + "step": 7283 + }, + { + "epoch": 1.9382650345928685, + "grad_norm": 0.25779959559440613, + "learning_rate": 1.3640006155735164e-07, + "loss": 0.182, + "step": 7284 + }, + { + "epoch": 1.9385311335816924, + "grad_norm": 0.3533742427825928, + "learning_rate": 1.3638433114188196e-07, + "loss": 0.2007, + "step": 7285 + }, + { + "epoch": 1.9387972325705163, + "grad_norm": 0.308481365442276, + "learning_rate": 1.363685996886572e-07, + "loss": 0.198, + "step": 7286 + }, + { + "epoch": 1.93906333155934, + "grad_norm": 0.3506528437137604, + "learning_rate": 1.363528671981261e-07, + "loss": 0.2064, + "step": 7287 + }, + { + "epoch": 1.9393294305481639, + "grad_norm": 0.3561708927154541, + "learning_rate": 1.3633713367073735e-07, + "loss": 0.207, + "step": 7288 + }, + { + "epoch": 1.9395955295369878, + "grad_norm": 0.4611193537712097, + "learning_rate": 1.363213991069397e-07, + "loss": 0.2169, + "step": 7289 + }, + { + "epoch": 1.9398616285258115, + "grad_norm": 0.26055365800857544, + "learning_rate": 1.3630566350718197e-07, + "loss": 0.1808, + "step": 7290 + }, + { + "epoch": 1.9401277275146356, + "grad_norm": 0.26582619547843933, + "learning_rate": 1.3628992687191288e-07, + "loss": 0.1887, + "step": 7291 + }, + { + "epoch": 1.9403938265034593, + "grad_norm": 0.3625509738922119, + "learning_rate": 1.3627418920158136e-07, + "loss": 0.1898, + "step": 7292 + }, + { + "epoch": 1.9406599254922832, + "grad_norm": 0.31806543469429016, + "learning_rate": 1.3625845049663627e-07, + "loss": 0.2067, + "step": 7293 + }, + { + "epoch": 1.940926024481107, + "grad_norm": 0.3153315484523773, + "learning_rate": 1.3624271075752644e-07, + "loss": 0.1906, + "step": 7294 + }, + { + "epoch": 1.9411921234699308, + "grad_norm": 0.3036922216415405, + "learning_rate": 1.362269699847009e-07, + "loss": 0.2011, + "step": 7295 + }, + { + "epoch": 1.9414582224587547, + "grad_norm": 0.2827955186367035, + "learning_rate": 1.3621122817860855e-07, + "loss": 0.1858, + "step": 7296 + }, + { + "epoch": 1.9417243214475786, + "grad_norm": 0.25607794523239136, + "learning_rate": 1.361954853396984e-07, + "loss": 0.1665, + "step": 7297 + }, + { + "epoch": 1.9419904204364022, + "grad_norm": 0.30349186062812805, + "learning_rate": 1.3617974146841942e-07, + "loss": 0.1882, + "step": 7298 + }, + { + "epoch": 1.9422565194252261, + "grad_norm": 0.36164501309394836, + "learning_rate": 1.3616399656522073e-07, + "loss": 0.2047, + "step": 7299 + }, + { + "epoch": 1.94252261841405, + "grad_norm": 0.27841246128082275, + "learning_rate": 1.3614825063055135e-07, + "loss": 0.1915, + "step": 7300 + }, + { + "epoch": 1.9427887174028737, + "grad_norm": 0.27227911353111267, + "learning_rate": 1.3613250366486037e-07, + "loss": 0.1967, + "step": 7301 + }, + { + "epoch": 1.9430548163916979, + "grad_norm": 0.3136804401874542, + "learning_rate": 1.3611675566859702e-07, + "loss": 0.2077, + "step": 7302 + }, + { + "epoch": 1.9433209153805215, + "grad_norm": 0.33561044931411743, + "learning_rate": 1.3610100664221038e-07, + "loss": 0.1772, + "step": 7303 + }, + { + "epoch": 1.9435870143693454, + "grad_norm": 0.32548925280570984, + "learning_rate": 1.3608525658614968e-07, + "loss": 0.1864, + "step": 7304 + }, + { + "epoch": 1.9438531133581693, + "grad_norm": 0.268410861492157, + "learning_rate": 1.360695055008641e-07, + "loss": 0.1724, + "step": 7305 + }, + { + "epoch": 1.944119212346993, + "grad_norm": 0.38056886196136475, + "learning_rate": 1.3605375338680295e-07, + "loss": 0.2105, + "step": 7306 + }, + { + "epoch": 1.944385311335817, + "grad_norm": 0.39507564902305603, + "learning_rate": 1.360380002444155e-07, + "loss": 0.1949, + "step": 7307 + }, + { + "epoch": 1.9446514103246408, + "grad_norm": 0.265150249004364, + "learning_rate": 1.3602224607415106e-07, + "loss": 0.1945, + "step": 7308 + }, + { + "epoch": 1.9449175093134645, + "grad_norm": 0.34945863485336304, + "learning_rate": 1.360064908764589e-07, + "loss": 0.2017, + "step": 7309 + }, + { + "epoch": 1.9451836083022884, + "grad_norm": 0.33939942717552185, + "learning_rate": 1.359907346517885e-07, + "loss": 0.2053, + "step": 7310 + }, + { + "epoch": 1.9454497072911123, + "grad_norm": 0.25501540303230286, + "learning_rate": 1.3597497740058922e-07, + "loss": 0.18, + "step": 7311 + }, + { + "epoch": 1.945715806279936, + "grad_norm": 0.3240993320941925, + "learning_rate": 1.3595921912331045e-07, + "loss": 0.1752, + "step": 7312 + }, + { + "epoch": 1.9459819052687601, + "grad_norm": 0.35455265641212463, + "learning_rate": 1.359434598204017e-07, + "loss": 0.1825, + "step": 7313 + }, + { + "epoch": 1.9462480042575838, + "grad_norm": 0.27864089608192444, + "learning_rate": 1.3592769949231243e-07, + "loss": 0.1957, + "step": 7314 + }, + { + "epoch": 1.9465141032464075, + "grad_norm": 0.29848363995552063, + "learning_rate": 1.3591193813949216e-07, + "loss": 0.1796, + "step": 7315 + }, + { + "epoch": 1.9467802022352316, + "grad_norm": 0.26774758100509644, + "learning_rate": 1.3589617576239044e-07, + "loss": 0.1882, + "step": 7316 + }, + { + "epoch": 1.9470463012240553, + "grad_norm": 0.2832013666629791, + "learning_rate": 1.3588041236145681e-07, + "loss": 0.1998, + "step": 7317 + }, + { + "epoch": 1.9473124002128792, + "grad_norm": 0.2843156158924103, + "learning_rate": 1.3586464793714094e-07, + "loss": 0.1871, + "step": 7318 + }, + { + "epoch": 1.9475784992017031, + "grad_norm": 0.3446902930736542, + "learning_rate": 1.3584888248989242e-07, + "loss": 0.1749, + "step": 7319 + }, + { + "epoch": 1.9478445981905268, + "grad_norm": 0.26219767332077026, + "learning_rate": 1.3583311602016094e-07, + "loss": 0.1945, + "step": 7320 + }, + { + "epoch": 1.9481106971793507, + "grad_norm": 0.2863503098487854, + "learning_rate": 1.3581734852839617e-07, + "loss": 0.1979, + "step": 7321 + }, + { + "epoch": 1.9483767961681746, + "grad_norm": 0.3368605971336365, + "learning_rate": 1.3580158001504786e-07, + "loss": 0.1826, + "step": 7322 + }, + { + "epoch": 1.9486428951569983, + "grad_norm": 0.2709243893623352, + "learning_rate": 1.3578581048056567e-07, + "loss": 0.1758, + "step": 7323 + }, + { + "epoch": 1.9489089941458222, + "grad_norm": 0.4769154191017151, + "learning_rate": 1.3577003992539947e-07, + "loss": 0.1705, + "step": 7324 + }, + { + "epoch": 1.949175093134646, + "grad_norm": 0.2775844931602478, + "learning_rate": 1.3575426834999905e-07, + "loss": 0.1859, + "step": 7325 + }, + { + "epoch": 1.9494411921234698, + "grad_norm": 0.2933977246284485, + "learning_rate": 1.3573849575481426e-07, + "loss": 0.1862, + "step": 7326 + }, + { + "epoch": 1.949707291112294, + "grad_norm": 0.25860530138015747, + "learning_rate": 1.3572272214029492e-07, + "loss": 0.1725, + "step": 7327 + }, + { + "epoch": 1.9499733901011176, + "grad_norm": 0.25155237317085266, + "learning_rate": 1.3570694750689096e-07, + "loss": 0.1647, + "step": 7328 + }, + { + "epoch": 1.9502394890899415, + "grad_norm": 0.25446105003356934, + "learning_rate": 1.3569117185505232e-07, + "loss": 0.1949, + "step": 7329 + }, + { + "epoch": 1.9505055880787654, + "grad_norm": 0.27869507670402527, + "learning_rate": 1.3567539518522888e-07, + "loss": 0.1665, + "step": 7330 + }, + { + "epoch": 1.950771687067589, + "grad_norm": 0.2881203591823578, + "learning_rate": 1.3565961749787072e-07, + "loss": 0.1676, + "step": 7331 + }, + { + "epoch": 1.951037786056413, + "grad_norm": 0.462383508682251, + "learning_rate": 1.356438387934278e-07, + "loss": 0.193, + "step": 7332 + }, + { + "epoch": 1.9513038850452369, + "grad_norm": 0.2610248327255249, + "learning_rate": 1.3562805907235014e-07, + "loss": 0.1731, + "step": 7333 + }, + { + "epoch": 1.9515699840340606, + "grad_norm": 0.31139448285102844, + "learning_rate": 1.3561227833508783e-07, + "loss": 0.1761, + "step": 7334 + }, + { + "epoch": 1.9518360830228845, + "grad_norm": 0.34177732467651367, + "learning_rate": 1.35596496582091e-07, + "loss": 0.2195, + "step": 7335 + }, + { + "epoch": 1.9521021820117084, + "grad_norm": 0.277300089597702, + "learning_rate": 1.3558071381380977e-07, + "loss": 0.1811, + "step": 7336 + }, + { + "epoch": 1.952368281000532, + "grad_norm": 0.2544694244861603, + "learning_rate": 1.3556493003069424e-07, + "loss": 0.1875, + "step": 7337 + }, + { + "epoch": 1.9526343799893562, + "grad_norm": 0.28408586978912354, + "learning_rate": 1.3554914523319467e-07, + "loss": 0.1779, + "step": 7338 + }, + { + "epoch": 1.9529004789781799, + "grad_norm": 0.26741987466812134, + "learning_rate": 1.3553335942176124e-07, + "loss": 0.1919, + "step": 7339 + }, + { + "epoch": 1.9531665779670038, + "grad_norm": 0.37930673360824585, + "learning_rate": 1.3551757259684416e-07, + "loss": 0.1938, + "step": 7340 + }, + { + "epoch": 1.9534326769558277, + "grad_norm": 0.3350761830806732, + "learning_rate": 1.3550178475889374e-07, + "loss": 0.2009, + "step": 7341 + }, + { + "epoch": 1.9536987759446514, + "grad_norm": 0.38335829973220825, + "learning_rate": 1.3548599590836028e-07, + "loss": 0.1971, + "step": 7342 + }, + { + "epoch": 1.9539648749334753, + "grad_norm": 0.28312036395072937, + "learning_rate": 1.3547020604569408e-07, + "loss": 0.1851, + "step": 7343 + }, + { + "epoch": 1.9542309739222992, + "grad_norm": 0.2724173069000244, + "learning_rate": 1.3545441517134557e-07, + "loss": 0.1628, + "step": 7344 + }, + { + "epoch": 1.9544970729111228, + "grad_norm": 0.4435981512069702, + "learning_rate": 1.3543862328576505e-07, + "loss": 0.171, + "step": 7345 + }, + { + "epoch": 1.9547631718999467, + "grad_norm": 0.35420113801956177, + "learning_rate": 1.3542283038940302e-07, + "loss": 0.2136, + "step": 7346 + }, + { + "epoch": 1.9550292708887707, + "grad_norm": 0.3709622323513031, + "learning_rate": 1.3540703648270988e-07, + "loss": 0.1735, + "step": 7347 + }, + { + "epoch": 1.9552953698775943, + "grad_norm": 0.28331905603408813, + "learning_rate": 1.353912415661361e-07, + "loss": 0.201, + "step": 7348 + }, + { + "epoch": 1.9555614688664185, + "grad_norm": 0.3939228057861328, + "learning_rate": 1.3537544564013216e-07, + "loss": 0.2055, + "step": 7349 + }, + { + "epoch": 1.9558275678552421, + "grad_norm": 0.327096164226532, + "learning_rate": 1.3535964870514866e-07, + "loss": 0.2034, + "step": 7350 + }, + { + "epoch": 1.956093666844066, + "grad_norm": 0.293702632188797, + "learning_rate": 1.3534385076163613e-07, + "loss": 0.1895, + "step": 7351 + }, + { + "epoch": 1.95635976583289, + "grad_norm": 0.30165308713912964, + "learning_rate": 1.3532805181004512e-07, + "loss": 0.1841, + "step": 7352 + }, + { + "epoch": 1.9566258648217136, + "grad_norm": 0.2550816237926483, + "learning_rate": 1.353122518508263e-07, + "loss": 0.1796, + "step": 7353 + }, + { + "epoch": 1.9568919638105375, + "grad_norm": 0.36966127157211304, + "learning_rate": 1.352964508844303e-07, + "loss": 0.2091, + "step": 7354 + }, + { + "epoch": 1.9571580627993614, + "grad_norm": 0.2507651150226593, + "learning_rate": 1.3528064891130782e-07, + "loss": 0.1959, + "step": 7355 + }, + { + "epoch": 1.9574241617881851, + "grad_norm": 0.3572951853275299, + "learning_rate": 1.352648459319095e-07, + "loss": 0.1954, + "step": 7356 + }, + { + "epoch": 1.957690260777009, + "grad_norm": 0.27565810084342957, + "learning_rate": 1.3524904194668615e-07, + "loss": 0.1961, + "step": 7357 + }, + { + "epoch": 1.957956359765833, + "grad_norm": 0.25304436683654785, + "learning_rate": 1.3523323695608848e-07, + "loss": 0.1704, + "step": 7358 + }, + { + "epoch": 1.9582224587546566, + "grad_norm": 0.2699600160121918, + "learning_rate": 1.3521743096056727e-07, + "loss": 0.187, + "step": 7359 + }, + { + "epoch": 1.9584885577434807, + "grad_norm": 0.4002685844898224, + "learning_rate": 1.352016239605734e-07, + "loss": 0.1793, + "step": 7360 + }, + { + "epoch": 1.9587546567323044, + "grad_norm": 0.3491087555885315, + "learning_rate": 1.3518581595655766e-07, + "loss": 0.183, + "step": 7361 + }, + { + "epoch": 1.959020755721128, + "grad_norm": 0.2664613127708435, + "learning_rate": 1.3517000694897093e-07, + "loss": 0.1674, + "step": 7362 + }, + { + "epoch": 1.9592868547099522, + "grad_norm": 0.310219943523407, + "learning_rate": 1.3515419693826416e-07, + "loss": 0.2063, + "step": 7363 + }, + { + "epoch": 1.959552953698776, + "grad_norm": 0.4670039415359497, + "learning_rate": 1.3513838592488824e-07, + "loss": 0.1902, + "step": 7364 + }, + { + "epoch": 1.9598190526875998, + "grad_norm": 0.38388097286224365, + "learning_rate": 1.3512257390929418e-07, + "loss": 0.1944, + "step": 7365 + }, + { + "epoch": 1.9600851516764237, + "grad_norm": 0.3511154055595398, + "learning_rate": 1.3510676089193292e-07, + "loss": 0.2107, + "step": 7366 + }, + { + "epoch": 1.9603512506652474, + "grad_norm": 0.26284390687942505, + "learning_rate": 1.350909468732555e-07, + "loss": 0.183, + "step": 7367 + }, + { + "epoch": 1.9606173496540713, + "grad_norm": 0.2581244707107544, + "learning_rate": 1.3507513185371295e-07, + "loss": 0.1713, + "step": 7368 + }, + { + "epoch": 1.9608834486428952, + "grad_norm": 0.36607927083969116, + "learning_rate": 1.350593158337564e-07, + "loss": 0.1756, + "step": 7369 + }, + { + "epoch": 1.961149547631719, + "grad_norm": 0.30931922793388367, + "learning_rate": 1.3504349881383689e-07, + "loss": 0.2063, + "step": 7370 + }, + { + "epoch": 1.9614156466205428, + "grad_norm": 0.3705548942089081, + "learning_rate": 1.3502768079440559e-07, + "loss": 0.2166, + "step": 7371 + }, + { + "epoch": 1.9616817456093667, + "grad_norm": 0.2446276992559433, + "learning_rate": 1.3501186177591364e-07, + "loss": 0.1771, + "step": 7372 + }, + { + "epoch": 1.9619478445981904, + "grad_norm": 0.35499632358551025, + "learning_rate": 1.3499604175881229e-07, + "loss": 0.2044, + "step": 7373 + }, + { + "epoch": 1.9622139435870145, + "grad_norm": 0.5009173154830933, + "learning_rate": 1.3498022074355264e-07, + "loss": 0.2059, + "step": 7374 + }, + { + "epoch": 1.9624800425758382, + "grad_norm": 0.28720301389694214, + "learning_rate": 1.3496439873058607e-07, + "loss": 0.1977, + "step": 7375 + }, + { + "epoch": 1.962746141564662, + "grad_norm": 0.25597891211509705, + "learning_rate": 1.349485757203638e-07, + "loss": 0.1671, + "step": 7376 + }, + { + "epoch": 1.963012240553486, + "grad_norm": 0.39320069551467896, + "learning_rate": 1.3493275171333709e-07, + "loss": 0.2104, + "step": 7377 + }, + { + "epoch": 1.9632783395423097, + "grad_norm": 0.25279149413108826, + "learning_rate": 1.3491692670995733e-07, + "loss": 0.1876, + "step": 7378 + }, + { + "epoch": 1.9635444385311336, + "grad_norm": 0.3897497355937958, + "learning_rate": 1.3490110071067588e-07, + "loss": 0.1891, + "step": 7379 + }, + { + "epoch": 1.9638105375199575, + "grad_norm": 0.2775239646434784, + "learning_rate": 1.3488527371594408e-07, + "loss": 0.1861, + "step": 7380 + }, + { + "epoch": 1.9640766365087812, + "grad_norm": 0.3465319573879242, + "learning_rate": 1.3486944572621343e-07, + "loss": 0.1789, + "step": 7381 + }, + { + "epoch": 1.964342735497605, + "grad_norm": 0.3211875855922699, + "learning_rate": 1.348536167419353e-07, + "loss": 0.1977, + "step": 7382 + }, + { + "epoch": 1.964608834486429, + "grad_norm": 0.25892093777656555, + "learning_rate": 1.348377867635612e-07, + "loss": 0.1797, + "step": 7383 + }, + { + "epoch": 1.9648749334752527, + "grad_norm": 0.28667473793029785, + "learning_rate": 1.3482195579154266e-07, + "loss": 0.2041, + "step": 7384 + }, + { + "epoch": 1.9651410324640768, + "grad_norm": 0.28063538670539856, + "learning_rate": 1.348061238263311e-07, + "loss": 0.1922, + "step": 7385 + }, + { + "epoch": 1.9654071314529005, + "grad_norm": 0.2918645441532135, + "learning_rate": 1.3479029086837825e-07, + "loss": 0.1843, + "step": 7386 + }, + { + "epoch": 1.9656732304417244, + "grad_norm": 0.3959598243236542, + "learning_rate": 1.347744569181356e-07, + "loss": 0.1996, + "step": 7387 + }, + { + "epoch": 1.9659393294305483, + "grad_norm": 0.24263688921928406, + "learning_rate": 1.347586219760547e-07, + "loss": 0.1713, + "step": 7388 + }, + { + "epoch": 1.966205428419372, + "grad_norm": 0.26323801279067993, + "learning_rate": 1.347427860425873e-07, + "loss": 0.186, + "step": 7389 + }, + { + "epoch": 1.9664715274081959, + "grad_norm": 0.35377639532089233, + "learning_rate": 1.3472694911818506e-07, + "loss": 0.1992, + "step": 7390 + }, + { + "epoch": 1.9667376263970198, + "grad_norm": 0.33721494674682617, + "learning_rate": 1.3471111120329969e-07, + "loss": 0.1813, + "step": 7391 + }, + { + "epoch": 1.9670037253858434, + "grad_norm": 0.263610303401947, + "learning_rate": 1.3469527229838282e-07, + "loss": 0.1755, + "step": 7392 + }, + { + "epoch": 1.9672698243746674, + "grad_norm": 0.4517683982849121, + "learning_rate": 1.3467943240388632e-07, + "loss": 0.207, + "step": 7393 + }, + { + "epoch": 1.9675359233634913, + "grad_norm": 0.44031426310539246, + "learning_rate": 1.3466359152026195e-07, + "loss": 0.1832, + "step": 7394 + }, + { + "epoch": 1.967802022352315, + "grad_norm": 0.2713046967983246, + "learning_rate": 1.3464774964796148e-07, + "loss": 0.1949, + "step": 7395 + }, + { + "epoch": 1.968068121341139, + "grad_norm": 0.2754189372062683, + "learning_rate": 1.3463190678743678e-07, + "loss": 0.1824, + "step": 7396 + }, + { + "epoch": 1.9683342203299627, + "grad_norm": 0.26502594351768494, + "learning_rate": 1.3461606293913977e-07, + "loss": 0.1734, + "step": 7397 + }, + { + "epoch": 1.9686003193187864, + "grad_norm": 0.308465838432312, + "learning_rate": 1.3460021810352225e-07, + "loss": 0.1878, + "step": 7398 + }, + { + "epoch": 1.9688664183076106, + "grad_norm": 0.4340057969093323, + "learning_rate": 1.345843722810362e-07, + "loss": 0.2001, + "step": 7399 + }, + { + "epoch": 1.9691325172964342, + "grad_norm": 0.24923861026763916, + "learning_rate": 1.345685254721336e-07, + "loss": 0.1778, + "step": 7400 + }, + { + "epoch": 1.9693986162852581, + "grad_norm": 0.27302631735801697, + "learning_rate": 1.345526776772664e-07, + "loss": 0.1776, + "step": 7401 + }, + { + "epoch": 1.969664715274082, + "grad_norm": 0.3188101649284363, + "learning_rate": 1.3453682889688663e-07, + "loss": 0.1891, + "step": 7402 + }, + { + "epoch": 1.9699308142629057, + "grad_norm": 0.3420531451702118, + "learning_rate": 1.345209791314463e-07, + "loss": 0.2073, + "step": 7403 + }, + { + "epoch": 1.9701969132517296, + "grad_norm": 0.4011087119579315, + "learning_rate": 1.3450512838139748e-07, + "loss": 0.1982, + "step": 7404 + }, + { + "epoch": 1.9704630122405535, + "grad_norm": 0.2570788860321045, + "learning_rate": 1.344892766471923e-07, + "loss": 0.1733, + "step": 7405 + }, + { + "epoch": 1.9707291112293772, + "grad_norm": 0.2561997175216675, + "learning_rate": 1.3447342392928288e-07, + "loss": 0.1739, + "step": 7406 + }, + { + "epoch": 1.9709952102182011, + "grad_norm": 0.27365580201148987, + "learning_rate": 1.3445757022812135e-07, + "loss": 0.1867, + "step": 7407 + }, + { + "epoch": 1.971261309207025, + "grad_norm": 0.3623085618019104, + "learning_rate": 1.3444171554415988e-07, + "loss": 0.2062, + "step": 7408 + }, + { + "epoch": 1.9715274081958487, + "grad_norm": 0.38600265979766846, + "learning_rate": 1.344258598778507e-07, + "loss": 0.1936, + "step": 7409 + }, + { + "epoch": 1.9717935071846728, + "grad_norm": 0.25753334164619446, + "learning_rate": 1.3441000322964602e-07, + "loss": 0.1661, + "step": 7410 + }, + { + "epoch": 1.9720596061734965, + "grad_norm": 0.31409960985183716, + "learning_rate": 1.3439414559999815e-07, + "loss": 0.1904, + "step": 7411 + }, + { + "epoch": 1.9723257051623204, + "grad_norm": 0.27618032693862915, + "learning_rate": 1.3437828698935936e-07, + "loss": 0.1909, + "step": 7412 + }, + { + "epoch": 1.9725918041511443, + "grad_norm": 0.27345988154411316, + "learning_rate": 1.3436242739818197e-07, + "loss": 0.1722, + "step": 7413 + }, + { + "epoch": 1.972857903139968, + "grad_norm": 0.2535339891910553, + "learning_rate": 1.343465668269183e-07, + "loss": 0.1838, + "step": 7414 + }, + { + "epoch": 1.973124002128792, + "grad_norm": 0.2858255207538605, + "learning_rate": 1.3433070527602075e-07, + "loss": 0.1706, + "step": 7415 + }, + { + "epoch": 1.9733901011176158, + "grad_norm": 0.3365011215209961, + "learning_rate": 1.3431484274594174e-07, + "loss": 0.1914, + "step": 7416 + }, + { + "epoch": 1.9736562001064395, + "grad_norm": 0.3069649338722229, + "learning_rate": 1.3429897923713366e-07, + "loss": 0.188, + "step": 7417 + }, + { + "epoch": 1.9739222990952634, + "grad_norm": 0.29061564803123474, + "learning_rate": 1.3428311475004897e-07, + "loss": 0.1989, + "step": 7418 + }, + { + "epoch": 1.9741883980840873, + "grad_norm": 0.3559923470020294, + "learning_rate": 1.3426724928514025e-07, + "loss": 0.1852, + "step": 7419 + }, + { + "epoch": 1.974454497072911, + "grad_norm": 0.31973162293434143, + "learning_rate": 1.3425138284285992e-07, + "loss": 0.1797, + "step": 7420 + }, + { + "epoch": 1.9747205960617351, + "grad_norm": 0.2949444353580475, + "learning_rate": 1.3423551542366052e-07, + "loss": 0.1855, + "step": 7421 + }, + { + "epoch": 1.9749866950505588, + "grad_norm": 0.36385053396224976, + "learning_rate": 1.3421964702799466e-07, + "loss": 0.1873, + "step": 7422 + }, + { + "epoch": 1.9752527940393827, + "grad_norm": 0.30308350920677185, + "learning_rate": 1.3420377765631494e-07, + "loss": 0.168, + "step": 7423 + }, + { + "epoch": 1.9755188930282066, + "grad_norm": 0.4498383700847626, + "learning_rate": 1.34187907309074e-07, + "loss": 0.1852, + "step": 7424 + }, + { + "epoch": 1.9757849920170303, + "grad_norm": 0.37428995966911316, + "learning_rate": 1.3417203598672442e-07, + "loss": 0.2126, + "step": 7425 + }, + { + "epoch": 1.9760510910058542, + "grad_norm": 0.27312660217285156, + "learning_rate": 1.3415616368971895e-07, + "loss": 0.1903, + "step": 7426 + }, + { + "epoch": 1.976317189994678, + "grad_norm": 0.33455023169517517, + "learning_rate": 1.3414029041851033e-07, + "loss": 0.2032, + "step": 7427 + }, + { + "epoch": 1.9765832889835018, + "grad_norm": 0.38131603598594666, + "learning_rate": 1.341244161735512e-07, + "loss": 0.188, + "step": 7428 + }, + { + "epoch": 1.9768493879723257, + "grad_norm": 0.2942029535770416, + "learning_rate": 1.3410854095529437e-07, + "loss": 0.1845, + "step": 7429 + }, + { + "epoch": 1.9771154869611496, + "grad_norm": 0.25387126207351685, + "learning_rate": 1.3409266476419268e-07, + "loss": 0.1744, + "step": 7430 + }, + { + "epoch": 1.9773815859499733, + "grad_norm": 0.27199167013168335, + "learning_rate": 1.340767876006989e-07, + "loss": 0.1903, + "step": 7431 + }, + { + "epoch": 1.9776476849387974, + "grad_norm": 0.2498457133769989, + "learning_rate": 1.3406090946526587e-07, + "loss": 0.1827, + "step": 7432 + }, + { + "epoch": 1.977913783927621, + "grad_norm": 0.2784152030944824, + "learning_rate": 1.3404503035834648e-07, + "loss": 0.1807, + "step": 7433 + }, + { + "epoch": 1.9781798829164448, + "grad_norm": 0.25885337591171265, + "learning_rate": 1.3402915028039366e-07, + "loss": 0.1957, + "step": 7434 + }, + { + "epoch": 1.9784459819052689, + "grad_norm": 0.2732153534889221, + "learning_rate": 1.3401326923186033e-07, + "loss": 0.1882, + "step": 7435 + }, + { + "epoch": 1.9787120808940926, + "grad_norm": 0.38183459639549255, + "learning_rate": 1.3399738721319942e-07, + "loss": 0.1657, + "step": 7436 + }, + { + "epoch": 1.9789781798829165, + "grad_norm": 0.35440459847450256, + "learning_rate": 1.3398150422486397e-07, + "loss": 0.1842, + "step": 7437 + }, + { + "epoch": 1.9792442788717404, + "grad_norm": 0.35762181878089905, + "learning_rate": 1.3396562026730694e-07, + "loss": 0.1842, + "step": 7438 + }, + { + "epoch": 1.979510377860564, + "grad_norm": 0.26219335198402405, + "learning_rate": 1.3394973534098142e-07, + "loss": 0.172, + "step": 7439 + }, + { + "epoch": 1.979776476849388, + "grad_norm": 0.28100430965423584, + "learning_rate": 1.3393384944634046e-07, + "loss": 0.1857, + "step": 7440 + }, + { + "epoch": 1.9800425758382119, + "grad_norm": 0.3130834698677063, + "learning_rate": 1.3391796258383714e-07, + "loss": 0.1837, + "step": 7441 + }, + { + "epoch": 1.9803086748270355, + "grad_norm": 0.3612997233867645, + "learning_rate": 1.339020747539246e-07, + "loss": 0.1857, + "step": 7442 + }, + { + "epoch": 1.9805747738158594, + "grad_norm": 0.3470420837402344, + "learning_rate": 1.33886185957056e-07, + "loss": 0.2023, + "step": 7443 + }, + { + "epoch": 1.9808408728046834, + "grad_norm": 0.34956157207489014, + "learning_rate": 1.3387029619368452e-07, + "loss": 0.1946, + "step": 7444 + }, + { + "epoch": 1.981106971793507, + "grad_norm": 0.2936836779117584, + "learning_rate": 1.3385440546426338e-07, + "loss": 0.197, + "step": 7445 + }, + { + "epoch": 1.9813730707823312, + "grad_norm": 0.2943107783794403, + "learning_rate": 1.3383851376924577e-07, + "loss": 0.1917, + "step": 7446 + }, + { + "epoch": 1.9816391697711548, + "grad_norm": 0.28170886635780334, + "learning_rate": 1.3382262110908503e-07, + "loss": 0.1809, + "step": 7447 + }, + { + "epoch": 1.9819052687599787, + "grad_norm": 0.34075552225112915, + "learning_rate": 1.3380672748423435e-07, + "loss": 0.2073, + "step": 7448 + }, + { + "epoch": 1.9821713677488026, + "grad_norm": 0.2794908285140991, + "learning_rate": 1.3379083289514715e-07, + "loss": 0.1865, + "step": 7449 + }, + { + "epoch": 1.9824374667376263, + "grad_norm": 0.4466099739074707, + "learning_rate": 1.337749373422767e-07, + "loss": 0.1857, + "step": 7450 + }, + { + "epoch": 1.9827035657264502, + "grad_norm": 0.24833571910858154, + "learning_rate": 1.3375904082607644e-07, + "loss": 0.1819, + "step": 7451 + }, + { + "epoch": 1.9829696647152741, + "grad_norm": 0.41975924372673035, + "learning_rate": 1.337431433469997e-07, + "loss": 0.2076, + "step": 7452 + }, + { + "epoch": 1.9832357637040978, + "grad_norm": 0.28234460949897766, + "learning_rate": 1.3372724490549993e-07, + "loss": 0.1838, + "step": 7453 + }, + { + "epoch": 1.9835018626929217, + "grad_norm": 0.35041332244873047, + "learning_rate": 1.337113455020306e-07, + "loss": 0.1794, + "step": 7454 + }, + { + "epoch": 1.9837679616817456, + "grad_norm": 0.2811439335346222, + "learning_rate": 1.336954451370452e-07, + "loss": 0.1721, + "step": 7455 + }, + { + "epoch": 1.9840340606705693, + "grad_norm": 0.40633732080459595, + "learning_rate": 1.3367954381099725e-07, + "loss": 0.1949, + "step": 7456 + }, + { + "epoch": 1.9843001596593934, + "grad_norm": 0.29563790559768677, + "learning_rate": 1.3366364152434025e-07, + "loss": 0.1746, + "step": 7457 + }, + { + "epoch": 1.9845662586482171, + "grad_norm": 0.36776185035705566, + "learning_rate": 1.3364773827752781e-07, + "loss": 0.229, + "step": 7458 + }, + { + "epoch": 1.984832357637041, + "grad_norm": 0.41916462779045105, + "learning_rate": 1.3363183407101352e-07, + "loss": 0.2014, + "step": 7459 + }, + { + "epoch": 1.985098456625865, + "grad_norm": 0.27108144760131836, + "learning_rate": 1.3361592890525092e-07, + "loss": 0.1795, + "step": 7460 + }, + { + "epoch": 1.9853645556146886, + "grad_norm": 0.35154643654823303, + "learning_rate": 1.3360002278069373e-07, + "loss": 0.2092, + "step": 7461 + }, + { + "epoch": 1.9856306546035125, + "grad_norm": 0.3831002116203308, + "learning_rate": 1.3358411569779565e-07, + "loss": 0.1885, + "step": 7462 + }, + { + "epoch": 1.9858967535923364, + "grad_norm": 0.2540339231491089, + "learning_rate": 1.3356820765701032e-07, + "loss": 0.155, + "step": 7463 + }, + { + "epoch": 1.98616285258116, + "grad_norm": 0.3495255708694458, + "learning_rate": 1.3355229865879147e-07, + "loss": 0.1908, + "step": 7464 + }, + { + "epoch": 1.986428951569984, + "grad_norm": 0.26917770504951477, + "learning_rate": 1.335363887035929e-07, + "loss": 0.181, + "step": 7465 + }, + { + "epoch": 1.986695050558808, + "grad_norm": 0.3832339942455292, + "learning_rate": 1.335204777918684e-07, + "loss": 0.1858, + "step": 7466 + }, + { + "epoch": 1.9869611495476316, + "grad_norm": 0.24061691761016846, + "learning_rate": 1.3350456592407175e-07, + "loss": 0.1471, + "step": 7467 + }, + { + "epoch": 1.9872272485364557, + "grad_norm": 0.34533336758613586, + "learning_rate": 1.3348865310065676e-07, + "loss": 0.1942, + "step": 7468 + }, + { + "epoch": 1.9874933475252794, + "grad_norm": 0.2680704891681671, + "learning_rate": 1.3347273932207734e-07, + "loss": 0.1706, + "step": 7469 + }, + { + "epoch": 1.9877594465141033, + "grad_norm": 0.30539292097091675, + "learning_rate": 1.334568245887874e-07, + "loss": 0.1881, + "step": 7470 + }, + { + "epoch": 1.9880255455029272, + "grad_norm": 0.2804717719554901, + "learning_rate": 1.334409089012408e-07, + "loss": 0.1836, + "step": 7471 + }, + { + "epoch": 1.9882916444917509, + "grad_norm": 0.2795630097389221, + "learning_rate": 1.3342499225989154e-07, + "loss": 0.1953, + "step": 7472 + }, + { + "epoch": 1.9885577434805748, + "grad_norm": 0.2525864541530609, + "learning_rate": 1.3340907466519359e-07, + "loss": 0.1864, + "step": 7473 + }, + { + "epoch": 1.9888238424693987, + "grad_norm": 0.39518746733665466, + "learning_rate": 1.3339315611760093e-07, + "loss": 0.1908, + "step": 7474 + }, + { + "epoch": 1.9890899414582224, + "grad_norm": 0.27138566970825195, + "learning_rate": 1.3337723661756762e-07, + "loss": 0.1921, + "step": 7475 + }, + { + "epoch": 1.9893560404470463, + "grad_norm": 0.2521570920944214, + "learning_rate": 1.3336131616554765e-07, + "loss": 0.1774, + "step": 7476 + }, + { + "epoch": 1.9896221394358702, + "grad_norm": 0.30427348613739014, + "learning_rate": 1.333453947619952e-07, + "loss": 0.1808, + "step": 7477 + }, + { + "epoch": 1.9898882384246939, + "grad_norm": 0.3103210926055908, + "learning_rate": 1.333294724073643e-07, + "loss": 0.1899, + "step": 7478 + }, + { + "epoch": 1.990154337413518, + "grad_norm": 0.3688463866710663, + "learning_rate": 1.333135491021091e-07, + "loss": 0.1818, + "step": 7479 + }, + { + "epoch": 1.9904204364023417, + "grad_norm": 0.3308406472206116, + "learning_rate": 1.332976248466838e-07, + "loss": 0.2046, + "step": 7480 + }, + { + "epoch": 1.9906865353911654, + "grad_norm": 0.34486380219459534, + "learning_rate": 1.332816996415426e-07, + "loss": 0.1942, + "step": 7481 + }, + { + "epoch": 1.9909526343799895, + "grad_norm": 0.27067944407463074, + "learning_rate": 1.332657734871397e-07, + "loss": 0.1773, + "step": 7482 + }, + { + "epoch": 1.9912187333688132, + "grad_norm": 0.393408864736557, + "learning_rate": 1.3324984638392928e-07, + "loss": 0.2034, + "step": 7483 + }, + { + "epoch": 1.991484832357637, + "grad_norm": 0.25870248675346375, + "learning_rate": 1.3323391833236573e-07, + "loss": 0.1692, + "step": 7484 + }, + { + "epoch": 1.991750931346461, + "grad_norm": 0.28061190247535706, + "learning_rate": 1.3321798933290328e-07, + "loss": 0.1934, + "step": 7485 + }, + { + "epoch": 1.9920170303352847, + "grad_norm": 0.2642682194709778, + "learning_rate": 1.3320205938599626e-07, + "loss": 0.1925, + "step": 7486 + }, + { + "epoch": 1.9922831293241086, + "grad_norm": 0.2538844347000122, + "learning_rate": 1.3318612849209906e-07, + "loss": 0.1635, + "step": 7487 + }, + { + "epoch": 1.9925492283129325, + "grad_norm": 0.4164590537548065, + "learning_rate": 1.3317019665166603e-07, + "loss": 0.2053, + "step": 7488 + }, + { + "epoch": 1.9928153273017561, + "grad_norm": 0.35074207186698914, + "learning_rate": 1.331542638651516e-07, + "loss": 0.2014, + "step": 7489 + }, + { + "epoch": 1.99308142629058, + "grad_norm": 0.26817241311073303, + "learning_rate": 1.3313833013301013e-07, + "loss": 0.1794, + "step": 7490 + }, + { + "epoch": 1.993347525279404, + "grad_norm": 0.33667996525764465, + "learning_rate": 1.3312239545569618e-07, + "loss": 0.1913, + "step": 7491 + }, + { + "epoch": 1.9936136242682276, + "grad_norm": 0.24042153358459473, + "learning_rate": 1.3310645983366422e-07, + "loss": 0.1582, + "step": 7492 + }, + { + "epoch": 1.9938797232570518, + "grad_norm": 0.25866109132766724, + "learning_rate": 1.3309052326736876e-07, + "loss": 0.1811, + "step": 7493 + }, + { + "epoch": 1.9941458222458754, + "grad_norm": 0.48693379759788513, + "learning_rate": 1.3307458575726429e-07, + "loss": 0.1942, + "step": 7494 + }, + { + "epoch": 1.9944119212346993, + "grad_norm": 0.30196529626846313, + "learning_rate": 1.3305864730380546e-07, + "loss": 0.1946, + "step": 7495 + }, + { + "epoch": 1.9946780202235233, + "grad_norm": 0.2989368140697479, + "learning_rate": 1.330427079074468e-07, + "loss": 0.1809, + "step": 7496 + }, + { + "epoch": 1.994944119212347, + "grad_norm": 0.2892813980579376, + "learning_rate": 1.33026767568643e-07, + "loss": 0.1788, + "step": 7497 + }, + { + "epoch": 1.9952102182011708, + "grad_norm": 0.28868088126182556, + "learning_rate": 1.3301082628784867e-07, + "loss": 0.1891, + "step": 7498 + }, + { + "epoch": 1.9954763171899947, + "grad_norm": 0.346878319978714, + "learning_rate": 1.3299488406551846e-07, + "loss": 0.1721, + "step": 7499 + }, + { + "epoch": 1.9957424161788184, + "grad_norm": 0.247183695435524, + "learning_rate": 1.3297894090210715e-07, + "loss": 0.1717, + "step": 7500 + }, + { + "epoch": 1.9960085151676423, + "grad_norm": 0.3613927364349365, + "learning_rate": 1.329629967980694e-07, + "loss": 0.2134, + "step": 7501 + }, + { + "epoch": 1.9962746141564662, + "grad_norm": 0.41998347640037537, + "learning_rate": 1.3294705175386002e-07, + "loss": 0.171, + "step": 7502 + }, + { + "epoch": 1.99654071314529, + "grad_norm": 0.3529558777809143, + "learning_rate": 1.3293110576993373e-07, + "loss": 0.2176, + "step": 7503 + }, + { + "epoch": 1.996806812134114, + "grad_norm": 0.37803369760513306, + "learning_rate": 1.329151588467454e-07, + "loss": 0.1939, + "step": 7504 + }, + { + "epoch": 1.9970729111229377, + "grad_norm": 0.32625868916511536, + "learning_rate": 1.328992109847499e-07, + "loss": 0.2051, + "step": 7505 + }, + { + "epoch": 1.9973390101117616, + "grad_norm": 0.3137415647506714, + "learning_rate": 1.32883262184402e-07, + "loss": 0.2026, + "step": 7506 + }, + { + "epoch": 1.9976051091005855, + "grad_norm": 0.2860855460166931, + "learning_rate": 1.3286731244615665e-07, + "loss": 0.1946, + "step": 7507 + }, + { + "epoch": 1.9978712080894092, + "grad_norm": 0.27786684036254883, + "learning_rate": 1.3285136177046873e-07, + "loss": 0.1814, + "step": 7508 + }, + { + "epoch": 1.9981373070782331, + "grad_norm": 0.31623944640159607, + "learning_rate": 1.3283541015779325e-07, + "loss": 0.1972, + "step": 7509 + }, + { + "epoch": 1.998403406067057, + "grad_norm": 0.26320570707321167, + "learning_rate": 1.3281945760858516e-07, + "loss": 0.1726, + "step": 7510 + }, + { + "epoch": 1.9986695050558807, + "grad_norm": 0.29754289984703064, + "learning_rate": 1.3280350412329944e-07, + "loss": 0.1809, + "step": 7511 + }, + { + "epoch": 1.9989356040447046, + "grad_norm": 0.32361364364624023, + "learning_rate": 1.327875497023911e-07, + "loss": 0.1966, + "step": 7512 + }, + { + "epoch": 1.9992017030335285, + "grad_norm": 0.2829449474811554, + "learning_rate": 1.3277159434631525e-07, + "loss": 0.1801, + "step": 7513 + }, + { + "epoch": 1.9994678020223522, + "grad_norm": 0.39015552401542664, + "learning_rate": 1.3275563805552689e-07, + "loss": 0.2003, + "step": 7514 + }, + { + "epoch": 1.9997339010111763, + "grad_norm": 0.349304735660553, + "learning_rate": 1.327396808304812e-07, + "loss": 0.1902, + "step": 7515 + }, + { + "epoch": 2.0, + "grad_norm": 0.2857195734977722, + "learning_rate": 1.3272372267163327e-07, + "loss": 0.1768, + "step": 7516 + }, + { + "epoch": 2.0002660989888237, + "grad_norm": 0.2794027030467987, + "learning_rate": 1.3270776357943829e-07, + "loss": 0.1994, + "step": 7517 + }, + { + "epoch": 2.000532197977648, + "grad_norm": 0.29084622859954834, + "learning_rate": 1.3269180355435141e-07, + "loss": 0.1966, + "step": 7518 + }, + { + "epoch": 2.0007982969664715, + "grad_norm": 0.2950534522533417, + "learning_rate": 1.3267584259682786e-07, + "loss": 0.1776, + "step": 7519 + }, + { + "epoch": 2.001064395955295, + "grad_norm": 0.3548552095890045, + "learning_rate": 1.3265988070732287e-07, + "loss": 0.1986, + "step": 7520 + }, + { + "epoch": 2.0013304949441193, + "grad_norm": 0.31774991750717163, + "learning_rate": 1.3264391788629176e-07, + "loss": 0.1849, + "step": 7521 + }, + { + "epoch": 2.001596593932943, + "grad_norm": 0.2918813228607178, + "learning_rate": 1.3262795413418972e-07, + "loss": 0.1933, + "step": 7522 + }, + { + "epoch": 2.001862692921767, + "grad_norm": 0.2681015729904175, + "learning_rate": 1.3261198945147217e-07, + "loss": 0.1993, + "step": 7523 + }, + { + "epoch": 2.002128791910591, + "grad_norm": 0.33941978216171265, + "learning_rate": 1.325960238385944e-07, + "loss": 0.1799, + "step": 7524 + }, + { + "epoch": 2.0023948908994145, + "grad_norm": 0.2824164927005768, + "learning_rate": 1.3258005729601177e-07, + "loss": 0.1963, + "step": 7525 + }, + { + "epoch": 2.0026609898882386, + "grad_norm": 0.27127692103385925, + "learning_rate": 1.3256408982417968e-07, + "loss": 0.177, + "step": 7526 + }, + { + "epoch": 2.0029270888770623, + "grad_norm": 0.3175415098667145, + "learning_rate": 1.325481214235536e-07, + "loss": 0.1956, + "step": 7527 + }, + { + "epoch": 2.003193187865886, + "grad_norm": 0.26565372943878174, + "learning_rate": 1.3253215209458897e-07, + "loss": 0.1678, + "step": 7528 + }, + { + "epoch": 2.00345928685471, + "grad_norm": 0.4259481430053711, + "learning_rate": 1.3251618183774125e-07, + "loss": 0.2269, + "step": 7529 + }, + { + "epoch": 2.0037253858435338, + "grad_norm": 0.38323974609375, + "learning_rate": 1.3250021065346593e-07, + "loss": 0.1921, + "step": 7530 + }, + { + "epoch": 2.0039914848323575, + "grad_norm": 0.3723304271697998, + "learning_rate": 1.3248423854221859e-07, + "loss": 0.1833, + "step": 7531 + }, + { + "epoch": 2.0042575838211816, + "grad_norm": 0.2905561923980713, + "learning_rate": 1.3246826550445473e-07, + "loss": 0.2009, + "step": 7532 + }, + { + "epoch": 2.0045236828100053, + "grad_norm": 0.32938072085380554, + "learning_rate": 1.3245229154062997e-07, + "loss": 0.1855, + "step": 7533 + }, + { + "epoch": 2.0047897817988294, + "grad_norm": 0.3303757309913635, + "learning_rate": 1.324363166511999e-07, + "loss": 0.187, + "step": 7534 + }, + { + "epoch": 2.005055880787653, + "grad_norm": 0.2784595787525177, + "learning_rate": 1.3242034083662017e-07, + "loss": 0.1889, + "step": 7535 + }, + { + "epoch": 2.0053219797764767, + "grad_norm": 0.6727943420410156, + "learning_rate": 1.3240436409734645e-07, + "loss": 0.1904, + "step": 7536 + }, + { + "epoch": 2.005588078765301, + "grad_norm": 0.26899418234825134, + "learning_rate": 1.323883864338344e-07, + "loss": 0.1929, + "step": 7537 + }, + { + "epoch": 2.0058541777541246, + "grad_norm": 0.41993623971939087, + "learning_rate": 1.3237240784653975e-07, + "loss": 0.19, + "step": 7538 + }, + { + "epoch": 2.0061202767429482, + "grad_norm": 0.29769203066825867, + "learning_rate": 1.3235642833591827e-07, + "loss": 0.1747, + "step": 7539 + }, + { + "epoch": 2.0063863757317724, + "grad_norm": 0.5026395916938782, + "learning_rate": 1.3234044790242566e-07, + "loss": 0.2156, + "step": 7540 + }, + { + "epoch": 2.006652474720596, + "grad_norm": 0.3391337990760803, + "learning_rate": 1.323244665465178e-07, + "loss": 0.1906, + "step": 7541 + }, + { + "epoch": 2.0069185737094197, + "grad_norm": 0.2583642899990082, + "learning_rate": 1.3230848426865045e-07, + "loss": 0.1729, + "step": 7542 + }, + { + "epoch": 2.007184672698244, + "grad_norm": 0.3188518285751343, + "learning_rate": 1.3229250106927948e-07, + "loss": 0.1837, + "step": 7543 + }, + { + "epoch": 2.0074507716870675, + "grad_norm": 0.27230072021484375, + "learning_rate": 1.3227651694886074e-07, + "loss": 0.1731, + "step": 7544 + }, + { + "epoch": 2.007716870675891, + "grad_norm": 0.2737715244293213, + "learning_rate": 1.3226053190785017e-07, + "loss": 0.181, + "step": 7545 + }, + { + "epoch": 2.0079829696647153, + "grad_norm": 0.27834829688072205, + "learning_rate": 1.3224454594670365e-07, + "loss": 0.1901, + "step": 7546 + }, + { + "epoch": 2.008249068653539, + "grad_norm": 0.27296993136405945, + "learning_rate": 1.3222855906587717e-07, + "loss": 0.1906, + "step": 7547 + }, + { + "epoch": 2.008515167642363, + "grad_norm": 0.36830341815948486, + "learning_rate": 1.3221257126582668e-07, + "loss": 0.1767, + "step": 7548 + }, + { + "epoch": 2.008781266631187, + "grad_norm": 0.28320035338401794, + "learning_rate": 1.3219658254700823e-07, + "loss": 0.1752, + "step": 7549 + }, + { + "epoch": 2.0090473656200105, + "grad_norm": 0.28589335083961487, + "learning_rate": 1.321805929098778e-07, + "loss": 0.1968, + "step": 7550 + }, + { + "epoch": 2.0093134646088346, + "grad_norm": 0.2716760039329529, + "learning_rate": 1.3216460235489146e-07, + "loss": 0.1867, + "step": 7551 + }, + { + "epoch": 2.0095795635976583, + "grad_norm": 0.3368282616138458, + "learning_rate": 1.321486108825053e-07, + "loss": 0.1728, + "step": 7552 + }, + { + "epoch": 2.009845662586482, + "grad_norm": 0.30680936574935913, + "learning_rate": 1.3213261849317546e-07, + "loss": 0.1863, + "step": 7553 + }, + { + "epoch": 2.010111761575306, + "grad_norm": 0.27752119302749634, + "learning_rate": 1.32116625187358e-07, + "loss": 0.1812, + "step": 7554 + }, + { + "epoch": 2.01037786056413, + "grad_norm": 0.27906695008277893, + "learning_rate": 1.3210063096550915e-07, + "loss": 0.1758, + "step": 7555 + }, + { + "epoch": 2.0106439595529535, + "grad_norm": 0.2585432827472687, + "learning_rate": 1.3208463582808507e-07, + "loss": 0.1758, + "step": 7556 + }, + { + "epoch": 2.0109100585417776, + "grad_norm": 0.2644117772579193, + "learning_rate": 1.3206863977554197e-07, + "loss": 0.1833, + "step": 7557 + }, + { + "epoch": 2.0111761575306013, + "grad_norm": 0.2675779461860657, + "learning_rate": 1.320526428083361e-07, + "loss": 0.179, + "step": 7558 + }, + { + "epoch": 2.0114422565194254, + "grad_norm": 0.30524060130119324, + "learning_rate": 1.3203664492692373e-07, + "loss": 0.1982, + "step": 7559 + }, + { + "epoch": 2.011708355508249, + "grad_norm": 0.29245173931121826, + "learning_rate": 1.3202064613176114e-07, + "loss": 0.1839, + "step": 7560 + }, + { + "epoch": 2.011974454497073, + "grad_norm": 0.3651466369628906, + "learning_rate": 1.3200464642330467e-07, + "loss": 0.1936, + "step": 7561 + }, + { + "epoch": 2.012240553485897, + "grad_norm": 0.3643714487552643, + "learning_rate": 1.3198864580201062e-07, + "loss": 0.207, + "step": 7562 + }, + { + "epoch": 2.0125066524747206, + "grad_norm": 0.2787547707557678, + "learning_rate": 1.3197264426833537e-07, + "loss": 0.2042, + "step": 7563 + }, + { + "epoch": 2.0127727514635443, + "grad_norm": 0.420675128698349, + "learning_rate": 1.3195664182273535e-07, + "loss": 0.191, + "step": 7564 + }, + { + "epoch": 2.0130388504523684, + "grad_norm": 0.3836469352245331, + "learning_rate": 1.31940638465667e-07, + "loss": 0.2197, + "step": 7565 + }, + { + "epoch": 2.013304949441192, + "grad_norm": 0.3275909721851349, + "learning_rate": 1.319246341975867e-07, + "loss": 0.1955, + "step": 7566 + }, + { + "epoch": 2.0135710484300158, + "grad_norm": 0.2422790825366974, + "learning_rate": 1.3190862901895096e-07, + "loss": 0.1738, + "step": 7567 + }, + { + "epoch": 2.01383714741884, + "grad_norm": 0.2844204306602478, + "learning_rate": 1.3189262293021633e-07, + "loss": 0.1846, + "step": 7568 + }, + { + "epoch": 2.0141032464076636, + "grad_norm": 0.2534800171852112, + "learning_rate": 1.3187661593183923e-07, + "loss": 0.1907, + "step": 7569 + }, + { + "epoch": 2.0143693453964877, + "grad_norm": 0.27437546849250793, + "learning_rate": 1.3186060802427628e-07, + "loss": 0.193, + "step": 7570 + }, + { + "epoch": 2.0146354443853114, + "grad_norm": 0.3346426486968994, + "learning_rate": 1.3184459920798404e-07, + "loss": 0.1961, + "step": 7571 + }, + { + "epoch": 2.014901543374135, + "grad_norm": 0.26751288771629333, + "learning_rate": 1.3182858948341915e-07, + "loss": 0.1852, + "step": 7572 + }, + { + "epoch": 2.015167642362959, + "grad_norm": 0.26014870405197144, + "learning_rate": 1.3181257885103816e-07, + "loss": 0.1604, + "step": 7573 + }, + { + "epoch": 2.015433741351783, + "grad_norm": 0.266690194606781, + "learning_rate": 1.317965673112978e-07, + "loss": 0.1796, + "step": 7574 + }, + { + "epoch": 2.0156998403406066, + "grad_norm": 0.2550191879272461, + "learning_rate": 1.3178055486465473e-07, + "loss": 0.1632, + "step": 7575 + }, + { + "epoch": 2.0159659393294307, + "grad_norm": 0.29750561714172363, + "learning_rate": 1.3176454151156567e-07, + "loss": 0.2019, + "step": 7576 + }, + { + "epoch": 2.0162320383182544, + "grad_norm": 0.35628700256347656, + "learning_rate": 1.3174852725248732e-07, + "loss": 0.216, + "step": 7577 + }, + { + "epoch": 2.016498137307078, + "grad_norm": 0.2857828438282013, + "learning_rate": 1.3173251208787645e-07, + "loss": 0.1608, + "step": 7578 + }, + { + "epoch": 2.016764236295902, + "grad_norm": 0.2839861810207367, + "learning_rate": 1.3171649601818986e-07, + "loss": 0.1809, + "step": 7579 + }, + { + "epoch": 2.017030335284726, + "grad_norm": 0.2933330833911896, + "learning_rate": 1.3170047904388434e-07, + "loss": 0.1937, + "step": 7580 + }, + { + "epoch": 2.0172964342735495, + "grad_norm": 0.2719610035419464, + "learning_rate": 1.3168446116541673e-07, + "loss": 0.1925, + "step": 7581 + }, + { + "epoch": 2.0175625332623737, + "grad_norm": 0.3999505639076233, + "learning_rate": 1.3166844238324392e-07, + "loss": 0.1915, + "step": 7582 + }, + { + "epoch": 2.0178286322511974, + "grad_norm": 0.2564387321472168, + "learning_rate": 1.316524226978228e-07, + "loss": 0.1813, + "step": 7583 + }, + { + "epoch": 2.0180947312400215, + "grad_norm": 0.27330607175827026, + "learning_rate": 1.3163640210961022e-07, + "loss": 0.2008, + "step": 7584 + }, + { + "epoch": 2.018360830228845, + "grad_norm": 0.2799892723560333, + "learning_rate": 1.316203806190632e-07, + "loss": 0.1812, + "step": 7585 + }, + { + "epoch": 2.018626929217669, + "grad_norm": 0.2799573838710785, + "learning_rate": 1.3160435822663867e-07, + "loss": 0.1816, + "step": 7586 + }, + { + "epoch": 2.018893028206493, + "grad_norm": 0.27358826994895935, + "learning_rate": 1.3158833493279362e-07, + "loss": 0.1803, + "step": 7587 + }, + { + "epoch": 2.0191591271953166, + "grad_norm": 0.30945491790771484, + "learning_rate": 1.3157231073798505e-07, + "loss": 0.1909, + "step": 7588 + }, + { + "epoch": 2.0194252261841403, + "grad_norm": 0.31611424684524536, + "learning_rate": 1.3155628564267001e-07, + "loss": 0.1761, + "step": 7589 + }, + { + "epoch": 2.0196913251729645, + "grad_norm": 0.4080422818660736, + "learning_rate": 1.315402596473056e-07, + "loss": 0.184, + "step": 7590 + }, + { + "epoch": 2.019957424161788, + "grad_norm": 0.34991970658302307, + "learning_rate": 1.3152423275234888e-07, + "loss": 0.1783, + "step": 7591 + }, + { + "epoch": 2.020223523150612, + "grad_norm": 0.31871622800827026, + "learning_rate": 1.31508204958257e-07, + "loss": 0.1787, + "step": 7592 + }, + { + "epoch": 2.020489622139436, + "grad_norm": 0.43884512782096863, + "learning_rate": 1.3149217626548707e-07, + "loss": 0.1691, + "step": 7593 + }, + { + "epoch": 2.0207557211282596, + "grad_norm": 0.2684060037136078, + "learning_rate": 1.3147614667449629e-07, + "loss": 0.1828, + "step": 7594 + }, + { + "epoch": 2.0210218201170838, + "grad_norm": 0.2758670151233673, + "learning_rate": 1.3146011618574182e-07, + "loss": 0.1997, + "step": 7595 + }, + { + "epoch": 2.0212879191059074, + "grad_norm": 0.27786681056022644, + "learning_rate": 1.3144408479968093e-07, + "loss": 0.1805, + "step": 7596 + }, + { + "epoch": 2.021554018094731, + "grad_norm": 0.26403000950813293, + "learning_rate": 1.3142805251677084e-07, + "loss": 0.1862, + "step": 7597 + }, + { + "epoch": 2.0218201170835552, + "grad_norm": 0.348368376493454, + "learning_rate": 1.314120193374688e-07, + "loss": 0.1803, + "step": 7598 + }, + { + "epoch": 2.022086216072379, + "grad_norm": 0.2647024095058441, + "learning_rate": 1.3139598526223216e-07, + "loss": 0.1744, + "step": 7599 + }, + { + "epoch": 2.0223523150612026, + "grad_norm": 0.28442054986953735, + "learning_rate": 1.3137995029151823e-07, + "loss": 0.1875, + "step": 7600 + }, + { + "epoch": 2.0226184140500267, + "grad_norm": 0.3418618142604828, + "learning_rate": 1.3136391442578434e-07, + "loss": 0.1909, + "step": 7601 + }, + { + "epoch": 2.0228845130388504, + "grad_norm": 0.268399178981781, + "learning_rate": 1.3134787766548787e-07, + "loss": 0.1791, + "step": 7602 + }, + { + "epoch": 2.023150612027674, + "grad_norm": 0.33302393555641174, + "learning_rate": 1.3133184001108623e-07, + "loss": 0.183, + "step": 7603 + }, + { + "epoch": 2.0234167110164982, + "grad_norm": 0.262350469827652, + "learning_rate": 1.3131580146303684e-07, + "loss": 0.1824, + "step": 7604 + }, + { + "epoch": 2.023682810005322, + "grad_norm": 0.2798521816730499, + "learning_rate": 1.3129976202179718e-07, + "loss": 0.1873, + "step": 7605 + }, + { + "epoch": 2.023948908994146, + "grad_norm": 0.2520003616809845, + "learning_rate": 1.3128372168782465e-07, + "loss": 0.164, + "step": 7606 + }, + { + "epoch": 2.0242150079829697, + "grad_norm": 0.26352718472480774, + "learning_rate": 1.3126768046157688e-07, + "loss": 0.1775, + "step": 7607 + }, + { + "epoch": 2.0244811069717934, + "grad_norm": 0.310805082321167, + "learning_rate": 1.3125163834351127e-07, + "loss": 0.1871, + "step": 7608 + }, + { + "epoch": 2.0247472059606175, + "grad_norm": 0.3473651707172394, + "learning_rate": 1.3123559533408545e-07, + "loss": 0.2061, + "step": 7609 + }, + { + "epoch": 2.025013304949441, + "grad_norm": 0.27151039242744446, + "learning_rate": 1.3121955143375696e-07, + "loss": 0.1951, + "step": 7610 + }, + { + "epoch": 2.025279403938265, + "grad_norm": 0.26641130447387695, + "learning_rate": 1.3120350664298345e-07, + "loss": 0.1795, + "step": 7611 + }, + { + "epoch": 2.025545502927089, + "grad_norm": 0.2914122939109802, + "learning_rate": 1.3118746096222252e-07, + "loss": 0.1757, + "step": 7612 + }, + { + "epoch": 2.0258116019159127, + "grad_norm": 0.26857590675354004, + "learning_rate": 1.3117141439193178e-07, + "loss": 0.193, + "step": 7613 + }, + { + "epoch": 2.0260777009047364, + "grad_norm": 0.34218165278434753, + "learning_rate": 1.31155366932569e-07, + "loss": 0.1849, + "step": 7614 + }, + { + "epoch": 2.0263437998935605, + "grad_norm": 0.26599055528640747, + "learning_rate": 1.3113931858459184e-07, + "loss": 0.1873, + "step": 7615 + }, + { + "epoch": 2.026609898882384, + "grad_norm": 0.2867498993873596, + "learning_rate": 1.3112326934845802e-07, + "loss": 0.1849, + "step": 7616 + }, + { + "epoch": 2.0268759978712083, + "grad_norm": 0.3696919083595276, + "learning_rate": 1.3110721922462534e-07, + "loss": 0.1862, + "step": 7617 + }, + { + "epoch": 2.027142096860032, + "grad_norm": 0.33152148127555847, + "learning_rate": 1.310911682135515e-07, + "loss": 0.1786, + "step": 7618 + }, + { + "epoch": 2.0274081958488557, + "grad_norm": 0.3924887478351593, + "learning_rate": 1.3107511631569442e-07, + "loss": 0.1856, + "step": 7619 + }, + { + "epoch": 2.02767429483768, + "grad_norm": 0.26637229323387146, + "learning_rate": 1.3105906353151185e-07, + "loss": 0.1792, + "step": 7620 + }, + { + "epoch": 2.0279403938265035, + "grad_norm": 0.4327774941921234, + "learning_rate": 1.3104300986146167e-07, + "loss": 0.1677, + "step": 7621 + }, + { + "epoch": 2.028206492815327, + "grad_norm": 0.2848295271396637, + "learning_rate": 1.310269553060018e-07, + "loss": 0.1823, + "step": 7622 + }, + { + "epoch": 2.0284725918041513, + "grad_norm": 0.26378774642944336, + "learning_rate": 1.3101089986559007e-07, + "loss": 0.1813, + "step": 7623 + }, + { + "epoch": 2.028738690792975, + "grad_norm": 0.2957829535007477, + "learning_rate": 1.3099484354068448e-07, + "loss": 0.1862, + "step": 7624 + }, + { + "epoch": 2.0290047897817987, + "grad_norm": 0.3484317362308502, + "learning_rate": 1.3097878633174298e-07, + "loss": 0.1844, + "step": 7625 + }, + { + "epoch": 2.029270888770623, + "grad_norm": 0.2930407226085663, + "learning_rate": 1.3096272823922352e-07, + "loss": 0.1792, + "step": 7626 + }, + { + "epoch": 2.0295369877594465, + "grad_norm": 0.2786675691604614, + "learning_rate": 1.3094666926358416e-07, + "loss": 0.1625, + "step": 7627 + }, + { + "epoch": 2.02980308674827, + "grad_norm": 0.26878899335861206, + "learning_rate": 1.3093060940528285e-07, + "loss": 0.1788, + "step": 7628 + }, + { + "epoch": 2.0300691857370943, + "grad_norm": 0.30823224782943726, + "learning_rate": 1.3091454866477775e-07, + "loss": 0.1807, + "step": 7629 + }, + { + "epoch": 2.030335284725918, + "grad_norm": 0.38098645210266113, + "learning_rate": 1.308984870425269e-07, + "loss": 0.2112, + "step": 7630 + }, + { + "epoch": 2.030601383714742, + "grad_norm": 0.29336413741111755, + "learning_rate": 1.308824245389884e-07, + "loss": 0.1932, + "step": 7631 + }, + { + "epoch": 2.0308674827035658, + "grad_norm": 0.28198519349098206, + "learning_rate": 1.308663611546204e-07, + "loss": 0.2083, + "step": 7632 + }, + { + "epoch": 2.0311335816923894, + "grad_norm": 0.25764328241348267, + "learning_rate": 1.3085029688988106e-07, + "loss": 0.1601, + "step": 7633 + }, + { + "epoch": 2.0313996806812136, + "grad_norm": 0.5419142842292786, + "learning_rate": 1.3083423174522857e-07, + "loss": 0.237, + "step": 7634 + }, + { + "epoch": 2.0316657796700373, + "grad_norm": 0.3409779965877533, + "learning_rate": 1.308181657211211e-07, + "loss": 0.1808, + "step": 7635 + }, + { + "epoch": 2.031931878658861, + "grad_norm": 0.3135060966014862, + "learning_rate": 1.3080209881801693e-07, + "loss": 0.1785, + "step": 7636 + }, + { + "epoch": 2.032197977647685, + "grad_norm": 0.38016295433044434, + "learning_rate": 1.3078603103637432e-07, + "loss": 0.1906, + "step": 7637 + }, + { + "epoch": 2.0324640766365087, + "grad_norm": 0.2672610282897949, + "learning_rate": 1.3076996237665154e-07, + "loss": 0.1546, + "step": 7638 + }, + { + "epoch": 2.0327301756253324, + "grad_norm": 0.28728413581848145, + "learning_rate": 1.307538928393069e-07, + "loss": 0.1986, + "step": 7639 + }, + { + "epoch": 2.0329962746141566, + "grad_norm": 0.26674893498420715, + "learning_rate": 1.3073782242479875e-07, + "loss": 0.1828, + "step": 7640 + }, + { + "epoch": 2.0332623736029802, + "grad_norm": 0.26486828923225403, + "learning_rate": 1.3072175113358545e-07, + "loss": 0.1777, + "step": 7641 + }, + { + "epoch": 2.0335284725918044, + "grad_norm": 0.9301261901855469, + "learning_rate": 1.3070567896612533e-07, + "loss": 0.1938, + "step": 7642 + }, + { + "epoch": 2.033794571580628, + "grad_norm": 0.28605595231056213, + "learning_rate": 1.3068960592287692e-07, + "loss": 0.1832, + "step": 7643 + }, + { + "epoch": 2.0340606705694517, + "grad_norm": 0.3300149142742157, + "learning_rate": 1.3067353200429855e-07, + "loss": 0.1812, + "step": 7644 + }, + { + "epoch": 2.034326769558276, + "grad_norm": 0.2885955572128296, + "learning_rate": 1.3065745721084873e-07, + "loss": 0.1913, + "step": 7645 + }, + { + "epoch": 2.0345928685470995, + "grad_norm": 0.2720564901828766, + "learning_rate": 1.306413815429859e-07, + "loss": 0.1883, + "step": 7646 + }, + { + "epoch": 2.034858967535923, + "grad_norm": 0.3582986891269684, + "learning_rate": 1.3062530500116863e-07, + "loss": 0.1757, + "step": 7647 + }, + { + "epoch": 2.0351250665247473, + "grad_norm": 0.3775138854980469, + "learning_rate": 1.3060922758585542e-07, + "loss": 0.2035, + "step": 7648 + }, + { + "epoch": 2.035391165513571, + "grad_norm": 0.3680064380168915, + "learning_rate": 1.3059314929750483e-07, + "loss": 0.1753, + "step": 7649 + }, + { + "epoch": 2.0356572645023947, + "grad_norm": 0.24685536324977875, + "learning_rate": 1.3057707013657547e-07, + "loss": 0.1648, + "step": 7650 + }, + { + "epoch": 2.035923363491219, + "grad_norm": 0.2691609561443329, + "learning_rate": 1.3056099010352592e-07, + "loss": 0.1783, + "step": 7651 + }, + { + "epoch": 2.0361894624800425, + "grad_norm": 0.32328999042510986, + "learning_rate": 1.3054490919881485e-07, + "loss": 0.1914, + "step": 7652 + }, + { + "epoch": 2.036455561468866, + "grad_norm": 0.2665250599384308, + "learning_rate": 1.305288274229009e-07, + "loss": 0.1894, + "step": 7653 + }, + { + "epoch": 2.0367216604576903, + "grad_norm": 0.24976596236228943, + "learning_rate": 1.305127447762427e-07, + "loss": 0.1733, + "step": 7654 + }, + { + "epoch": 2.036987759446514, + "grad_norm": 0.33085399866104126, + "learning_rate": 1.3049666125929907e-07, + "loss": 0.1968, + "step": 7655 + }, + { + "epoch": 2.037253858435338, + "grad_norm": 0.2514086961746216, + "learning_rate": 1.3048057687252865e-07, + "loss": 0.1873, + "step": 7656 + }, + { + "epoch": 2.037519957424162, + "grad_norm": 0.2611011862754822, + "learning_rate": 1.3046449161639025e-07, + "loss": 0.1866, + "step": 7657 + }, + { + "epoch": 2.0377860564129855, + "grad_norm": 0.2689473032951355, + "learning_rate": 1.3044840549134265e-07, + "loss": 0.1916, + "step": 7658 + }, + { + "epoch": 2.0380521554018096, + "grad_norm": 0.28240475058555603, + "learning_rate": 1.3043231849784468e-07, + "loss": 0.1846, + "step": 7659 + }, + { + "epoch": 2.0383182543906333, + "grad_norm": 0.2873818874359131, + "learning_rate": 1.304162306363551e-07, + "loss": 0.1771, + "step": 7660 + }, + { + "epoch": 2.038584353379457, + "grad_norm": 0.31182587146759033, + "learning_rate": 1.3040014190733284e-07, + "loss": 0.1801, + "step": 7661 + }, + { + "epoch": 2.038850452368281, + "grad_norm": 0.33428168296813965, + "learning_rate": 1.3038405231123674e-07, + "loss": 0.1913, + "step": 7662 + }, + { + "epoch": 2.039116551357105, + "grad_norm": 0.3596677780151367, + "learning_rate": 1.3036796184852575e-07, + "loss": 0.1825, + "step": 7663 + }, + { + "epoch": 2.0393826503459285, + "grad_norm": 0.3229128420352936, + "learning_rate": 1.3035187051965874e-07, + "loss": 0.1946, + "step": 7664 + }, + { + "epoch": 2.0396487493347526, + "grad_norm": 0.272666335105896, + "learning_rate": 1.3033577832509474e-07, + "loss": 0.1908, + "step": 7665 + }, + { + "epoch": 2.0399148483235763, + "grad_norm": 0.29550158977508545, + "learning_rate": 1.303196852652927e-07, + "loss": 0.1896, + "step": 7666 + }, + { + "epoch": 2.0401809473124004, + "grad_norm": 0.35683056712150574, + "learning_rate": 1.303035913407116e-07, + "loss": 0.199, + "step": 7667 + }, + { + "epoch": 2.040447046301224, + "grad_norm": 0.2838864028453827, + "learning_rate": 1.3028749655181047e-07, + "loss": 0.1781, + "step": 7668 + }, + { + "epoch": 2.0407131452900478, + "grad_norm": 0.26021984219551086, + "learning_rate": 1.3027140089904846e-07, + "loss": 0.1757, + "step": 7669 + }, + { + "epoch": 2.040979244278872, + "grad_norm": 0.33467036485671997, + "learning_rate": 1.3025530438288453e-07, + "loss": 0.1904, + "step": 7670 + }, + { + "epoch": 2.0412453432676956, + "grad_norm": 0.3463757634162903, + "learning_rate": 1.3023920700377784e-07, + "loss": 0.1953, + "step": 7671 + }, + { + "epoch": 2.0415114422565193, + "grad_norm": 0.31396564841270447, + "learning_rate": 1.3022310876218752e-07, + "loss": 0.2014, + "step": 7672 + }, + { + "epoch": 2.0417775412453434, + "grad_norm": 0.36639514565467834, + "learning_rate": 1.3020700965857275e-07, + "loss": 0.2179, + "step": 7673 + }, + { + "epoch": 2.042043640234167, + "grad_norm": 0.3356325626373291, + "learning_rate": 1.3019090969339262e-07, + "loss": 0.1895, + "step": 7674 + }, + { + "epoch": 2.0423097392229907, + "grad_norm": 0.31659743189811707, + "learning_rate": 1.3017480886710647e-07, + "loss": 0.1844, + "step": 7675 + }, + { + "epoch": 2.042575838211815, + "grad_norm": 0.3317108154296875, + "learning_rate": 1.301587071801734e-07, + "loss": 0.1907, + "step": 7676 + }, + { + "epoch": 2.0428419372006386, + "grad_norm": 0.26143601536750793, + "learning_rate": 1.3014260463305275e-07, + "loss": 0.1705, + "step": 7677 + }, + { + "epoch": 2.0431080361894627, + "grad_norm": 0.3357401490211487, + "learning_rate": 1.3012650122620375e-07, + "loss": 0.1907, + "step": 7678 + }, + { + "epoch": 2.0433741351782864, + "grad_norm": 0.264139324426651, + "learning_rate": 1.301103969600857e-07, + "loss": 0.1764, + "step": 7679 + }, + { + "epoch": 2.04364023416711, + "grad_norm": 0.4461187422275543, + "learning_rate": 1.3009429183515797e-07, + "loss": 0.2248, + "step": 7680 + }, + { + "epoch": 2.043906333155934, + "grad_norm": 0.2887195944786072, + "learning_rate": 1.3007818585187987e-07, + "loss": 0.2024, + "step": 7681 + }, + { + "epoch": 2.044172432144758, + "grad_norm": 0.2872243821620941, + "learning_rate": 1.3006207901071074e-07, + "loss": 0.1987, + "step": 7682 + }, + { + "epoch": 2.0444385311335815, + "grad_norm": 0.28020942211151123, + "learning_rate": 1.300459713121101e-07, + "loss": 0.186, + "step": 7683 + }, + { + "epoch": 2.0447046301224057, + "grad_norm": 0.3090774714946747, + "learning_rate": 1.3002986275653728e-07, + "loss": 0.1659, + "step": 7684 + }, + { + "epoch": 2.0449707291112293, + "grad_norm": 0.25550249218940735, + "learning_rate": 1.3001375334445174e-07, + "loss": 0.1865, + "step": 7685 + }, + { + "epoch": 2.045236828100053, + "grad_norm": 0.2630590498447418, + "learning_rate": 1.2999764307631296e-07, + "loss": 0.1664, + "step": 7686 + }, + { + "epoch": 2.045502927088877, + "grad_norm": 0.33363378047943115, + "learning_rate": 1.2998153195258045e-07, + "loss": 0.2, + "step": 7687 + }, + { + "epoch": 2.045769026077701, + "grad_norm": 0.28391242027282715, + "learning_rate": 1.2996541997371374e-07, + "loss": 0.1896, + "step": 7688 + }, + { + "epoch": 2.046035125066525, + "grad_norm": 0.4210326373577118, + "learning_rate": 1.299493071401723e-07, + "loss": 0.2134, + "step": 7689 + }, + { + "epoch": 2.0463012240553486, + "grad_norm": 0.3073268234729767, + "learning_rate": 1.2993319345241585e-07, + "loss": 0.1863, + "step": 7690 + }, + { + "epoch": 2.0465673230441723, + "grad_norm": 0.39635026454925537, + "learning_rate": 1.2991707891090381e-07, + "loss": 0.2121, + "step": 7691 + }, + { + "epoch": 2.0468334220329965, + "grad_norm": 0.37437450885772705, + "learning_rate": 1.2990096351609595e-07, + "loss": 0.1987, + "step": 7692 + }, + { + "epoch": 2.04709952102182, + "grad_norm": 0.28821131587028503, + "learning_rate": 1.298848472684518e-07, + "loss": 0.1762, + "step": 7693 + }, + { + "epoch": 2.047365620010644, + "grad_norm": 0.3483983874320984, + "learning_rate": 1.298687301684311e-07, + "loss": 0.1863, + "step": 7694 + }, + { + "epoch": 2.047631718999468, + "grad_norm": 0.2837856113910675, + "learning_rate": 1.2985261221649354e-07, + "loss": 0.1607, + "step": 7695 + }, + { + "epoch": 2.0478978179882916, + "grad_norm": 0.32866019010543823, + "learning_rate": 1.298364934130988e-07, + "loss": 0.181, + "step": 7696 + }, + { + "epoch": 2.0481639169771153, + "grad_norm": 0.28678807616233826, + "learning_rate": 1.298203737587066e-07, + "loss": 0.1826, + "step": 7697 + }, + { + "epoch": 2.0484300159659394, + "grad_norm": 0.4221196174621582, + "learning_rate": 1.298042532537768e-07, + "loss": 0.2146, + "step": 7698 + }, + { + "epoch": 2.048696114954763, + "grad_norm": 0.2714022099971771, + "learning_rate": 1.297881318987691e-07, + "loss": 0.1798, + "step": 7699 + }, + { + "epoch": 2.048962213943587, + "grad_norm": 0.29642748832702637, + "learning_rate": 1.2977200969414335e-07, + "loss": 0.1963, + "step": 7700 + }, + { + "epoch": 2.049228312932411, + "grad_norm": 0.26541370153427124, + "learning_rate": 1.297558866403594e-07, + "loss": 0.179, + "step": 7701 + }, + { + "epoch": 2.0494944119212346, + "grad_norm": 0.3553166687488556, + "learning_rate": 1.297397627378771e-07, + "loss": 0.2005, + "step": 7702 + }, + { + "epoch": 2.0497605109100587, + "grad_norm": 0.3298395872116089, + "learning_rate": 1.297236379871563e-07, + "loss": 0.1873, + "step": 7703 + }, + { + "epoch": 2.0500266098988824, + "grad_norm": 0.4177525043487549, + "learning_rate": 1.2970751238865693e-07, + "loss": 0.2038, + "step": 7704 + }, + { + "epoch": 2.050292708887706, + "grad_norm": 0.33408892154693604, + "learning_rate": 1.2969138594283898e-07, + "loss": 0.1911, + "step": 7705 + }, + { + "epoch": 2.05055880787653, + "grad_norm": 0.31642112135887146, + "learning_rate": 1.2967525865016236e-07, + "loss": 0.1917, + "step": 7706 + }, + { + "epoch": 2.050824906865354, + "grad_norm": 0.3405494689941406, + "learning_rate": 1.2965913051108706e-07, + "loss": 0.1969, + "step": 7707 + }, + { + "epoch": 2.0510910058541776, + "grad_norm": 0.25838735699653625, + "learning_rate": 1.2964300152607307e-07, + "loss": 0.1711, + "step": 7708 + }, + { + "epoch": 2.0513571048430017, + "grad_norm": 0.2653888463973999, + "learning_rate": 1.2962687169558045e-07, + "loss": 0.1785, + "step": 7709 + }, + { + "epoch": 2.0516232038318254, + "grad_norm": 0.2746529281139374, + "learning_rate": 1.2961074102006922e-07, + "loss": 0.1884, + "step": 7710 + }, + { + "epoch": 2.051889302820649, + "grad_norm": 0.3233918845653534, + "learning_rate": 1.2959460949999953e-07, + "loss": 0.1933, + "step": 7711 + }, + { + "epoch": 2.052155401809473, + "grad_norm": 0.3842676281929016, + "learning_rate": 1.295784771358314e-07, + "loss": 0.1939, + "step": 7712 + }, + { + "epoch": 2.052421500798297, + "grad_norm": 0.4044651985168457, + "learning_rate": 1.2956234392802503e-07, + "loss": 0.2101, + "step": 7713 + }, + { + "epoch": 2.052687599787121, + "grad_norm": 0.2562454342842102, + "learning_rate": 1.2954620987704052e-07, + "loss": 0.1713, + "step": 7714 + }, + { + "epoch": 2.0529536987759447, + "grad_norm": 0.2682897448539734, + "learning_rate": 1.2953007498333804e-07, + "loss": 0.1723, + "step": 7715 + }, + { + "epoch": 2.0532197977647684, + "grad_norm": 0.2764056324958801, + "learning_rate": 1.2951393924737785e-07, + "loss": 0.1578, + "step": 7716 + }, + { + "epoch": 2.0534858967535925, + "grad_norm": 0.27872157096862793, + "learning_rate": 1.2949780266962013e-07, + "loss": 0.1784, + "step": 7717 + }, + { + "epoch": 2.053751995742416, + "grad_norm": 0.43816548585891724, + "learning_rate": 1.2948166525052513e-07, + "loss": 0.1741, + "step": 7718 + }, + { + "epoch": 2.05401809473124, + "grad_norm": 0.2671913206577301, + "learning_rate": 1.2946552699055313e-07, + "loss": 0.1816, + "step": 7719 + }, + { + "epoch": 2.054284193720064, + "grad_norm": 0.2556583881378174, + "learning_rate": 1.2944938789016443e-07, + "loss": 0.1698, + "step": 7720 + }, + { + "epoch": 2.0545502927088877, + "grad_norm": 0.2905570864677429, + "learning_rate": 1.2943324794981932e-07, + "loss": 0.2091, + "step": 7721 + }, + { + "epoch": 2.0548163916977114, + "grad_norm": 0.3577449321746826, + "learning_rate": 1.294171071699782e-07, + "loss": 0.1948, + "step": 7722 + }, + { + "epoch": 2.0550824906865355, + "grad_norm": 0.327816367149353, + "learning_rate": 1.2940096555110138e-07, + "loss": 0.1911, + "step": 7723 + }, + { + "epoch": 2.055348589675359, + "grad_norm": 0.27643337845802307, + "learning_rate": 1.293848230936493e-07, + "loss": 0.1938, + "step": 7724 + }, + { + "epoch": 2.0556146886641833, + "grad_norm": 0.27275213599205017, + "learning_rate": 1.2936867979808233e-07, + "loss": 0.1908, + "step": 7725 + }, + { + "epoch": 2.055880787653007, + "grad_norm": 0.323233038187027, + "learning_rate": 1.2935253566486092e-07, + "loss": 0.1876, + "step": 7726 + }, + { + "epoch": 2.0561468866418307, + "grad_norm": 0.27957579493522644, + "learning_rate": 1.2933639069444557e-07, + "loss": 0.1893, + "step": 7727 + }, + { + "epoch": 2.0564129856306548, + "grad_norm": 0.36431756615638733, + "learning_rate": 1.2932024488729675e-07, + "loss": 0.1818, + "step": 7728 + }, + { + "epoch": 2.0566790846194785, + "grad_norm": 0.4307677745819092, + "learning_rate": 1.2930409824387494e-07, + "loss": 0.2173, + "step": 7729 + }, + { + "epoch": 2.056945183608302, + "grad_norm": 0.3422883450984955, + "learning_rate": 1.292879507646407e-07, + "loss": 0.1826, + "step": 7730 + }, + { + "epoch": 2.0572112825971263, + "grad_norm": 1.1748523712158203, + "learning_rate": 1.292718024500546e-07, + "loss": 0.1907, + "step": 7731 + }, + { + "epoch": 2.05747738158595, + "grad_norm": 0.2521352171897888, + "learning_rate": 1.2925565330057723e-07, + "loss": 0.1882, + "step": 7732 + }, + { + "epoch": 2.0577434805747736, + "grad_norm": 0.36786380410194397, + "learning_rate": 1.2923950331666913e-07, + "loss": 0.2064, + "step": 7733 + }, + { + "epoch": 2.0580095795635978, + "grad_norm": 0.34961557388305664, + "learning_rate": 1.2922335249879102e-07, + "loss": 0.2052, + "step": 7734 + }, + { + "epoch": 2.0582756785524214, + "grad_norm": 0.3571675419807434, + "learning_rate": 1.292072008474035e-07, + "loss": 0.2009, + "step": 7735 + }, + { + "epoch": 2.0585417775412456, + "grad_norm": 0.35263487696647644, + "learning_rate": 1.2919104836296725e-07, + "loss": 0.195, + "step": 7736 + }, + { + "epoch": 2.0588078765300692, + "grad_norm": 0.3283372223377228, + "learning_rate": 1.29174895045943e-07, + "loss": 0.182, + "step": 7737 + }, + { + "epoch": 2.059073975518893, + "grad_norm": 0.26348358392715454, + "learning_rate": 1.2915874089679143e-07, + "loss": 0.1976, + "step": 7738 + }, + { + "epoch": 2.059340074507717, + "grad_norm": 0.2828284204006195, + "learning_rate": 1.2914258591597337e-07, + "loss": 0.1912, + "step": 7739 + }, + { + "epoch": 2.0596061734965407, + "grad_norm": 0.26955267786979675, + "learning_rate": 1.2912643010394946e-07, + "loss": 0.1892, + "step": 7740 + }, + { + "epoch": 2.0598722724853644, + "grad_norm": 0.3428240120410919, + "learning_rate": 1.2911027346118065e-07, + "loss": 0.1961, + "step": 7741 + }, + { + "epoch": 2.0601383714741885, + "grad_norm": 0.25863343477249146, + "learning_rate": 1.2909411598812764e-07, + "loss": 0.1696, + "step": 7742 + }, + { + "epoch": 2.0604044704630122, + "grad_norm": 0.3197110891342163, + "learning_rate": 1.2907795768525134e-07, + "loss": 0.1883, + "step": 7743 + }, + { + "epoch": 2.060670569451836, + "grad_norm": 0.3403381407260895, + "learning_rate": 1.2906179855301257e-07, + "loss": 0.1831, + "step": 7744 + }, + { + "epoch": 2.06093666844066, + "grad_norm": 0.4162381887435913, + "learning_rate": 1.2904563859187225e-07, + "loss": 0.1807, + "step": 7745 + }, + { + "epoch": 2.0612027674294837, + "grad_norm": 0.3363092839717865, + "learning_rate": 1.2902947780229134e-07, + "loss": 0.1973, + "step": 7746 + }, + { + "epoch": 2.0614688664183074, + "grad_norm": 0.3129630982875824, + "learning_rate": 1.290133161847307e-07, + "loss": 0.1918, + "step": 7747 + }, + { + "epoch": 2.0617349654071315, + "grad_norm": 0.26093584299087524, + "learning_rate": 1.2899715373965132e-07, + "loss": 0.1658, + "step": 7748 + }, + { + "epoch": 2.062001064395955, + "grad_norm": 0.3346070349216461, + "learning_rate": 1.289809904675142e-07, + "loss": 0.1901, + "step": 7749 + }, + { + "epoch": 2.0622671633847793, + "grad_norm": 0.3340350091457367, + "learning_rate": 1.2896482636878038e-07, + "loss": 0.1933, + "step": 7750 + }, + { + "epoch": 2.062533262373603, + "grad_norm": 0.27920225262641907, + "learning_rate": 1.2894866144391076e-07, + "loss": 0.1923, + "step": 7751 + }, + { + "epoch": 2.0627993613624267, + "grad_norm": 0.2815043330192566, + "learning_rate": 1.2893249569336656e-07, + "loss": 0.2005, + "step": 7752 + }, + { + "epoch": 2.063065460351251, + "grad_norm": 0.3431757688522339, + "learning_rate": 1.289163291176088e-07, + "loss": 0.2016, + "step": 7753 + }, + { + "epoch": 2.0633315593400745, + "grad_norm": 0.3169099986553192, + "learning_rate": 1.2890016171709851e-07, + "loss": 0.1842, + "step": 7754 + }, + { + "epoch": 2.063597658328898, + "grad_norm": 0.28027206659317017, + "learning_rate": 1.2888399349229693e-07, + "loss": 0.184, + "step": 7755 + }, + { + "epoch": 2.0638637573177223, + "grad_norm": 0.34445491433143616, + "learning_rate": 1.2886782444366515e-07, + "loss": 0.1879, + "step": 7756 + }, + { + "epoch": 2.064129856306546, + "grad_norm": 0.2659629285335541, + "learning_rate": 1.2885165457166435e-07, + "loss": 0.1652, + "step": 7757 + }, + { + "epoch": 2.0643959552953697, + "grad_norm": 0.44302845001220703, + "learning_rate": 1.288354838767557e-07, + "loss": 0.2068, + "step": 7758 + }, + { + "epoch": 2.064662054284194, + "grad_norm": 0.4159306287765503, + "learning_rate": 1.288193123594005e-07, + "loss": 0.1984, + "step": 7759 + }, + { + "epoch": 2.0649281532730175, + "grad_norm": 0.28327104449272156, + "learning_rate": 1.2880314002005993e-07, + "loss": 0.1827, + "step": 7760 + }, + { + "epoch": 2.0651942522618416, + "grad_norm": 0.25904375314712524, + "learning_rate": 1.287869668591953e-07, + "loss": 0.1778, + "step": 7761 + }, + { + "epoch": 2.0654603512506653, + "grad_norm": 0.2474534958600998, + "learning_rate": 1.2877079287726784e-07, + "loss": 0.1772, + "step": 7762 + }, + { + "epoch": 2.065726450239489, + "grad_norm": 0.27857744693756104, + "learning_rate": 1.2875461807473894e-07, + "loss": 0.19, + "step": 7763 + }, + { + "epoch": 2.065992549228313, + "grad_norm": 0.26919302344322205, + "learning_rate": 1.2873844245206983e-07, + "loss": 0.1611, + "step": 7764 + }, + { + "epoch": 2.066258648217137, + "grad_norm": 0.3114972412586212, + "learning_rate": 1.2872226600972202e-07, + "loss": 0.1691, + "step": 7765 + }, + { + "epoch": 2.0665247472059605, + "grad_norm": 0.2802828252315521, + "learning_rate": 1.2870608874815677e-07, + "loss": 0.1814, + "step": 7766 + }, + { + "epoch": 2.0667908461947846, + "grad_norm": 0.3406689167022705, + "learning_rate": 1.2868991066783557e-07, + "loss": 0.1829, + "step": 7767 + }, + { + "epoch": 2.0670569451836083, + "grad_norm": 0.36560457944869995, + "learning_rate": 1.286737317692198e-07, + "loss": 0.1923, + "step": 7768 + }, + { + "epoch": 2.067323044172432, + "grad_norm": 0.32216399908065796, + "learning_rate": 1.2865755205277092e-07, + "loss": 0.1808, + "step": 7769 + }, + { + "epoch": 2.067589143161256, + "grad_norm": 0.3004697561264038, + "learning_rate": 1.2864137151895042e-07, + "loss": 0.1881, + "step": 7770 + }, + { + "epoch": 2.0678552421500798, + "grad_norm": 0.28166911005973816, + "learning_rate": 1.2862519016821982e-07, + "loss": 0.1861, + "step": 7771 + }, + { + "epoch": 2.0681213411389034, + "grad_norm": 0.3420814573764801, + "learning_rate": 1.2860900800104065e-07, + "loss": 0.1903, + "step": 7772 + }, + { + "epoch": 2.0683874401277276, + "grad_norm": 0.26561254262924194, + "learning_rate": 1.2859282501787437e-07, + "loss": 0.1867, + "step": 7773 + }, + { + "epoch": 2.0686535391165513, + "grad_norm": 0.2575954794883728, + "learning_rate": 1.285766412191827e-07, + "loss": 0.185, + "step": 7774 + }, + { + "epoch": 2.0689196381053754, + "grad_norm": 0.28034698963165283, + "learning_rate": 1.285604566054271e-07, + "loss": 0.1916, + "step": 7775 + }, + { + "epoch": 2.069185737094199, + "grad_norm": 0.2692583501338959, + "learning_rate": 1.2854427117706924e-07, + "loss": 0.168, + "step": 7776 + }, + { + "epoch": 2.0694518360830227, + "grad_norm": 0.2892026901245117, + "learning_rate": 1.2852808493457074e-07, + "loss": 0.1833, + "step": 7777 + }, + { + "epoch": 2.069717935071847, + "grad_norm": 0.268342524766922, + "learning_rate": 1.2851189787839334e-07, + "loss": 0.183, + "step": 7778 + }, + { + "epoch": 2.0699840340606706, + "grad_norm": 0.31516262888908386, + "learning_rate": 1.2849571000899863e-07, + "loss": 0.1817, + "step": 7779 + }, + { + "epoch": 2.0702501330494942, + "grad_norm": 0.26487109065055847, + "learning_rate": 1.284795213268484e-07, + "loss": 0.1752, + "step": 7780 + }, + { + "epoch": 2.0705162320383184, + "grad_norm": 0.26843369007110596, + "learning_rate": 1.2846333183240433e-07, + "loss": 0.1838, + "step": 7781 + }, + { + "epoch": 2.070782331027142, + "grad_norm": 0.33234038949012756, + "learning_rate": 1.2844714152612816e-07, + "loss": 0.1865, + "step": 7782 + }, + { + "epoch": 2.0710484300159657, + "grad_norm": 0.30337777733802795, + "learning_rate": 1.2843095040848174e-07, + "loss": 0.1938, + "step": 7783 + }, + { + "epoch": 2.07131452900479, + "grad_norm": 0.417820006608963, + "learning_rate": 1.2841475847992685e-07, + "loss": 0.1837, + "step": 7784 + }, + { + "epoch": 2.0715806279936135, + "grad_norm": 0.2868247628211975, + "learning_rate": 1.283985657409253e-07, + "loss": 0.1836, + "step": 7785 + }, + { + "epoch": 2.0718467269824377, + "grad_norm": 0.3033157289028168, + "learning_rate": 1.2838237219193895e-07, + "loss": 0.1957, + "step": 7786 + }, + { + "epoch": 2.0721128259712613, + "grad_norm": 0.38959088921546936, + "learning_rate": 1.2836617783342966e-07, + "loss": 0.1977, + "step": 7787 + }, + { + "epoch": 2.072378924960085, + "grad_norm": 0.2535431385040283, + "learning_rate": 1.2834998266585935e-07, + "loss": 0.1724, + "step": 7788 + }, + { + "epoch": 2.072645023948909, + "grad_norm": 0.3154250383377075, + "learning_rate": 1.2833378668968991e-07, + "loss": 0.1951, + "step": 7789 + }, + { + "epoch": 2.072911122937733, + "grad_norm": 0.37521323561668396, + "learning_rate": 1.2831758990538332e-07, + "loss": 0.186, + "step": 7790 + }, + { + "epoch": 2.0731772219265565, + "grad_norm": 0.2756919264793396, + "learning_rate": 1.2830139231340147e-07, + "loss": 0.2007, + "step": 7791 + }, + { + "epoch": 2.0734433209153806, + "grad_norm": 0.28331509232521057, + "learning_rate": 1.2828519391420644e-07, + "loss": 0.1914, + "step": 7792 + }, + { + "epoch": 2.0737094199042043, + "grad_norm": 0.2807724177837372, + "learning_rate": 1.282689947082602e-07, + "loss": 0.19, + "step": 7793 + }, + { + "epoch": 2.073975518893028, + "grad_norm": 0.3617778718471527, + "learning_rate": 1.2825279469602476e-07, + "loss": 0.1848, + "step": 7794 + }, + { + "epoch": 2.074241617881852, + "grad_norm": 0.2682477831840515, + "learning_rate": 1.2823659387796224e-07, + "loss": 0.1904, + "step": 7795 + }, + { + "epoch": 2.074507716870676, + "grad_norm": 0.293304979801178, + "learning_rate": 1.2822039225453468e-07, + "loss": 0.1962, + "step": 7796 + }, + { + "epoch": 2.0747738158595, + "grad_norm": 0.28625932335853577, + "learning_rate": 1.2820418982620417e-07, + "loss": 0.1935, + "step": 7797 + }, + { + "epoch": 2.0750399148483236, + "grad_norm": 0.29141807556152344, + "learning_rate": 1.2818798659343283e-07, + "loss": 0.2083, + "step": 7798 + }, + { + "epoch": 2.0753060138371473, + "grad_norm": 0.2757241427898407, + "learning_rate": 1.2817178255668286e-07, + "loss": 0.2084, + "step": 7799 + }, + { + "epoch": 2.0755721128259714, + "grad_norm": 0.2796119451522827, + "learning_rate": 1.2815557771641642e-07, + "loss": 0.188, + "step": 7800 + }, + { + "epoch": 2.075838211814795, + "grad_norm": 0.2702614367008209, + "learning_rate": 1.2813937207309567e-07, + "loss": 0.1717, + "step": 7801 + }, + { + "epoch": 2.076104310803619, + "grad_norm": 0.26837876439094543, + "learning_rate": 1.2812316562718282e-07, + "loss": 0.1926, + "step": 7802 + }, + { + "epoch": 2.076370409792443, + "grad_norm": 0.26376259326934814, + "learning_rate": 1.2810695837914016e-07, + "loss": 0.168, + "step": 7803 + }, + { + "epoch": 2.0766365087812666, + "grad_norm": 0.3731048107147217, + "learning_rate": 1.2809075032942996e-07, + "loss": 0.2105, + "step": 7804 + }, + { + "epoch": 2.0769026077700903, + "grad_norm": 0.37223026156425476, + "learning_rate": 1.2807454147851444e-07, + "loss": 0.2049, + "step": 7805 + }, + { + "epoch": 2.0771687067589144, + "grad_norm": 0.31675514578819275, + "learning_rate": 1.2805833182685594e-07, + "loss": 0.1848, + "step": 7806 + }, + { + "epoch": 2.077434805747738, + "grad_norm": 0.3446292281150818, + "learning_rate": 1.280421213749168e-07, + "loss": 0.1942, + "step": 7807 + }, + { + "epoch": 2.077700904736562, + "grad_norm": 0.2718641459941864, + "learning_rate": 1.2802591012315941e-07, + "loss": 0.1928, + "step": 7808 + }, + { + "epoch": 2.077967003725386, + "grad_norm": 0.27684688568115234, + "learning_rate": 1.280096980720461e-07, + "loss": 0.1975, + "step": 7809 + }, + { + "epoch": 2.0782331027142096, + "grad_norm": 0.2505100667476654, + "learning_rate": 1.2799348522203927e-07, + "loss": 0.1799, + "step": 7810 + }, + { + "epoch": 2.0784992017030337, + "grad_norm": 0.3829229474067688, + "learning_rate": 1.2797727157360135e-07, + "loss": 0.2013, + "step": 7811 + }, + { + "epoch": 2.0787653006918574, + "grad_norm": 0.5003920793533325, + "learning_rate": 1.2796105712719482e-07, + "loss": 0.1817, + "step": 7812 + }, + { + "epoch": 2.079031399680681, + "grad_norm": 0.2870906591415405, + "learning_rate": 1.279448418832821e-07, + "loss": 0.1978, + "step": 7813 + }, + { + "epoch": 2.079297498669505, + "grad_norm": 0.27470728754997253, + "learning_rate": 1.2792862584232572e-07, + "loss": 0.1902, + "step": 7814 + }, + { + "epoch": 2.079563597658329, + "grad_norm": 0.34840697050094604, + "learning_rate": 1.2791240900478816e-07, + "loss": 0.1751, + "step": 7815 + }, + { + "epoch": 2.0798296966471526, + "grad_norm": 0.30162543058395386, + "learning_rate": 1.2789619137113196e-07, + "loss": 0.1852, + "step": 7816 + }, + { + "epoch": 2.0800957956359767, + "grad_norm": 0.2770906388759613, + "learning_rate": 1.278799729418197e-07, + "loss": 0.1857, + "step": 7817 + }, + { + "epoch": 2.0803618946248004, + "grad_norm": 0.2840717136859894, + "learning_rate": 1.27863753717314e-07, + "loss": 0.2039, + "step": 7818 + }, + { + "epoch": 2.080627993613624, + "grad_norm": 0.35752296447753906, + "learning_rate": 1.2784753369807738e-07, + "loss": 0.1799, + "step": 7819 + }, + { + "epoch": 2.080894092602448, + "grad_norm": 0.2608625292778015, + "learning_rate": 1.278313128845725e-07, + "loss": 0.1771, + "step": 7820 + }, + { + "epoch": 2.081160191591272, + "grad_norm": 0.27502357959747314, + "learning_rate": 1.2781509127726207e-07, + "loss": 0.1789, + "step": 7821 + }, + { + "epoch": 2.081426290580096, + "grad_norm": 0.2734745740890503, + "learning_rate": 1.2779886887660868e-07, + "loss": 0.1883, + "step": 7822 + }, + { + "epoch": 2.0816923895689197, + "grad_norm": 0.30638226866722107, + "learning_rate": 1.2778264568307507e-07, + "loss": 0.1916, + "step": 7823 + }, + { + "epoch": 2.0819584885577433, + "grad_norm": 0.3339099586009979, + "learning_rate": 1.2776642169712392e-07, + "loss": 0.1958, + "step": 7824 + }, + { + "epoch": 2.0822245875465675, + "grad_norm": 0.30643561482429504, + "learning_rate": 1.2775019691921803e-07, + "loss": 0.1876, + "step": 7825 + }, + { + "epoch": 2.082490686535391, + "grad_norm": 0.39149922132492065, + "learning_rate": 1.2773397134982013e-07, + "loss": 0.2022, + "step": 7826 + }, + { + "epoch": 2.082756785524215, + "grad_norm": 0.2671419084072113, + "learning_rate": 1.27717744989393e-07, + "loss": 0.1704, + "step": 7827 + }, + { + "epoch": 2.083022884513039, + "grad_norm": 0.3901032507419586, + "learning_rate": 1.2770151783839947e-07, + "loss": 0.2149, + "step": 7828 + }, + { + "epoch": 2.0832889835018626, + "grad_norm": 0.26182952523231506, + "learning_rate": 1.2768528989730235e-07, + "loss": 0.175, + "step": 7829 + }, + { + "epoch": 2.0835550824906863, + "grad_norm": 0.6534215807914734, + "learning_rate": 1.2766906116656454e-07, + "loss": 0.1726, + "step": 7830 + }, + { + "epoch": 2.0838211814795105, + "grad_norm": 0.4385562539100647, + "learning_rate": 1.276528316466488e-07, + "loss": 0.2097, + "step": 7831 + }, + { + "epoch": 2.084087280468334, + "grad_norm": 0.2767641246318817, + "learning_rate": 1.2763660133801816e-07, + "loss": 0.187, + "step": 7832 + }, + { + "epoch": 2.0843533794571583, + "grad_norm": 0.2898675203323364, + "learning_rate": 1.276203702411355e-07, + "loss": 0.1903, + "step": 7833 + }, + { + "epoch": 2.084619478445982, + "grad_norm": 0.41623303294181824, + "learning_rate": 1.276041383564637e-07, + "loss": 0.1995, + "step": 7834 + }, + { + "epoch": 2.0848855774348056, + "grad_norm": 0.34941235184669495, + "learning_rate": 1.2758790568446585e-07, + "loss": 0.2029, + "step": 7835 + }, + { + "epoch": 2.0851516764236298, + "grad_norm": 0.5867877006530762, + "learning_rate": 1.2757167222560484e-07, + "loss": 0.1995, + "step": 7836 + }, + { + "epoch": 2.0854177754124534, + "grad_norm": 0.27020740509033203, + "learning_rate": 1.2755543798034368e-07, + "loss": 0.1871, + "step": 7837 + }, + { + "epoch": 2.085683874401277, + "grad_norm": 0.30559054017066956, + "learning_rate": 1.2753920294914545e-07, + "loss": 0.1928, + "step": 7838 + }, + { + "epoch": 2.0859499733901012, + "grad_norm": 0.2644394338130951, + "learning_rate": 1.2752296713247318e-07, + "loss": 0.1768, + "step": 7839 + }, + { + "epoch": 2.086216072378925, + "grad_norm": 0.29362156987190247, + "learning_rate": 1.2750673053079e-07, + "loss": 0.1765, + "step": 7840 + }, + { + "epoch": 2.0864821713677486, + "grad_norm": 0.24881891906261444, + "learning_rate": 1.2749049314455896e-07, + "loss": 0.1743, + "step": 7841 + }, + { + "epoch": 2.0867482703565727, + "grad_norm": 0.2514629065990448, + "learning_rate": 1.2747425497424315e-07, + "loss": 0.1718, + "step": 7842 + }, + { + "epoch": 2.0870143693453964, + "grad_norm": 0.2818650007247925, + "learning_rate": 1.274580160203058e-07, + "loss": 0.1964, + "step": 7843 + }, + { + "epoch": 2.0872804683342205, + "grad_norm": 0.24894988536834717, + "learning_rate": 1.2744177628321e-07, + "loss": 0.1637, + "step": 7844 + }, + { + "epoch": 2.0875465673230442, + "grad_norm": 0.2818915545940399, + "learning_rate": 1.27425535763419e-07, + "loss": 0.1762, + "step": 7845 + }, + { + "epoch": 2.087812666311868, + "grad_norm": 0.2566450238227844, + "learning_rate": 1.2740929446139598e-07, + "loss": 0.1755, + "step": 7846 + }, + { + "epoch": 2.088078765300692, + "grad_norm": 0.39655712246894836, + "learning_rate": 1.273930523776042e-07, + "loss": 0.2082, + "step": 7847 + }, + { + "epoch": 2.0883448642895157, + "grad_norm": 0.2703687250614166, + "learning_rate": 1.273768095125069e-07, + "loss": 0.1704, + "step": 7848 + }, + { + "epoch": 2.0886109632783394, + "grad_norm": 0.2826833724975586, + "learning_rate": 1.2736056586656734e-07, + "loss": 0.1905, + "step": 7849 + }, + { + "epoch": 2.0888770622671635, + "grad_norm": 0.35302746295928955, + "learning_rate": 1.2734432144024885e-07, + "loss": 0.1825, + "step": 7850 + }, + { + "epoch": 2.089143161255987, + "grad_norm": 0.26130422949790955, + "learning_rate": 1.2732807623401477e-07, + "loss": 0.1618, + "step": 7851 + }, + { + "epoch": 2.089409260244811, + "grad_norm": 0.26906609535217285, + "learning_rate": 1.2731183024832837e-07, + "loss": 0.1896, + "step": 7852 + }, + { + "epoch": 2.089675359233635, + "grad_norm": 0.28036773204803467, + "learning_rate": 1.272955834836531e-07, + "loss": 0.1776, + "step": 7853 + }, + { + "epoch": 2.0899414582224587, + "grad_norm": 0.25240227580070496, + "learning_rate": 1.2727933594045235e-07, + "loss": 0.1608, + "step": 7854 + }, + { + "epoch": 2.090207557211283, + "grad_norm": 0.27270272374153137, + "learning_rate": 1.2726308761918944e-07, + "loss": 0.1728, + "step": 7855 + }, + { + "epoch": 2.0904736562001065, + "grad_norm": 0.28143981099128723, + "learning_rate": 1.2724683852032791e-07, + "loss": 0.1736, + "step": 7856 + }, + { + "epoch": 2.09073975518893, + "grad_norm": 0.5291578769683838, + "learning_rate": 1.2723058864433118e-07, + "loss": 0.1848, + "step": 7857 + }, + { + "epoch": 2.0910058541777543, + "grad_norm": 0.294485867023468, + "learning_rate": 1.2721433799166273e-07, + "loss": 0.1757, + "step": 7858 + }, + { + "epoch": 2.091271953166578, + "grad_norm": 0.2959039509296417, + "learning_rate": 1.2719808656278605e-07, + "loss": 0.1781, + "step": 7859 + }, + { + "epoch": 2.0915380521554017, + "grad_norm": 0.2714017331600189, + "learning_rate": 1.2718183435816464e-07, + "loss": 0.1886, + "step": 7860 + }, + { + "epoch": 2.091804151144226, + "grad_norm": 0.2597012519836426, + "learning_rate": 1.2716558137826213e-07, + "loss": 0.1958, + "step": 7861 + }, + { + "epoch": 2.0920702501330495, + "grad_norm": 0.34166982769966125, + "learning_rate": 1.2714932762354202e-07, + "loss": 0.1748, + "step": 7862 + }, + { + "epoch": 2.092336349121873, + "grad_norm": 0.3052658140659332, + "learning_rate": 1.2713307309446792e-07, + "loss": 0.1829, + "step": 7863 + }, + { + "epoch": 2.0926024481106973, + "grad_norm": 0.2807537913322449, + "learning_rate": 1.271168177915034e-07, + "loss": 0.2011, + "step": 7864 + }, + { + "epoch": 2.092868547099521, + "grad_norm": 0.27765795588493347, + "learning_rate": 1.271005617151122e-07, + "loss": 0.179, + "step": 7865 + }, + { + "epoch": 2.0931346460883447, + "grad_norm": 0.28873661160469055, + "learning_rate": 1.2708430486575785e-07, + "loss": 0.1853, + "step": 7866 + }, + { + "epoch": 2.093400745077169, + "grad_norm": 0.2906394600868225, + "learning_rate": 1.2706804724390409e-07, + "loss": 0.1847, + "step": 7867 + }, + { + "epoch": 2.0936668440659925, + "grad_norm": 0.38107573986053467, + "learning_rate": 1.2705178885001465e-07, + "loss": 0.1944, + "step": 7868 + }, + { + "epoch": 2.0939329430548166, + "grad_norm": 0.2647227644920349, + "learning_rate": 1.2703552968455322e-07, + "loss": 0.1725, + "step": 7869 + }, + { + "epoch": 2.0941990420436403, + "grad_norm": 0.29761138558387756, + "learning_rate": 1.2701926974798354e-07, + "loss": 0.1883, + "step": 7870 + }, + { + "epoch": 2.094465141032464, + "grad_norm": 0.2831283211708069, + "learning_rate": 1.2700300904076936e-07, + "loss": 0.189, + "step": 7871 + }, + { + "epoch": 2.094731240021288, + "grad_norm": 0.30076754093170166, + "learning_rate": 1.2698674756337448e-07, + "loss": 0.1805, + "step": 7872 + }, + { + "epoch": 2.0949973390101118, + "grad_norm": 0.25998175144195557, + "learning_rate": 1.2697048531626274e-07, + "loss": 0.1834, + "step": 7873 + }, + { + "epoch": 2.0952634379989354, + "grad_norm": 0.2687026560306549, + "learning_rate": 1.2695422229989797e-07, + "loss": 0.1685, + "step": 7874 + }, + { + "epoch": 2.0955295369877596, + "grad_norm": 0.2688996493816376, + "learning_rate": 1.2693795851474396e-07, + "loss": 0.178, + "step": 7875 + }, + { + "epoch": 2.0957956359765832, + "grad_norm": 0.34607216715812683, + "learning_rate": 1.2692169396126469e-07, + "loss": 0.2093, + "step": 7876 + }, + { + "epoch": 2.096061734965407, + "grad_norm": 0.2503330707550049, + "learning_rate": 1.2690542863992397e-07, + "loss": 0.1678, + "step": 7877 + }, + { + "epoch": 2.096327833954231, + "grad_norm": 0.26752716302871704, + "learning_rate": 1.2688916255118572e-07, + "loss": 0.1814, + "step": 7878 + }, + { + "epoch": 2.0965939329430547, + "grad_norm": 0.26499560475349426, + "learning_rate": 1.26872895695514e-07, + "loss": 0.1752, + "step": 7879 + }, + { + "epoch": 2.096860031931879, + "grad_norm": 0.3250536620616913, + "learning_rate": 1.268566280733726e-07, + "loss": 0.1961, + "step": 7880 + }, + { + "epoch": 2.0971261309207025, + "grad_norm": 0.28031617403030396, + "learning_rate": 1.2684035968522564e-07, + "loss": 0.1895, + "step": 7881 + }, + { + "epoch": 2.0973922299095262, + "grad_norm": 0.26980432868003845, + "learning_rate": 1.2682409053153703e-07, + "loss": 0.1833, + "step": 7882 + }, + { + "epoch": 2.0976583288983504, + "grad_norm": 0.30048155784606934, + "learning_rate": 1.268078206127709e-07, + "loss": 0.1863, + "step": 7883 + }, + { + "epoch": 2.097924427887174, + "grad_norm": 0.2722359001636505, + "learning_rate": 1.2679154992939125e-07, + "loss": 0.1864, + "step": 7884 + }, + { + "epoch": 2.0981905268759977, + "grad_norm": 0.325496107339859, + "learning_rate": 1.267752784818621e-07, + "loss": 0.1898, + "step": 7885 + }, + { + "epoch": 2.098456625864822, + "grad_norm": 0.3257209062576294, + "learning_rate": 1.2675900627064767e-07, + "loss": 0.1892, + "step": 7886 + }, + { + "epoch": 2.0987227248536455, + "grad_norm": 0.2645958960056305, + "learning_rate": 1.2674273329621197e-07, + "loss": 0.164, + "step": 7887 + }, + { + "epoch": 2.098988823842469, + "grad_norm": 0.2697816491127014, + "learning_rate": 1.2672645955901918e-07, + "loss": 0.1864, + "step": 7888 + }, + { + "epoch": 2.0992549228312933, + "grad_norm": 0.2578624486923218, + "learning_rate": 1.2671018505953343e-07, + "loss": 0.1697, + "step": 7889 + }, + { + "epoch": 2.099521021820117, + "grad_norm": 0.24674078822135925, + "learning_rate": 1.2669390979821894e-07, + "loss": 0.1882, + "step": 7890 + }, + { + "epoch": 2.0997871208089407, + "grad_norm": 0.27852803468704224, + "learning_rate": 1.266776337755399e-07, + "loss": 0.1837, + "step": 7891 + }, + { + "epoch": 2.100053219797765, + "grad_norm": 0.44367605447769165, + "learning_rate": 1.2666135699196052e-07, + "loss": 0.1794, + "step": 7892 + }, + { + "epoch": 2.1003193187865885, + "grad_norm": 0.2609894573688507, + "learning_rate": 1.2664507944794504e-07, + "loss": 0.1721, + "step": 7893 + }, + { + "epoch": 2.1005854177754126, + "grad_norm": 0.2579825818538666, + "learning_rate": 1.266288011439578e-07, + "loss": 0.1722, + "step": 7894 + }, + { + "epoch": 2.1008515167642363, + "grad_norm": 0.2607465088367462, + "learning_rate": 1.2661252208046303e-07, + "loss": 0.1779, + "step": 7895 + }, + { + "epoch": 2.10111761575306, + "grad_norm": 0.23758919537067413, + "learning_rate": 1.26596242257925e-07, + "loss": 0.1583, + "step": 7896 + }, + { + "epoch": 2.101383714741884, + "grad_norm": 0.37356990575790405, + "learning_rate": 1.2657996167680815e-07, + "loss": 0.1942, + "step": 7897 + }, + { + "epoch": 2.101649813730708, + "grad_norm": 0.2456342875957489, + "learning_rate": 1.2656368033757677e-07, + "loss": 0.1776, + "step": 7898 + }, + { + "epoch": 2.1019159127195315, + "grad_norm": 0.33998626470565796, + "learning_rate": 1.2654739824069525e-07, + "loss": 0.191, + "step": 7899 + }, + { + "epoch": 2.1021820117083556, + "grad_norm": 0.35951152443885803, + "learning_rate": 1.2653111538662795e-07, + "loss": 0.1924, + "step": 7900 + }, + { + "epoch": 2.1024481106971793, + "grad_norm": 0.3704642653465271, + "learning_rate": 1.2651483177583937e-07, + "loss": 0.2012, + "step": 7901 + }, + { + "epoch": 2.1027142096860034, + "grad_norm": 0.46317315101623535, + "learning_rate": 1.264985474087939e-07, + "loss": 0.1834, + "step": 7902 + }, + { + "epoch": 2.102980308674827, + "grad_norm": 0.29595160484313965, + "learning_rate": 1.2648226228595595e-07, + "loss": 0.1782, + "step": 7903 + }, + { + "epoch": 2.103246407663651, + "grad_norm": 0.2864260673522949, + "learning_rate": 1.2646597640779012e-07, + "loss": 0.1843, + "step": 7904 + }, + { + "epoch": 2.103512506652475, + "grad_norm": 0.2811625599861145, + "learning_rate": 1.2644968977476087e-07, + "loss": 0.1804, + "step": 7905 + }, + { + "epoch": 2.1037786056412986, + "grad_norm": 0.3968392312526703, + "learning_rate": 1.2643340238733272e-07, + "loss": 0.1879, + "step": 7906 + }, + { + "epoch": 2.1040447046301223, + "grad_norm": 0.27382346987724304, + "learning_rate": 1.264171142459702e-07, + "loss": 0.1573, + "step": 7907 + }, + { + "epoch": 2.1043108036189464, + "grad_norm": 0.4036925733089447, + "learning_rate": 1.264008253511379e-07, + "loss": 0.195, + "step": 7908 + }, + { + "epoch": 2.10457690260777, + "grad_norm": 0.2880987524986267, + "learning_rate": 1.2638453570330041e-07, + "loss": 0.1919, + "step": 7909 + }, + { + "epoch": 2.1048430015965938, + "grad_norm": 0.26888740062713623, + "learning_rate": 1.2636824530292233e-07, + "loss": 0.1829, + "step": 7910 + }, + { + "epoch": 2.105109100585418, + "grad_norm": 0.3754008114337921, + "learning_rate": 1.2635195415046832e-07, + "loss": 0.1705, + "step": 7911 + }, + { + "epoch": 2.1053751995742416, + "grad_norm": 0.31669411063194275, + "learning_rate": 1.2633566224640304e-07, + "loss": 0.1747, + "step": 7912 + }, + { + "epoch": 2.1056412985630653, + "grad_norm": 0.2593175768852234, + "learning_rate": 1.2631936959119117e-07, + "loss": 0.1706, + "step": 7913 + }, + { + "epoch": 2.1059073975518894, + "grad_norm": 0.2688130736351013, + "learning_rate": 1.2630307618529738e-07, + "loss": 0.1938, + "step": 7914 + }, + { + "epoch": 2.106173496540713, + "grad_norm": 0.28288534283638, + "learning_rate": 1.262867820291864e-07, + "loss": 0.1873, + "step": 7915 + }, + { + "epoch": 2.106439595529537, + "grad_norm": 0.2789905369281769, + "learning_rate": 1.2627048712332297e-07, + "loss": 0.1843, + "step": 7916 + }, + { + "epoch": 2.106705694518361, + "grad_norm": 0.34985074400901794, + "learning_rate": 1.2625419146817188e-07, + "loss": 0.1954, + "step": 7917 + }, + { + "epoch": 2.1069717935071846, + "grad_norm": 0.2802732288837433, + "learning_rate": 1.262378950641979e-07, + "loss": 0.1945, + "step": 7918 + }, + { + "epoch": 2.1072378924960087, + "grad_norm": 0.26159656047821045, + "learning_rate": 1.2622159791186585e-07, + "loss": 0.1785, + "step": 7919 + }, + { + "epoch": 2.1075039914848324, + "grad_norm": 0.3456420600414276, + "learning_rate": 1.2620530001164052e-07, + "loss": 0.1842, + "step": 7920 + }, + { + "epoch": 2.107770090473656, + "grad_norm": 0.2525159418582916, + "learning_rate": 1.2618900136398678e-07, + "loss": 0.162, + "step": 7921 + }, + { + "epoch": 2.10803618946248, + "grad_norm": 0.3757179379463196, + "learning_rate": 1.2617270196936952e-07, + "loss": 0.1945, + "step": 7922 + }, + { + "epoch": 2.108302288451304, + "grad_norm": 0.2836836278438568, + "learning_rate": 1.261564018282536e-07, + "loss": 0.1698, + "step": 7923 + }, + { + "epoch": 2.1085683874401275, + "grad_norm": 0.26304152607917786, + "learning_rate": 1.2614010094110396e-07, + "loss": 0.1712, + "step": 7924 + }, + { + "epoch": 2.1088344864289517, + "grad_norm": 0.2691046893596649, + "learning_rate": 1.2612379930838552e-07, + "loss": 0.1634, + "step": 7925 + }, + { + "epoch": 2.1091005854177753, + "grad_norm": 0.7747384309768677, + "learning_rate": 1.261074969305632e-07, + "loss": 0.1826, + "step": 7926 + }, + { + "epoch": 2.1093666844065995, + "grad_norm": 0.25232017040252686, + "learning_rate": 1.2609119380810207e-07, + "loss": 0.1641, + "step": 7927 + }, + { + "epoch": 2.109632783395423, + "grad_norm": 0.4026581645011902, + "learning_rate": 1.2607488994146703e-07, + "loss": 0.1924, + "step": 7928 + }, + { + "epoch": 2.109898882384247, + "grad_norm": 0.40337637066841125, + "learning_rate": 1.2605858533112313e-07, + "loss": 0.2118, + "step": 7929 + }, + { + "epoch": 2.110164981373071, + "grad_norm": 0.2704067826271057, + "learning_rate": 1.2604227997753546e-07, + "loss": 0.1911, + "step": 7930 + }, + { + "epoch": 2.1104310803618946, + "grad_norm": 0.2913413941860199, + "learning_rate": 1.2602597388116904e-07, + "loss": 0.1824, + "step": 7931 + }, + { + "epoch": 2.1106971793507183, + "grad_norm": 0.26359009742736816, + "learning_rate": 1.2600966704248896e-07, + "loss": 0.1698, + "step": 7932 + }, + { + "epoch": 2.1109632783395424, + "grad_norm": 0.3362525403499603, + "learning_rate": 1.2599335946196032e-07, + "loss": 0.1698, + "step": 7933 + }, + { + "epoch": 2.111229377328366, + "grad_norm": 0.2764287292957306, + "learning_rate": 1.2597705114004823e-07, + "loss": 0.1913, + "step": 7934 + }, + { + "epoch": 2.11149547631719, + "grad_norm": 0.25498440861701965, + "learning_rate": 1.259607420772179e-07, + "loss": 0.1782, + "step": 7935 + }, + { + "epoch": 2.111761575306014, + "grad_norm": 0.4860849380493164, + "learning_rate": 1.2594443227393438e-07, + "loss": 0.199, + "step": 7936 + }, + { + "epoch": 2.1120276742948376, + "grad_norm": 0.24850179255008698, + "learning_rate": 1.25928121730663e-07, + "loss": 0.1653, + "step": 7937 + }, + { + "epoch": 2.1122937732836613, + "grad_norm": 0.34457218647003174, + "learning_rate": 1.2591181044786888e-07, + "loss": 0.184, + "step": 7938 + }, + { + "epoch": 2.1125598722724854, + "grad_norm": 0.3646604120731354, + "learning_rate": 1.2589549842601724e-07, + "loss": 0.2024, + "step": 7939 + }, + { + "epoch": 2.112825971261309, + "grad_norm": 0.24801990389823914, + "learning_rate": 1.258791856655734e-07, + "loss": 0.1646, + "step": 7940 + }, + { + "epoch": 2.1130920702501332, + "grad_norm": 0.2582978904247284, + "learning_rate": 1.2586287216700258e-07, + "loss": 0.1545, + "step": 7941 + }, + { + "epoch": 2.113358169238957, + "grad_norm": 0.34041905403137207, + "learning_rate": 1.2584655793077008e-07, + "loss": 0.1722, + "step": 7942 + }, + { + "epoch": 2.1136242682277806, + "grad_norm": 0.2762170433998108, + "learning_rate": 1.2583024295734121e-07, + "loss": 0.1896, + "step": 7943 + }, + { + "epoch": 2.1138903672166047, + "grad_norm": 0.25249892473220825, + "learning_rate": 1.2581392724718136e-07, + "loss": 0.172, + "step": 7944 + }, + { + "epoch": 2.1141564662054284, + "grad_norm": 0.34353554248809814, + "learning_rate": 1.2579761080075585e-07, + "loss": 0.2021, + "step": 7945 + }, + { + "epoch": 2.114422565194252, + "grad_norm": 0.2603914141654968, + "learning_rate": 1.2578129361853e-07, + "loss": 0.1776, + "step": 7946 + }, + { + "epoch": 2.114688664183076, + "grad_norm": 0.2436153143644333, + "learning_rate": 1.257649757009693e-07, + "loss": 0.1741, + "step": 7947 + }, + { + "epoch": 2.1149547631719, + "grad_norm": 0.2660391330718994, + "learning_rate": 1.2574865704853914e-07, + "loss": 0.1829, + "step": 7948 + }, + { + "epoch": 2.1152208621607236, + "grad_norm": 0.382308691740036, + "learning_rate": 1.2573233766170498e-07, + "loss": 0.1857, + "step": 7949 + }, + { + "epoch": 2.1154869611495477, + "grad_norm": 0.3518318235874176, + "learning_rate": 1.2571601754093223e-07, + "loss": 0.1936, + "step": 7950 + }, + { + "epoch": 2.1157530601383714, + "grad_norm": 0.2825189530849457, + "learning_rate": 1.256996966866864e-07, + "loss": 0.1886, + "step": 7951 + }, + { + "epoch": 2.1160191591271955, + "grad_norm": 0.2764933705329895, + "learning_rate": 1.25683375099433e-07, + "loss": 0.1773, + "step": 7952 + }, + { + "epoch": 2.116285258116019, + "grad_norm": 0.42741379141807556, + "learning_rate": 1.256670527796376e-07, + "loss": 0.1949, + "step": 7953 + }, + { + "epoch": 2.116551357104843, + "grad_norm": 0.25721415877342224, + "learning_rate": 1.2565072972776561e-07, + "loss": 0.176, + "step": 7954 + }, + { + "epoch": 2.116817456093667, + "grad_norm": 0.3098341226577759, + "learning_rate": 1.2563440594428274e-07, + "loss": 0.2016, + "step": 7955 + }, + { + "epoch": 2.1170835550824907, + "grad_norm": 0.33805376291275024, + "learning_rate": 1.256180814296545e-07, + "loss": 0.1752, + "step": 7956 + }, + { + "epoch": 2.1173496540713144, + "grad_norm": 0.2467779815196991, + "learning_rate": 1.2560175618434653e-07, + "loss": 0.1685, + "step": 7957 + }, + { + "epoch": 2.1176157530601385, + "grad_norm": 0.4182392656803131, + "learning_rate": 1.2558543020882445e-07, + "loss": 0.1875, + "step": 7958 + }, + { + "epoch": 2.117881852048962, + "grad_norm": 0.29980748891830444, + "learning_rate": 1.2556910350355392e-07, + "loss": 0.1781, + "step": 7959 + }, + { + "epoch": 2.118147951037786, + "grad_norm": 0.31665217876434326, + "learning_rate": 1.255527760690006e-07, + "loss": 0.1952, + "step": 7960 + }, + { + "epoch": 2.11841405002661, + "grad_norm": 0.26776373386383057, + "learning_rate": 1.2553644790563013e-07, + "loss": 0.1855, + "step": 7961 + }, + { + "epoch": 2.1186801490154337, + "grad_norm": 0.3313705027103424, + "learning_rate": 1.2552011901390832e-07, + "loss": 0.1895, + "step": 7962 + }, + { + "epoch": 2.118946248004258, + "grad_norm": 0.271197110414505, + "learning_rate": 1.2550378939430084e-07, + "loss": 0.1799, + "step": 7963 + }, + { + "epoch": 2.1192123469930815, + "grad_norm": 0.27662351727485657, + "learning_rate": 1.2548745904727348e-07, + "loss": 0.1816, + "step": 7964 + }, + { + "epoch": 2.119478445981905, + "grad_norm": 0.2920103669166565, + "learning_rate": 1.2547112797329193e-07, + "loss": 0.175, + "step": 7965 + }, + { + "epoch": 2.1197445449707293, + "grad_norm": 0.2628104090690613, + "learning_rate": 1.254547961728221e-07, + "loss": 0.1856, + "step": 7966 + }, + { + "epoch": 2.120010643959553, + "grad_norm": 0.32654139399528503, + "learning_rate": 1.2543846364632976e-07, + "loss": 0.1856, + "step": 7967 + }, + { + "epoch": 2.1202767429483766, + "grad_norm": 0.38389942049980164, + "learning_rate": 1.2542213039428075e-07, + "loss": 0.1946, + "step": 7968 + }, + { + "epoch": 2.1205428419372008, + "grad_norm": 0.28321224451065063, + "learning_rate": 1.2540579641714092e-07, + "loss": 0.1847, + "step": 7969 + }, + { + "epoch": 2.1208089409260245, + "grad_norm": 0.24580572545528412, + "learning_rate": 1.2538946171537612e-07, + "loss": 0.1707, + "step": 7970 + }, + { + "epoch": 2.121075039914848, + "grad_norm": 0.2837364077568054, + "learning_rate": 1.253731262894523e-07, + "loss": 0.1774, + "step": 7971 + }, + { + "epoch": 2.1213411389036723, + "grad_norm": 0.2896627187728882, + "learning_rate": 1.2535679013983535e-07, + "loss": 0.1721, + "step": 7972 + }, + { + "epoch": 2.121607237892496, + "grad_norm": 0.25818049907684326, + "learning_rate": 1.2534045326699123e-07, + "loss": 0.1755, + "step": 7973 + }, + { + "epoch": 2.12187333688132, + "grad_norm": 0.3453352153301239, + "learning_rate": 1.253241156713859e-07, + "loss": 0.1867, + "step": 7974 + }, + { + "epoch": 2.1221394358701438, + "grad_norm": 0.27290451526641846, + "learning_rate": 1.253077773534853e-07, + "loss": 0.1837, + "step": 7975 + }, + { + "epoch": 2.1224055348589674, + "grad_norm": 0.4116024672985077, + "learning_rate": 1.2529143831375546e-07, + "loss": 0.1796, + "step": 7976 + }, + { + "epoch": 2.1226716338477916, + "grad_norm": 0.27123400568962097, + "learning_rate": 1.2527509855266245e-07, + "loss": 0.1705, + "step": 7977 + }, + { + "epoch": 2.1229377328366152, + "grad_norm": 0.2552346885204315, + "learning_rate": 1.2525875807067224e-07, + "loss": 0.1737, + "step": 7978 + }, + { + "epoch": 2.123203831825439, + "grad_norm": 0.3966948986053467, + "learning_rate": 1.252424168682509e-07, + "loss": 0.1988, + "step": 7979 + }, + { + "epoch": 2.123469930814263, + "grad_norm": 0.2669415771961212, + "learning_rate": 1.2522607494586456e-07, + "loss": 0.1923, + "step": 7980 + }, + { + "epoch": 2.1237360298030867, + "grad_norm": 0.3747343122959137, + "learning_rate": 1.252097323039793e-07, + "loss": 0.1717, + "step": 7981 + }, + { + "epoch": 2.1240021287919104, + "grad_norm": 0.38838350772857666, + "learning_rate": 1.2519338894306124e-07, + "loss": 0.1778, + "step": 7982 + }, + { + "epoch": 2.1242682277807345, + "grad_norm": 0.34755831956863403, + "learning_rate": 1.2517704486357652e-07, + "loss": 0.1815, + "step": 7983 + }, + { + "epoch": 2.1245343267695582, + "grad_norm": 0.32791441679000854, + "learning_rate": 1.2516070006599132e-07, + "loss": 0.1901, + "step": 7984 + }, + { + "epoch": 2.124800425758382, + "grad_norm": 0.2601814568042755, + "learning_rate": 1.2514435455077183e-07, + "loss": 0.1729, + "step": 7985 + }, + { + "epoch": 2.125066524747206, + "grad_norm": 0.27956753969192505, + "learning_rate": 1.2512800831838426e-07, + "loss": 0.1741, + "step": 7986 + }, + { + "epoch": 2.1253326237360297, + "grad_norm": 0.3397403061389923, + "learning_rate": 1.251116613692948e-07, + "loss": 0.2016, + "step": 7987 + }, + { + "epoch": 2.125598722724854, + "grad_norm": 0.388857364654541, + "learning_rate": 1.2509531370396976e-07, + "loss": 0.1818, + "step": 7988 + }, + { + "epoch": 2.1258648217136775, + "grad_norm": 0.26082488894462585, + "learning_rate": 1.2507896532287542e-07, + "loss": 0.1689, + "step": 7989 + }, + { + "epoch": 2.126130920702501, + "grad_norm": 0.5561133623123169, + "learning_rate": 1.2506261622647797e-07, + "loss": 0.1787, + "step": 7990 + }, + { + "epoch": 2.1263970196913253, + "grad_norm": 0.3876844346523285, + "learning_rate": 1.2504626641524377e-07, + "loss": 0.1918, + "step": 7991 + }, + { + "epoch": 2.126663118680149, + "grad_norm": 0.27104830741882324, + "learning_rate": 1.250299158896392e-07, + "loss": 0.1809, + "step": 7992 + }, + { + "epoch": 2.1269292176689727, + "grad_norm": 0.2935081720352173, + "learning_rate": 1.2501356465013052e-07, + "loss": 0.1853, + "step": 7993 + }, + { + "epoch": 2.127195316657797, + "grad_norm": 0.2663858234882355, + "learning_rate": 1.2499721269718416e-07, + "loss": 0.1827, + "step": 7994 + }, + { + "epoch": 2.1274614156466205, + "grad_norm": 0.2680341899394989, + "learning_rate": 1.249808600312665e-07, + "loss": 0.1837, + "step": 7995 + }, + { + "epoch": 2.127727514635444, + "grad_norm": 0.32603368163108826, + "learning_rate": 1.2496450665284394e-07, + "loss": 0.1825, + "step": 7996 + }, + { + "epoch": 2.1279936136242683, + "grad_norm": 0.36236029863357544, + "learning_rate": 1.2494815256238293e-07, + "loss": 0.1973, + "step": 7997 + }, + { + "epoch": 2.128259712613092, + "grad_norm": 0.320419043302536, + "learning_rate": 1.249317977603499e-07, + "loss": 0.1884, + "step": 7998 + }, + { + "epoch": 2.128525811601916, + "grad_norm": 0.2596675753593445, + "learning_rate": 1.2491544224721135e-07, + "loss": 0.1801, + "step": 7999 + }, + { + "epoch": 2.12879191059074, + "grad_norm": 0.2679581344127655, + "learning_rate": 1.2489908602343376e-07, + "loss": 0.1795, + "step": 8000 + }, + { + "epoch": 2.1290580095795635, + "grad_norm": 0.2732389271259308, + "learning_rate": 1.248827290894836e-07, + "loss": 0.188, + "step": 8001 + }, + { + "epoch": 2.1293241085683876, + "grad_norm": 0.2694370746612549, + "learning_rate": 1.2486637144582748e-07, + "loss": 0.198, + "step": 8002 + }, + { + "epoch": 2.1295902075572113, + "grad_norm": 0.27865737676620483, + "learning_rate": 1.248500130929319e-07, + "loss": 0.1974, + "step": 8003 + }, + { + "epoch": 2.129856306546035, + "grad_norm": 0.24714457988739014, + "learning_rate": 1.2483365403126346e-07, + "loss": 0.1616, + "step": 8004 + }, + { + "epoch": 2.130122405534859, + "grad_norm": 0.2760639190673828, + "learning_rate": 1.2481729426128873e-07, + "loss": 0.1862, + "step": 8005 + }, + { + "epoch": 2.130388504523683, + "grad_norm": 0.25863635540008545, + "learning_rate": 1.2480093378347435e-07, + "loss": 0.1878, + "step": 8006 + }, + { + "epoch": 2.1306546035125065, + "grad_norm": 0.27831920981407166, + "learning_rate": 1.2478457259828695e-07, + "loss": 0.1876, + "step": 8007 + }, + { + "epoch": 2.1309207025013306, + "grad_norm": 0.3854982256889343, + "learning_rate": 1.2476821070619317e-07, + "loss": 0.1923, + "step": 8008 + }, + { + "epoch": 2.1311868014901543, + "grad_norm": 0.2684367597103119, + "learning_rate": 1.2475184810765967e-07, + "loss": 0.1732, + "step": 8009 + }, + { + "epoch": 2.131452900478978, + "grad_norm": 0.251615047454834, + "learning_rate": 1.2473548480315317e-07, + "loss": 0.1705, + "step": 8010 + }, + { + "epoch": 2.131718999467802, + "grad_norm": 0.3249287009239197, + "learning_rate": 1.2471912079314038e-07, + "loss": 0.2098, + "step": 8011 + }, + { + "epoch": 2.1319850984566258, + "grad_norm": 0.3396536111831665, + "learning_rate": 1.24702756078088e-07, + "loss": 0.194, + "step": 8012 + }, + { + "epoch": 2.13225119744545, + "grad_norm": 0.25715336203575134, + "learning_rate": 1.246863906584629e-07, + "loss": 0.1605, + "step": 8013 + }, + { + "epoch": 2.1325172964342736, + "grad_norm": 0.3156684935092926, + "learning_rate": 1.246700245347317e-07, + "loss": 0.1812, + "step": 8014 + }, + { + "epoch": 2.1327833954230973, + "grad_norm": 0.2891734540462494, + "learning_rate": 1.2465365770736132e-07, + "loss": 0.2109, + "step": 8015 + }, + { + "epoch": 2.1330494944119214, + "grad_norm": 0.24076558649539948, + "learning_rate": 1.2463729017681847e-07, + "loss": 0.1627, + "step": 8016 + }, + { + "epoch": 2.133315593400745, + "grad_norm": 0.5528431534767151, + "learning_rate": 1.2462092194357008e-07, + "loss": 0.1853, + "step": 8017 + }, + { + "epoch": 2.1335816923895687, + "grad_norm": 0.3201196491718292, + "learning_rate": 1.2460455300808296e-07, + "loss": 0.1848, + "step": 8018 + }, + { + "epoch": 2.133847791378393, + "grad_norm": 0.30337631702423096, + "learning_rate": 1.2458818337082396e-07, + "loss": 0.1822, + "step": 8019 + }, + { + "epoch": 2.1341138903672165, + "grad_norm": 0.3304184079170227, + "learning_rate": 1.2457181303226004e-07, + "loss": 0.2128, + "step": 8020 + }, + { + "epoch": 2.1343799893560407, + "grad_norm": 0.26040181517601013, + "learning_rate": 1.2455544199285806e-07, + "loss": 0.1644, + "step": 8021 + }, + { + "epoch": 2.1346460883448644, + "grad_norm": 0.275628924369812, + "learning_rate": 1.24539070253085e-07, + "loss": 0.1564, + "step": 8022 + }, + { + "epoch": 2.134912187333688, + "grad_norm": 0.3309251070022583, + "learning_rate": 1.2452269781340775e-07, + "loss": 0.1853, + "step": 8023 + }, + { + "epoch": 2.135178286322512, + "grad_norm": 0.2496664971113205, + "learning_rate": 1.2450632467429334e-07, + "loss": 0.1598, + "step": 8024 + }, + { + "epoch": 2.135444385311336, + "grad_norm": 0.2782982885837555, + "learning_rate": 1.2448995083620877e-07, + "loss": 0.1916, + "step": 8025 + }, + { + "epoch": 2.1357104843001595, + "grad_norm": 0.2952575385570526, + "learning_rate": 1.2447357629962102e-07, + "loss": 0.1931, + "step": 8026 + }, + { + "epoch": 2.1359765832889837, + "grad_norm": 0.35757145285606384, + "learning_rate": 1.2445720106499713e-07, + "loss": 0.2021, + "step": 8027 + }, + { + "epoch": 2.1362426822778073, + "grad_norm": 0.31065836548805237, + "learning_rate": 1.244408251328042e-07, + "loss": 0.186, + "step": 8028 + }, + { + "epoch": 2.136508781266631, + "grad_norm": 0.37540069222450256, + "learning_rate": 1.2442444850350927e-07, + "loss": 0.1996, + "step": 8029 + }, + { + "epoch": 2.136774880255455, + "grad_norm": 0.27318403124809265, + "learning_rate": 1.2440807117757938e-07, + "loss": 0.1791, + "step": 8030 + }, + { + "epoch": 2.137040979244279, + "grad_norm": 0.2454986572265625, + "learning_rate": 1.2439169315548178e-07, + "loss": 0.164, + "step": 8031 + }, + { + "epoch": 2.1373070782331025, + "grad_norm": 0.2879069149494171, + "learning_rate": 1.2437531443768347e-07, + "loss": 0.1882, + "step": 8032 + }, + { + "epoch": 2.1375731772219266, + "grad_norm": 0.32525405287742615, + "learning_rate": 1.243589350246517e-07, + "loss": 0.1881, + "step": 8033 + }, + { + "epoch": 2.1378392762107503, + "grad_norm": 0.3792788088321686, + "learning_rate": 1.2434255491685354e-07, + "loss": 0.2097, + "step": 8034 + }, + { + "epoch": 2.1381053751995744, + "grad_norm": 1.22763991355896, + "learning_rate": 1.243261741147563e-07, + "loss": 0.1872, + "step": 8035 + }, + { + "epoch": 2.138371474188398, + "grad_norm": 0.3110146224498749, + "learning_rate": 1.2430979261882712e-07, + "loss": 0.186, + "step": 8036 + }, + { + "epoch": 2.138637573177222, + "grad_norm": 0.3565302789211273, + "learning_rate": 1.2429341042953326e-07, + "loss": 0.1971, + "step": 8037 + }, + { + "epoch": 2.138903672166046, + "grad_norm": 0.2750866711139679, + "learning_rate": 1.2427702754734196e-07, + "loss": 0.1987, + "step": 8038 + }, + { + "epoch": 2.1391697711548696, + "grad_norm": 0.2885468900203705, + "learning_rate": 1.242606439727205e-07, + "loss": 0.1815, + "step": 8039 + }, + { + "epoch": 2.1394358701436933, + "grad_norm": 0.3050251603126526, + "learning_rate": 1.242442597061362e-07, + "loss": 0.1728, + "step": 8040 + }, + { + "epoch": 2.1397019691325174, + "grad_norm": 0.28542277216911316, + "learning_rate": 1.2422787474805634e-07, + "loss": 0.1927, + "step": 8041 + }, + { + "epoch": 2.139968068121341, + "grad_norm": 0.302010715007782, + "learning_rate": 1.2421148909894822e-07, + "loss": 0.1881, + "step": 8042 + }, + { + "epoch": 2.140234167110165, + "grad_norm": 0.4688500761985779, + "learning_rate": 1.2419510275927926e-07, + "loss": 0.2097, + "step": 8043 + }, + { + "epoch": 2.140500266098989, + "grad_norm": 0.34488433599472046, + "learning_rate": 1.241787157295168e-07, + "loss": 0.2021, + "step": 8044 + }, + { + "epoch": 2.1407663650878126, + "grad_norm": 0.2538907825946808, + "learning_rate": 1.2416232801012823e-07, + "loss": 0.1689, + "step": 8045 + }, + { + "epoch": 2.1410324640766367, + "grad_norm": 0.3014608919620514, + "learning_rate": 1.2414593960158098e-07, + "loss": 0.1748, + "step": 8046 + }, + { + "epoch": 2.1412985630654604, + "grad_norm": 0.2677623927593231, + "learning_rate": 1.2412955050434247e-07, + "loss": 0.1725, + "step": 8047 + }, + { + "epoch": 2.141564662054284, + "grad_norm": 0.39921173453330994, + "learning_rate": 1.241131607188801e-07, + "loss": 0.2287, + "step": 8048 + }, + { + "epoch": 2.141830761043108, + "grad_norm": 0.28636813163757324, + "learning_rate": 1.2409677024566143e-07, + "loss": 0.1801, + "step": 8049 + }, + { + "epoch": 2.142096860031932, + "grad_norm": 0.42718493938446045, + "learning_rate": 1.240803790851539e-07, + "loss": 0.1974, + "step": 8050 + }, + { + "epoch": 2.1423629590207556, + "grad_norm": 0.32711803913116455, + "learning_rate": 1.2406398723782504e-07, + "loss": 0.1741, + "step": 8051 + }, + { + "epoch": 2.1426290580095797, + "grad_norm": 0.2654954195022583, + "learning_rate": 1.240475947041423e-07, + "loss": 0.1979, + "step": 8052 + }, + { + "epoch": 2.1428951569984034, + "grad_norm": 0.3176085948944092, + "learning_rate": 1.2403120148457336e-07, + "loss": 0.2152, + "step": 8053 + }, + { + "epoch": 2.143161255987227, + "grad_norm": 0.2890773117542267, + "learning_rate": 1.240148075795857e-07, + "loss": 0.2038, + "step": 8054 + }, + { + "epoch": 2.143427354976051, + "grad_norm": 0.29126623272895813, + "learning_rate": 1.2399841298964691e-07, + "loss": 0.1733, + "step": 8055 + }, + { + "epoch": 2.143693453964875, + "grad_norm": 0.28066977858543396, + "learning_rate": 1.239820177152246e-07, + "loss": 0.1895, + "step": 8056 + }, + { + "epoch": 2.1439595529536986, + "grad_norm": 0.3616546094417572, + "learning_rate": 1.2396562175678647e-07, + "loss": 0.2028, + "step": 8057 + }, + { + "epoch": 2.1442256519425227, + "grad_norm": 0.27934134006500244, + "learning_rate": 1.2394922511480006e-07, + "loss": 0.1896, + "step": 8058 + }, + { + "epoch": 2.1444917509313464, + "grad_norm": 0.25541234016418457, + "learning_rate": 1.239328277897331e-07, + "loss": 0.1587, + "step": 8059 + }, + { + "epoch": 2.1447578499201705, + "grad_norm": 0.2621113359928131, + "learning_rate": 1.2391642978205324e-07, + "loss": 0.179, + "step": 8060 + }, + { + "epoch": 2.145023948908994, + "grad_norm": 0.28403666615486145, + "learning_rate": 1.2390003109222823e-07, + "loss": 0.205, + "step": 8061 + }, + { + "epoch": 2.145290047897818, + "grad_norm": 0.6001901030540466, + "learning_rate": 1.2388363172072576e-07, + "loss": 0.2051, + "step": 8062 + }, + { + "epoch": 2.145556146886642, + "grad_norm": 0.4301295280456543, + "learning_rate": 1.2386723166801356e-07, + "loss": 0.1904, + "step": 8063 + }, + { + "epoch": 2.1458222458754657, + "grad_norm": 0.2932555675506592, + "learning_rate": 1.2385083093455944e-07, + "loss": 0.1667, + "step": 8064 + }, + { + "epoch": 2.1460883448642893, + "grad_norm": 0.2698357403278351, + "learning_rate": 1.2383442952083113e-07, + "loss": 0.1768, + "step": 8065 + }, + { + "epoch": 2.1463544438531135, + "grad_norm": 0.3472820818424225, + "learning_rate": 1.2381802742729649e-07, + "loss": 0.197, + "step": 8066 + }, + { + "epoch": 2.146620542841937, + "grad_norm": 0.26829320192337036, + "learning_rate": 1.2380162465442326e-07, + "loss": 0.1813, + "step": 8067 + }, + { + "epoch": 2.1468866418307613, + "grad_norm": 0.256136029958725, + "learning_rate": 1.2378522120267937e-07, + "loss": 0.1728, + "step": 8068 + }, + { + "epoch": 2.147152740819585, + "grad_norm": 0.27000027894973755, + "learning_rate": 1.237688170725326e-07, + "loss": 0.1709, + "step": 8069 + }, + { + "epoch": 2.1474188398084086, + "grad_norm": 0.2765986919403076, + "learning_rate": 1.2375241226445088e-07, + "loss": 0.1877, + "step": 8070 + }, + { + "epoch": 2.1476849387972328, + "grad_norm": 0.2754260003566742, + "learning_rate": 1.237360067789021e-07, + "loss": 0.1871, + "step": 8071 + }, + { + "epoch": 2.1479510377860564, + "grad_norm": 0.2925972044467926, + "learning_rate": 1.2371960061635417e-07, + "loss": 0.189, + "step": 8072 + }, + { + "epoch": 2.14821713677488, + "grad_norm": 0.29267418384552, + "learning_rate": 1.2370319377727504e-07, + "loss": 0.1989, + "step": 8073 + }, + { + "epoch": 2.1484832357637043, + "grad_norm": 0.2510054111480713, + "learning_rate": 1.2368678626213262e-07, + "loss": 0.1613, + "step": 8074 + }, + { + "epoch": 2.148749334752528, + "grad_norm": 0.6497068405151367, + "learning_rate": 1.2367037807139494e-07, + "loss": 0.1726, + "step": 8075 + }, + { + "epoch": 2.1490154337413516, + "grad_norm": 0.2618049681186676, + "learning_rate": 1.2365396920553e-07, + "loss": 0.1762, + "step": 8076 + }, + { + "epoch": 2.1492815327301757, + "grad_norm": 0.2798936367034912, + "learning_rate": 1.2363755966500578e-07, + "loss": 0.1793, + "step": 8077 + }, + { + "epoch": 2.1495476317189994, + "grad_norm": 0.27293825149536133, + "learning_rate": 1.2362114945029032e-07, + "loss": 0.1797, + "step": 8078 + }, + { + "epoch": 2.149813730707823, + "grad_norm": 0.34946006536483765, + "learning_rate": 1.2360473856185167e-07, + "loss": 0.1893, + "step": 8079 + }, + { + "epoch": 2.1500798296966472, + "grad_norm": 0.27666690945625305, + "learning_rate": 1.2358832700015792e-07, + "loss": 0.1761, + "step": 8080 + }, + { + "epoch": 2.150345928685471, + "grad_norm": 0.2638992965221405, + "learning_rate": 1.2357191476567715e-07, + "loss": 0.1825, + "step": 8081 + }, + { + "epoch": 2.1506120276742946, + "grad_norm": 0.25265252590179443, + "learning_rate": 1.2355550185887747e-07, + "loss": 0.166, + "step": 8082 + }, + { + "epoch": 2.1508781266631187, + "grad_norm": 0.3063059449195862, + "learning_rate": 1.2353908828022701e-07, + "loss": 0.1867, + "step": 8083 + }, + { + "epoch": 2.1511442256519424, + "grad_norm": 0.44074171781539917, + "learning_rate": 1.2352267403019391e-07, + "loss": 0.1871, + "step": 8084 + }, + { + "epoch": 2.1514103246407665, + "grad_norm": 0.39162981510162354, + "learning_rate": 1.2350625910924635e-07, + "loss": 0.1827, + "step": 8085 + }, + { + "epoch": 2.15167642362959, + "grad_norm": 0.2921571731567383, + "learning_rate": 1.2348984351785253e-07, + "loss": 0.1869, + "step": 8086 + }, + { + "epoch": 2.151942522618414, + "grad_norm": 0.3130781650543213, + "learning_rate": 1.2347342725648065e-07, + "loss": 0.186, + "step": 8087 + }, + { + "epoch": 2.152208621607238, + "grad_norm": 0.2923809587955475, + "learning_rate": 1.2345701032559888e-07, + "loss": 0.181, + "step": 8088 + }, + { + "epoch": 2.1524747205960617, + "grad_norm": 0.28396251797676086, + "learning_rate": 1.2344059272567552e-07, + "loss": 0.1752, + "step": 8089 + }, + { + "epoch": 2.1527408195848854, + "grad_norm": 0.3136967420578003, + "learning_rate": 1.2342417445717886e-07, + "loss": 0.1745, + "step": 8090 + }, + { + "epoch": 2.1530069185737095, + "grad_norm": 0.3412531018257141, + "learning_rate": 1.234077555205771e-07, + "loss": 0.19, + "step": 8091 + }, + { + "epoch": 2.153273017562533, + "grad_norm": 0.33264291286468506, + "learning_rate": 1.2339133591633858e-07, + "loss": 0.1987, + "step": 8092 + }, + { + "epoch": 2.1535391165513573, + "grad_norm": 0.27987635135650635, + "learning_rate": 1.2337491564493164e-07, + "loss": 0.1709, + "step": 8093 + }, + { + "epoch": 2.153805215540181, + "grad_norm": 0.3821612298488617, + "learning_rate": 1.2335849470682462e-07, + "loss": 0.1856, + "step": 8094 + }, + { + "epoch": 2.1540713145290047, + "grad_norm": 0.2547106146812439, + "learning_rate": 1.2334207310248587e-07, + "loss": 0.1658, + "step": 8095 + }, + { + "epoch": 2.154337413517829, + "grad_norm": 0.2785050868988037, + "learning_rate": 1.2332565083238373e-07, + "loss": 0.1805, + "step": 8096 + }, + { + "epoch": 2.1546035125066525, + "grad_norm": 0.2752068042755127, + "learning_rate": 1.2330922789698665e-07, + "loss": 0.1861, + "step": 8097 + }, + { + "epoch": 2.154869611495476, + "grad_norm": 0.27360498905181885, + "learning_rate": 1.2329280429676304e-07, + "loss": 0.2018, + "step": 8098 + }, + { + "epoch": 2.1551357104843003, + "grad_norm": 0.2657673954963684, + "learning_rate": 1.2327638003218127e-07, + "loss": 0.1918, + "step": 8099 + }, + { + "epoch": 2.155401809473124, + "grad_norm": 0.7571748495101929, + "learning_rate": 1.2325995510370987e-07, + "loss": 0.1606, + "step": 8100 + }, + { + "epoch": 2.1556679084619477, + "grad_norm": 0.4034060537815094, + "learning_rate": 1.232435295118173e-07, + "loss": 0.1832, + "step": 8101 + }, + { + "epoch": 2.155934007450772, + "grad_norm": 0.34182459115982056, + "learning_rate": 1.2322710325697203e-07, + "loss": 0.1925, + "step": 8102 + }, + { + "epoch": 2.1562001064395955, + "grad_norm": 0.3834214210510254, + "learning_rate": 1.2321067633964255e-07, + "loss": 0.188, + "step": 8103 + }, + { + "epoch": 2.156466205428419, + "grad_norm": 0.33110275864601135, + "learning_rate": 1.2319424876029745e-07, + "loss": 0.1953, + "step": 8104 + }, + { + "epoch": 2.1567323044172433, + "grad_norm": 0.3355504870414734, + "learning_rate": 1.231778205194052e-07, + "loss": 0.1903, + "step": 8105 + }, + { + "epoch": 2.156998403406067, + "grad_norm": 0.3244190812110901, + "learning_rate": 1.2316139161743444e-07, + "loss": 0.193, + "step": 8106 + }, + { + "epoch": 2.157264502394891, + "grad_norm": 0.2848764955997467, + "learning_rate": 1.2314496205485372e-07, + "loss": 0.1792, + "step": 8107 + }, + { + "epoch": 2.1575306013837148, + "grad_norm": 0.2703205347061157, + "learning_rate": 1.2312853183213166e-07, + "loss": 0.1812, + "step": 8108 + }, + { + "epoch": 2.1577967003725385, + "grad_norm": 0.29114583134651184, + "learning_rate": 1.2311210094973685e-07, + "loss": 0.1965, + "step": 8109 + }, + { + "epoch": 2.1580627993613626, + "grad_norm": 0.3859729468822479, + "learning_rate": 1.2309566940813795e-07, + "loss": 0.194, + "step": 8110 + }, + { + "epoch": 2.1583288983501863, + "grad_norm": 0.3864772617816925, + "learning_rate": 1.2307923720780362e-07, + "loss": 0.2037, + "step": 8111 + }, + { + "epoch": 2.15859499733901, + "grad_norm": 0.3853074014186859, + "learning_rate": 1.2306280434920257e-07, + "loss": 0.207, + "step": 8112 + }, + { + "epoch": 2.158861096327834, + "grad_norm": 0.2888564467430115, + "learning_rate": 1.230463708328035e-07, + "loss": 0.191, + "step": 8113 + }, + { + "epoch": 2.1591271953166578, + "grad_norm": 0.28948166966438293, + "learning_rate": 1.2302993665907503e-07, + "loss": 0.1948, + "step": 8114 + }, + { + "epoch": 2.1593932943054814, + "grad_norm": 0.2657705843448639, + "learning_rate": 1.2301350182848603e-07, + "loss": 0.1702, + "step": 8115 + }, + { + "epoch": 2.1596593932943056, + "grad_norm": 0.40841126441955566, + "learning_rate": 1.2299706634150518e-07, + "loss": 0.1937, + "step": 8116 + }, + { + "epoch": 2.1599254922831292, + "grad_norm": 0.3318862318992615, + "learning_rate": 1.2298063019860126e-07, + "loss": 0.1935, + "step": 8117 + }, + { + "epoch": 2.1601915912719534, + "grad_norm": 0.3318102955818176, + "learning_rate": 1.2296419340024305e-07, + "loss": 0.1936, + "step": 8118 + }, + { + "epoch": 2.160457690260777, + "grad_norm": 0.29215407371520996, + "learning_rate": 1.2294775594689938e-07, + "loss": 0.1845, + "step": 8119 + }, + { + "epoch": 2.1607237892496007, + "grad_norm": 0.4858647584915161, + "learning_rate": 1.2293131783903912e-07, + "loss": 0.1799, + "step": 8120 + }, + { + "epoch": 2.160989888238425, + "grad_norm": 0.2817660868167877, + "learning_rate": 1.22914879077131e-07, + "loss": 0.1918, + "step": 8121 + }, + { + "epoch": 2.1612559872272485, + "grad_norm": 0.2841614782810211, + "learning_rate": 1.2289843966164403e-07, + "loss": 0.1785, + "step": 8122 + }, + { + "epoch": 2.1615220862160722, + "grad_norm": 0.3874400556087494, + "learning_rate": 1.2288199959304702e-07, + "loss": 0.1898, + "step": 8123 + }, + { + "epoch": 2.1617881852048964, + "grad_norm": 0.37213096022605896, + "learning_rate": 1.2286555887180887e-07, + "loss": 0.1944, + "step": 8124 + }, + { + "epoch": 2.16205428419372, + "grad_norm": 0.40643107891082764, + "learning_rate": 1.228491174983985e-07, + "loss": 0.1888, + "step": 8125 + }, + { + "epoch": 2.1623203831825437, + "grad_norm": 0.3865828514099121, + "learning_rate": 1.228326754732849e-07, + "loss": 0.2072, + "step": 8126 + }, + { + "epoch": 2.162586482171368, + "grad_norm": 0.4557808041572571, + "learning_rate": 1.22816232796937e-07, + "loss": 0.201, + "step": 8127 + }, + { + "epoch": 2.1628525811601915, + "grad_norm": 0.2911529242992401, + "learning_rate": 1.2279978946982375e-07, + "loss": 0.2122, + "step": 8128 + }, + { + "epoch": 2.163118680149015, + "grad_norm": 0.26316219568252563, + "learning_rate": 1.2278334549241416e-07, + "loss": 0.1619, + "step": 8129 + }, + { + "epoch": 2.1633847791378393, + "grad_norm": 0.41041553020477295, + "learning_rate": 1.227669008651773e-07, + "loss": 0.1957, + "step": 8130 + }, + { + "epoch": 2.163650878126663, + "grad_norm": 0.26981866359710693, + "learning_rate": 1.2275045558858212e-07, + "loss": 0.1729, + "step": 8131 + }, + { + "epoch": 2.163916977115487, + "grad_norm": 0.28218236565589905, + "learning_rate": 1.2273400966309773e-07, + "loss": 0.1841, + "step": 8132 + }, + { + "epoch": 2.164183076104311, + "grad_norm": 0.33707839250564575, + "learning_rate": 1.227175630891932e-07, + "loss": 0.1925, + "step": 8133 + }, + { + "epoch": 2.1644491750931345, + "grad_norm": 0.28572455048561096, + "learning_rate": 1.2270111586733763e-07, + "loss": 0.1756, + "step": 8134 + }, + { + "epoch": 2.1647152740819586, + "grad_norm": 0.2801037132740021, + "learning_rate": 1.2268466799800008e-07, + "loss": 0.1835, + "step": 8135 + }, + { + "epoch": 2.1649813730707823, + "grad_norm": 0.3740065395832062, + "learning_rate": 1.2266821948164967e-07, + "loss": 0.1888, + "step": 8136 + }, + { + "epoch": 2.165247472059606, + "grad_norm": 0.3011002540588379, + "learning_rate": 1.2265177031875562e-07, + "loss": 0.199, + "step": 8137 + }, + { + "epoch": 2.16551357104843, + "grad_norm": 0.3200371265411377, + "learning_rate": 1.2263532050978703e-07, + "loss": 0.175, + "step": 8138 + }, + { + "epoch": 2.165779670037254, + "grad_norm": 0.2773683965206146, + "learning_rate": 1.226188700552131e-07, + "loss": 0.173, + "step": 8139 + }, + { + "epoch": 2.166045769026078, + "grad_norm": 0.5282797813415527, + "learning_rate": 1.2260241895550302e-07, + "loss": 0.1993, + "step": 8140 + }, + { + "epoch": 2.1663118680149016, + "grad_norm": 0.2682245373725891, + "learning_rate": 1.2258596721112606e-07, + "loss": 0.1769, + "step": 8141 + }, + { + "epoch": 2.1665779670037253, + "grad_norm": 0.26697006821632385, + "learning_rate": 1.225695148225514e-07, + "loss": 0.1881, + "step": 8142 + }, + { + "epoch": 2.1668440659925494, + "grad_norm": 0.2535161077976227, + "learning_rate": 1.225530617902483e-07, + "loss": 0.1581, + "step": 8143 + }, + { + "epoch": 2.167110164981373, + "grad_norm": 0.25776129961013794, + "learning_rate": 1.2253660811468608e-07, + "loss": 0.1837, + "step": 8144 + }, + { + "epoch": 2.167376263970197, + "grad_norm": 0.33270978927612305, + "learning_rate": 1.22520153796334e-07, + "loss": 0.1807, + "step": 8145 + }, + { + "epoch": 2.167642362959021, + "grad_norm": 0.2951946258544922, + "learning_rate": 1.2250369883566133e-07, + "loss": 0.1881, + "step": 8146 + }, + { + "epoch": 2.1679084619478446, + "grad_norm": 0.3627220094203949, + "learning_rate": 1.224872432331374e-07, + "loss": 0.1721, + "step": 8147 + }, + { + "epoch": 2.1681745609366683, + "grad_norm": 0.3746505081653595, + "learning_rate": 1.2247078698923168e-07, + "loss": 0.1895, + "step": 8148 + }, + { + "epoch": 2.1684406599254924, + "grad_norm": 0.2848285436630249, + "learning_rate": 1.2245433010441343e-07, + "loss": 0.1789, + "step": 8149 + }, + { + "epoch": 2.168706758914316, + "grad_norm": 0.29106152057647705, + "learning_rate": 1.2243787257915207e-07, + "loss": 0.1842, + "step": 8150 + }, + { + "epoch": 2.1689728579031398, + "grad_norm": 0.33464720845222473, + "learning_rate": 1.2242141441391694e-07, + "loss": 0.1847, + "step": 8151 + }, + { + "epoch": 2.169238956891964, + "grad_norm": 0.355694055557251, + "learning_rate": 1.2240495560917756e-07, + "loss": 0.2031, + "step": 8152 + }, + { + "epoch": 2.1695050558807876, + "grad_norm": 0.25969579815864563, + "learning_rate": 1.223884961654033e-07, + "loss": 0.1665, + "step": 8153 + }, + { + "epoch": 2.1697711548696117, + "grad_norm": 0.2760595679283142, + "learning_rate": 1.223720360830636e-07, + "loss": 0.1826, + "step": 8154 + }, + { + "epoch": 2.1700372538584354, + "grad_norm": 0.2803528606891632, + "learning_rate": 1.2235557536262805e-07, + "loss": 0.1889, + "step": 8155 + }, + { + "epoch": 2.170303352847259, + "grad_norm": 0.2692826986312866, + "learning_rate": 1.22339114004566e-07, + "loss": 0.1816, + "step": 8156 + }, + { + "epoch": 2.170569451836083, + "grad_norm": 0.2809937298297882, + "learning_rate": 1.2232265200934702e-07, + "loss": 0.1871, + "step": 8157 + }, + { + "epoch": 2.170835550824907, + "grad_norm": 0.32375919818878174, + "learning_rate": 1.2230618937744067e-07, + "loss": 0.1774, + "step": 8158 + }, + { + "epoch": 2.1711016498137305, + "grad_norm": 0.3343255817890167, + "learning_rate": 1.2228972610931646e-07, + "loss": 0.1782, + "step": 8159 + }, + { + "epoch": 2.1713677488025547, + "grad_norm": 0.34947457909584045, + "learning_rate": 1.2227326220544397e-07, + "loss": 0.1735, + "step": 8160 + }, + { + "epoch": 2.1716338477913784, + "grad_norm": 0.2992110550403595, + "learning_rate": 1.2225679766629275e-07, + "loss": 0.188, + "step": 8161 + }, + { + "epoch": 2.171899946780202, + "grad_norm": 0.25628533959388733, + "learning_rate": 1.2224033249233248e-07, + "loss": 0.1704, + "step": 8162 + }, + { + "epoch": 2.172166045769026, + "grad_norm": 0.327495276927948, + "learning_rate": 1.222238666840327e-07, + "loss": 0.1834, + "step": 8163 + }, + { + "epoch": 2.17243214475785, + "grad_norm": 0.3437797427177429, + "learning_rate": 1.2220740024186308e-07, + "loss": 0.1915, + "step": 8164 + }, + { + "epoch": 2.172698243746674, + "grad_norm": 0.32894712686538696, + "learning_rate": 1.2219093316629333e-07, + "loss": 0.1844, + "step": 8165 + }, + { + "epoch": 2.1729643427354977, + "grad_norm": 0.3030150532722473, + "learning_rate": 1.22174465457793e-07, + "loss": 0.1742, + "step": 8166 + }, + { + "epoch": 2.1732304417243213, + "grad_norm": 0.3494333028793335, + "learning_rate": 1.2215799711683188e-07, + "loss": 0.1878, + "step": 8167 + }, + { + "epoch": 2.1734965407131455, + "grad_norm": 0.3780393600463867, + "learning_rate": 1.2214152814387968e-07, + "loss": 0.1787, + "step": 8168 + }, + { + "epoch": 2.173762639701969, + "grad_norm": 0.3507882356643677, + "learning_rate": 1.2212505853940605e-07, + "loss": 0.1655, + "step": 8169 + }, + { + "epoch": 2.174028738690793, + "grad_norm": 0.2803824841976166, + "learning_rate": 1.2210858830388084e-07, + "loss": 0.1911, + "step": 8170 + }, + { + "epoch": 2.174294837679617, + "grad_norm": 0.25501003861427307, + "learning_rate": 1.2209211743777377e-07, + "loss": 0.1753, + "step": 8171 + }, + { + "epoch": 2.1745609366684406, + "grad_norm": 0.4037012755870819, + "learning_rate": 1.2207564594155457e-07, + "loss": 0.1881, + "step": 8172 + }, + { + "epoch": 2.1748270356572643, + "grad_norm": 0.3476118743419647, + "learning_rate": 1.2205917381569314e-07, + "loss": 0.174, + "step": 8173 + }, + { + "epoch": 2.1750931346460884, + "grad_norm": 0.31263893842697144, + "learning_rate": 1.220427010606592e-07, + "loss": 0.1746, + "step": 8174 + }, + { + "epoch": 2.175359233634912, + "grad_norm": 0.3580380976200104, + "learning_rate": 1.2202622767692265e-07, + "loss": 0.1937, + "step": 8175 + }, + { + "epoch": 2.175625332623736, + "grad_norm": 0.28086745738983154, + "learning_rate": 1.2200975366495334e-07, + "loss": 0.2011, + "step": 8176 + }, + { + "epoch": 2.17589143161256, + "grad_norm": 0.32020121812820435, + "learning_rate": 1.2199327902522113e-07, + "loss": 0.181, + "step": 8177 + }, + { + "epoch": 2.1761575306013836, + "grad_norm": 0.27718037366867065, + "learning_rate": 1.219768037581959e-07, + "loss": 0.1834, + "step": 8178 + }, + { + "epoch": 2.1764236295902077, + "grad_norm": 0.2708565890789032, + "learning_rate": 1.2196032786434755e-07, + "loss": 0.1833, + "step": 8179 + }, + { + "epoch": 2.1766897285790314, + "grad_norm": 0.33722689747810364, + "learning_rate": 1.2194385134414606e-07, + "loss": 0.1867, + "step": 8180 + }, + { + "epoch": 2.176955827567855, + "grad_norm": 0.3025814890861511, + "learning_rate": 1.2192737419806133e-07, + "loss": 0.1791, + "step": 8181 + }, + { + "epoch": 2.1772219265566792, + "grad_norm": 0.26693740487098694, + "learning_rate": 1.2191089642656328e-07, + "loss": 0.1766, + "step": 8182 + }, + { + "epoch": 2.177488025545503, + "grad_norm": 0.26106977462768555, + "learning_rate": 1.2189441803012203e-07, + "loss": 0.1734, + "step": 8183 + }, + { + "epoch": 2.1777541245343266, + "grad_norm": 0.3261716663837433, + "learning_rate": 1.218779390092074e-07, + "loss": 0.1836, + "step": 8184 + }, + { + "epoch": 2.1780202235231507, + "grad_norm": 0.3786602318286896, + "learning_rate": 1.2186145936428953e-07, + "loss": 0.2, + "step": 8185 + }, + { + "epoch": 2.1782863225119744, + "grad_norm": 0.2762817144393921, + "learning_rate": 1.218449790958384e-07, + "loss": 0.1706, + "step": 8186 + }, + { + "epoch": 2.1785524215007985, + "grad_norm": 0.30967283248901367, + "learning_rate": 1.2182849820432407e-07, + "loss": 0.1777, + "step": 8187 + }, + { + "epoch": 2.178818520489622, + "grad_norm": 0.3365628123283386, + "learning_rate": 1.2181201669021667e-07, + "loss": 0.1963, + "step": 8188 + }, + { + "epoch": 2.179084619478446, + "grad_norm": 0.2741766571998596, + "learning_rate": 1.2179553455398617e-07, + "loss": 0.1742, + "step": 8189 + }, + { + "epoch": 2.17935071846727, + "grad_norm": 0.32165104150772095, + "learning_rate": 1.2177905179610276e-07, + "loss": 0.1803, + "step": 8190 + }, + { + "epoch": 2.1796168174560937, + "grad_norm": 0.2612735629081726, + "learning_rate": 1.217625684170365e-07, + "loss": 0.184, + "step": 8191 + }, + { + "epoch": 2.1798829164449174, + "grad_norm": 0.25377801060676575, + "learning_rate": 1.217460844172576e-07, + "loss": 0.1843, + "step": 8192 + }, + { + "epoch": 2.1801490154337415, + "grad_norm": 0.37724441289901733, + "learning_rate": 1.217295997972362e-07, + "loss": 0.1936, + "step": 8193 + }, + { + "epoch": 2.180415114422565, + "grad_norm": 0.39519211649894714, + "learning_rate": 1.2171311455744243e-07, + "loss": 0.207, + "step": 8194 + }, + { + "epoch": 2.180681213411389, + "grad_norm": 0.35126468539237976, + "learning_rate": 1.216966286983465e-07, + "loss": 0.1826, + "step": 8195 + }, + { + "epoch": 2.180947312400213, + "grad_norm": 0.28032323718070984, + "learning_rate": 1.2168014222041865e-07, + "loss": 0.1739, + "step": 8196 + }, + { + "epoch": 2.1812134113890367, + "grad_norm": 0.35166457295417786, + "learning_rate": 1.2166365512412907e-07, + "loss": 0.182, + "step": 8197 + }, + { + "epoch": 2.1814795103778604, + "grad_norm": 0.25140872597694397, + "learning_rate": 1.2164716740994804e-07, + "loss": 0.1598, + "step": 8198 + }, + { + "epoch": 2.1817456093666845, + "grad_norm": 0.2796361446380615, + "learning_rate": 1.2163067907834582e-07, + "loss": 0.181, + "step": 8199 + }, + { + "epoch": 2.182011708355508, + "grad_norm": 0.49057722091674805, + "learning_rate": 1.2161419012979263e-07, + "loss": 0.1786, + "step": 8200 + }, + { + "epoch": 2.1822778073443323, + "grad_norm": 0.2548057734966278, + "learning_rate": 1.2159770056475885e-07, + "loss": 0.1781, + "step": 8201 + }, + { + "epoch": 2.182543906333156, + "grad_norm": 0.33628544211387634, + "learning_rate": 1.2158121038371474e-07, + "loss": 0.1758, + "step": 8202 + }, + { + "epoch": 2.1828100053219797, + "grad_norm": 0.28966590762138367, + "learning_rate": 1.2156471958713066e-07, + "loss": 0.1831, + "step": 8203 + }, + { + "epoch": 2.183076104310804, + "grad_norm": 0.2671586573123932, + "learning_rate": 1.2154822817547696e-07, + "loss": 0.1721, + "step": 8204 + }, + { + "epoch": 2.1833422032996275, + "grad_norm": 0.36420395970344543, + "learning_rate": 1.2153173614922402e-07, + "loss": 0.1776, + "step": 8205 + }, + { + "epoch": 2.183608302288451, + "grad_norm": 0.26059725880622864, + "learning_rate": 1.215152435088422e-07, + "loss": 0.1763, + "step": 8206 + }, + { + "epoch": 2.1838744012772753, + "grad_norm": 0.2761191725730896, + "learning_rate": 1.214987502548019e-07, + "loss": 0.1822, + "step": 8207 + }, + { + "epoch": 2.184140500266099, + "grad_norm": 0.34419676661491394, + "learning_rate": 1.2148225638757356e-07, + "loss": 0.1901, + "step": 8208 + }, + { + "epoch": 2.1844065992549226, + "grad_norm": 0.26664918661117554, + "learning_rate": 1.2146576190762764e-07, + "loss": 0.1855, + "step": 8209 + }, + { + "epoch": 2.1846726982437468, + "grad_norm": 0.2473597228527069, + "learning_rate": 1.2144926681543455e-07, + "loss": 0.1631, + "step": 8210 + }, + { + "epoch": 2.1849387972325705, + "grad_norm": 0.263286292552948, + "learning_rate": 1.214327711114648e-07, + "loss": 0.1757, + "step": 8211 + }, + { + "epoch": 2.1852048962213946, + "grad_norm": 0.2606152594089508, + "learning_rate": 1.2141627479618884e-07, + "loss": 0.171, + "step": 8212 + }, + { + "epoch": 2.1854709952102183, + "grad_norm": 0.26211193203926086, + "learning_rate": 1.213997778700772e-07, + "loss": 0.1879, + "step": 8213 + }, + { + "epoch": 2.185737094199042, + "grad_norm": 0.26798123121261597, + "learning_rate": 1.2138328033360044e-07, + "loss": 0.1803, + "step": 8214 + }, + { + "epoch": 2.186003193187866, + "grad_norm": 0.29561087489128113, + "learning_rate": 1.2136678218722902e-07, + "loss": 0.1983, + "step": 8215 + }, + { + "epoch": 2.1862692921766897, + "grad_norm": 0.2756437659263611, + "learning_rate": 1.213502834314336e-07, + "loss": 0.1853, + "step": 8216 + }, + { + "epoch": 2.1865353911655134, + "grad_norm": 0.2808998227119446, + "learning_rate": 1.213337840666847e-07, + "loss": 0.1812, + "step": 8217 + }, + { + "epoch": 2.1868014901543376, + "grad_norm": 0.357623428106308, + "learning_rate": 1.2131728409345293e-07, + "loss": 0.178, + "step": 8218 + }, + { + "epoch": 2.1870675891431612, + "grad_norm": 0.39348968863487244, + "learning_rate": 1.2130078351220886e-07, + "loss": 0.1942, + "step": 8219 + }, + { + "epoch": 2.187333688131985, + "grad_norm": 0.2569487690925598, + "learning_rate": 1.212842823234232e-07, + "loss": 0.178, + "step": 8220 + }, + { + "epoch": 2.187599787120809, + "grad_norm": 0.3764972388744354, + "learning_rate": 1.2126778052756655e-07, + "loss": 0.1968, + "step": 8221 + }, + { + "epoch": 2.1878658861096327, + "grad_norm": 0.27742257714271545, + "learning_rate": 1.2125127812510958e-07, + "loss": 0.1882, + "step": 8222 + }, + { + "epoch": 2.1881319850984564, + "grad_norm": 0.4539504647254944, + "learning_rate": 1.2123477511652295e-07, + "loss": 0.1858, + "step": 8223 + }, + { + "epoch": 2.1883980840872805, + "grad_norm": 0.31687116622924805, + "learning_rate": 1.212182715022774e-07, + "loss": 0.1856, + "step": 8224 + }, + { + "epoch": 2.188664183076104, + "grad_norm": 0.306304007768631, + "learning_rate": 1.2120176728284363e-07, + "loss": 0.1789, + "step": 8225 + }, + { + "epoch": 2.1889302820649283, + "grad_norm": 0.2875177264213562, + "learning_rate": 1.2118526245869238e-07, + "loss": 0.1901, + "step": 8226 + }, + { + "epoch": 2.189196381053752, + "grad_norm": 0.28327709436416626, + "learning_rate": 1.2116875703029438e-07, + "loss": 0.194, + "step": 8227 + }, + { + "epoch": 2.1894624800425757, + "grad_norm": 0.2649803161621094, + "learning_rate": 1.2115225099812044e-07, + "loss": 0.1679, + "step": 8228 + }, + { + "epoch": 2.1897285790314, + "grad_norm": 0.26426637172698975, + "learning_rate": 1.211357443626413e-07, + "loss": 0.162, + "step": 8229 + }, + { + "epoch": 2.1899946780202235, + "grad_norm": 0.24521775543689728, + "learning_rate": 1.2111923712432776e-07, + "loss": 0.1585, + "step": 8230 + }, + { + "epoch": 2.190260777009047, + "grad_norm": 0.33835941553115845, + "learning_rate": 1.211027292836507e-07, + "loss": 0.1805, + "step": 8231 + }, + { + "epoch": 2.1905268759978713, + "grad_norm": 0.5383341312408447, + "learning_rate": 1.210862208410809e-07, + "loss": 0.1979, + "step": 8232 + }, + { + "epoch": 2.190792974986695, + "grad_norm": 0.263285756111145, + "learning_rate": 1.2106971179708923e-07, + "loss": 0.1732, + "step": 8233 + }, + { + "epoch": 2.1910590739755187, + "grad_norm": 0.27243897318840027, + "learning_rate": 1.2105320215214656e-07, + "loss": 0.1663, + "step": 8234 + }, + { + "epoch": 2.191325172964343, + "grad_norm": 0.27820947766304016, + "learning_rate": 1.210366919067238e-07, + "loss": 0.1986, + "step": 8235 + }, + { + "epoch": 2.1915912719531665, + "grad_norm": 0.2637464702129364, + "learning_rate": 1.2102018106129184e-07, + "loss": 0.1679, + "step": 8236 + }, + { + "epoch": 2.1918573709419906, + "grad_norm": 0.273364394903183, + "learning_rate": 1.210036696163216e-07, + "loss": 0.1702, + "step": 8237 + }, + { + "epoch": 2.1921234699308143, + "grad_norm": 0.29764464497566223, + "learning_rate": 1.20987157572284e-07, + "loss": 0.1888, + "step": 8238 + }, + { + "epoch": 2.192389568919638, + "grad_norm": 0.32416802644729614, + "learning_rate": 1.2097064492964998e-07, + "loss": 0.1922, + "step": 8239 + }, + { + "epoch": 2.192655667908462, + "grad_norm": 0.42197707295417786, + "learning_rate": 1.209541316888906e-07, + "loss": 0.2075, + "step": 8240 + }, + { + "epoch": 2.192921766897286, + "grad_norm": 0.28733906149864197, + "learning_rate": 1.209376178504768e-07, + "loss": 0.1648, + "step": 8241 + }, + { + "epoch": 2.1931878658861095, + "grad_norm": 0.37860915064811707, + "learning_rate": 1.2092110341487963e-07, + "loss": 0.1795, + "step": 8242 + }, + { + "epoch": 2.1934539648749336, + "grad_norm": 0.3235436677932739, + "learning_rate": 1.2090458838257002e-07, + "loss": 0.1718, + "step": 8243 + }, + { + "epoch": 2.1937200638637573, + "grad_norm": 0.3403884768486023, + "learning_rate": 1.208880727540191e-07, + "loss": 0.1997, + "step": 8244 + }, + { + "epoch": 2.193986162852581, + "grad_norm": 0.3233678638935089, + "learning_rate": 1.2087155652969788e-07, + "loss": 0.1881, + "step": 8245 + }, + { + "epoch": 2.194252261841405, + "grad_norm": 0.42813342809677124, + "learning_rate": 1.2085503971007748e-07, + "loss": 0.2252, + "step": 8246 + }, + { + "epoch": 2.1945183608302288, + "grad_norm": 0.4233801066875458, + "learning_rate": 1.2083852229562895e-07, + "loss": 0.1845, + "step": 8247 + }, + { + "epoch": 2.1947844598190525, + "grad_norm": 0.34492620825767517, + "learning_rate": 1.2082200428682342e-07, + "loss": 0.1984, + "step": 8248 + }, + { + "epoch": 2.1950505588078766, + "grad_norm": 0.34492623805999756, + "learning_rate": 1.20805485684132e-07, + "loss": 0.1766, + "step": 8249 + }, + { + "epoch": 2.1953166577967003, + "grad_norm": 0.2940952181816101, + "learning_rate": 1.207889664880259e-07, + "loss": 0.1868, + "step": 8250 + }, + { + "epoch": 2.1955827567855244, + "grad_norm": 0.27493149042129517, + "learning_rate": 1.2077244669897616e-07, + "loss": 0.174, + "step": 8251 + }, + { + "epoch": 2.195848855774348, + "grad_norm": 0.32386183738708496, + "learning_rate": 1.2075592631745405e-07, + "loss": 0.1744, + "step": 8252 + }, + { + "epoch": 2.1961149547631718, + "grad_norm": 0.28414186835289, + "learning_rate": 1.2073940534393079e-07, + "loss": 0.1576, + "step": 8253 + }, + { + "epoch": 2.196381053751996, + "grad_norm": 0.27559536695480347, + "learning_rate": 1.2072288377887751e-07, + "loss": 0.1807, + "step": 8254 + }, + { + "epoch": 2.1966471527408196, + "grad_norm": 0.30940762162208557, + "learning_rate": 1.2070636162276547e-07, + "loss": 0.1751, + "step": 8255 + }, + { + "epoch": 2.1969132517296432, + "grad_norm": 0.3019636869430542, + "learning_rate": 1.2068983887606593e-07, + "loss": 0.1784, + "step": 8256 + }, + { + "epoch": 2.1971793507184674, + "grad_norm": 0.27175748348236084, + "learning_rate": 1.2067331553925012e-07, + "loss": 0.1731, + "step": 8257 + }, + { + "epoch": 2.197445449707291, + "grad_norm": 0.2770686447620392, + "learning_rate": 1.2065679161278938e-07, + "loss": 0.1709, + "step": 8258 + }, + { + "epoch": 2.197711548696115, + "grad_norm": 0.25627270340919495, + "learning_rate": 1.206402670971549e-07, + "loss": 0.1682, + "step": 8259 + }, + { + "epoch": 2.197977647684939, + "grad_norm": 0.30777865648269653, + "learning_rate": 1.206237419928181e-07, + "loss": 0.2015, + "step": 8260 + }, + { + "epoch": 2.1982437466737625, + "grad_norm": 0.29611900448799133, + "learning_rate": 1.2060721630025032e-07, + "loss": 0.1908, + "step": 8261 + }, + { + "epoch": 2.1985098456625867, + "grad_norm": 0.3686164915561676, + "learning_rate": 1.2059069001992278e-07, + "loss": 0.1949, + "step": 8262 + }, + { + "epoch": 2.1987759446514104, + "grad_norm": 0.35829007625579834, + "learning_rate": 1.2057416315230694e-07, + "loss": 0.1729, + "step": 8263 + }, + { + "epoch": 2.199042043640234, + "grad_norm": 0.384156733751297, + "learning_rate": 1.2055763569787416e-07, + "loss": 0.1579, + "step": 8264 + }, + { + "epoch": 2.199308142629058, + "grad_norm": 0.3324761688709259, + "learning_rate": 1.2054110765709585e-07, + "loss": 0.1881, + "step": 8265 + }, + { + "epoch": 2.199574241617882, + "grad_norm": 0.29730668663978577, + "learning_rate": 1.205245790304434e-07, + "loss": 0.2015, + "step": 8266 + }, + { + "epoch": 2.1998403406067055, + "grad_norm": 0.3729459047317505, + "learning_rate": 1.2050804981838823e-07, + "loss": 0.1765, + "step": 8267 + }, + { + "epoch": 2.2001064395955297, + "grad_norm": 0.27438443899154663, + "learning_rate": 1.2049152002140185e-07, + "loss": 0.1977, + "step": 8268 + }, + { + "epoch": 2.2003725385843533, + "grad_norm": 0.4564077854156494, + "learning_rate": 1.2047498963995564e-07, + "loss": 0.2115, + "step": 8269 + }, + { + "epoch": 2.200638637573177, + "grad_norm": 0.3608035743236542, + "learning_rate": 1.204584586745211e-07, + "loss": 0.1902, + "step": 8270 + }, + { + "epoch": 2.200904736562001, + "grad_norm": 0.2662750780582428, + "learning_rate": 1.204419271255698e-07, + "loss": 0.1737, + "step": 8271 + }, + { + "epoch": 2.201170835550825, + "grad_norm": 0.39946168661117554, + "learning_rate": 1.2042539499357316e-07, + "loss": 0.1931, + "step": 8272 + }, + { + "epoch": 2.201436934539649, + "grad_norm": 0.3125762641429901, + "learning_rate": 1.2040886227900276e-07, + "loss": 0.1841, + "step": 8273 + }, + { + "epoch": 2.2017030335284726, + "grad_norm": 0.2565772831439972, + "learning_rate": 1.203923289823301e-07, + "loss": 0.1743, + "step": 8274 + }, + { + "epoch": 2.2019691325172963, + "grad_norm": 0.2823904752731323, + "learning_rate": 1.2037579510402683e-07, + "loss": 0.182, + "step": 8275 + }, + { + "epoch": 2.2022352315061204, + "grad_norm": 0.3191625773906708, + "learning_rate": 1.2035926064456445e-07, + "loss": 0.1805, + "step": 8276 + }, + { + "epoch": 2.202501330494944, + "grad_norm": 0.40098807215690613, + "learning_rate": 1.203427256044146e-07, + "loss": 0.199, + "step": 8277 + }, + { + "epoch": 2.202767429483768, + "grad_norm": 0.3312230408191681, + "learning_rate": 1.2032618998404888e-07, + "loss": 0.1849, + "step": 8278 + }, + { + "epoch": 2.203033528472592, + "grad_norm": 0.4000667333602905, + "learning_rate": 1.203096537839389e-07, + "loss": 0.1751, + "step": 8279 + }, + { + "epoch": 2.2032996274614156, + "grad_norm": 0.25763511657714844, + "learning_rate": 1.2029311700455633e-07, + "loss": 0.1827, + "step": 8280 + }, + { + "epoch": 2.2035657264502393, + "grad_norm": 0.2745003402233124, + "learning_rate": 1.2027657964637282e-07, + "loss": 0.178, + "step": 8281 + }, + { + "epoch": 2.2038318254390634, + "grad_norm": 0.2769930362701416, + "learning_rate": 1.2026004170986007e-07, + "loss": 0.193, + "step": 8282 + }, + { + "epoch": 2.204097924427887, + "grad_norm": 0.3599102795124054, + "learning_rate": 1.2024350319548975e-07, + "loss": 0.1941, + "step": 8283 + }, + { + "epoch": 2.2043640234167112, + "grad_norm": 0.2581208050251007, + "learning_rate": 1.2022696410373357e-07, + "loss": 0.1593, + "step": 8284 + }, + { + "epoch": 2.204630122405535, + "grad_norm": 0.2720814049243927, + "learning_rate": 1.2021042443506327e-07, + "loss": 0.1795, + "step": 8285 + }, + { + "epoch": 2.2048962213943586, + "grad_norm": 0.3729116916656494, + "learning_rate": 1.2019388418995063e-07, + "loss": 0.1861, + "step": 8286 + }, + { + "epoch": 2.2051623203831827, + "grad_norm": 0.3290795683860779, + "learning_rate": 1.2017734336886737e-07, + "loss": 0.1672, + "step": 8287 + }, + { + "epoch": 2.2054284193720064, + "grad_norm": 0.2857244908809662, + "learning_rate": 1.2016080197228525e-07, + "loss": 0.166, + "step": 8288 + }, + { + "epoch": 2.20569451836083, + "grad_norm": 0.2825838625431061, + "learning_rate": 1.201442600006761e-07, + "loss": 0.1865, + "step": 8289 + }, + { + "epoch": 2.205960617349654, + "grad_norm": 0.37376895546913147, + "learning_rate": 1.2012771745451175e-07, + "loss": 0.1881, + "step": 8290 + }, + { + "epoch": 2.206226716338478, + "grad_norm": 0.27488258481025696, + "learning_rate": 1.2011117433426393e-07, + "loss": 0.1874, + "step": 8291 + }, + { + "epoch": 2.2064928153273016, + "grad_norm": 0.29005131125450134, + "learning_rate": 1.200946306404046e-07, + "loss": 0.1816, + "step": 8292 + }, + { + "epoch": 2.2067589143161257, + "grad_norm": 0.2644680440425873, + "learning_rate": 1.2007808637340555e-07, + "loss": 0.1821, + "step": 8293 + }, + { + "epoch": 2.2070250133049494, + "grad_norm": 0.25845959782600403, + "learning_rate": 1.2006154153373866e-07, + "loss": 0.1616, + "step": 8294 + }, + { + "epoch": 2.207291112293773, + "grad_norm": 0.2704927623271942, + "learning_rate": 1.200449961218759e-07, + "loss": 0.1886, + "step": 8295 + }, + { + "epoch": 2.207557211282597, + "grad_norm": 0.34525614976882935, + "learning_rate": 1.2002845013828905e-07, + "loss": 0.2024, + "step": 8296 + }, + { + "epoch": 2.207823310271421, + "grad_norm": 0.34281137585639954, + "learning_rate": 1.2001190358345013e-07, + "loss": 0.186, + "step": 8297 + }, + { + "epoch": 2.208089409260245, + "grad_norm": 0.3337021470069885, + "learning_rate": 1.1999535645783105e-07, + "loss": 0.1899, + "step": 8298 + }, + { + "epoch": 2.2083555082490687, + "grad_norm": 0.2901037037372589, + "learning_rate": 1.1997880876190376e-07, + "loss": 0.2092, + "step": 8299 + }, + { + "epoch": 2.2086216072378924, + "grad_norm": 0.29818418622016907, + "learning_rate": 1.1996226049614027e-07, + "loss": 0.183, + "step": 8300 + }, + { + "epoch": 2.2088877062267165, + "grad_norm": 0.2587738633155823, + "learning_rate": 1.1994571166101254e-07, + "loss": 0.1574, + "step": 8301 + }, + { + "epoch": 2.20915380521554, + "grad_norm": 0.3177570700645447, + "learning_rate": 1.1992916225699257e-07, + "loss": 0.1797, + "step": 8302 + }, + { + "epoch": 2.209419904204364, + "grad_norm": 0.326273649930954, + "learning_rate": 1.199126122845524e-07, + "loss": 0.168, + "step": 8303 + }, + { + "epoch": 2.209686003193188, + "grad_norm": 0.2572815716266632, + "learning_rate": 1.198960617441641e-07, + "loss": 0.1737, + "step": 8304 + }, + { + "epoch": 2.2099521021820117, + "grad_norm": 0.27491873502731323, + "learning_rate": 1.1987951063629963e-07, + "loss": 0.186, + "step": 8305 + }, + { + "epoch": 2.210218201170836, + "grad_norm": 0.36207807064056396, + "learning_rate": 1.1986295896143115e-07, + "loss": 0.1981, + "step": 8306 + }, + { + "epoch": 2.2104843001596595, + "grad_norm": 0.3858087956905365, + "learning_rate": 1.198464067200307e-07, + "loss": 0.1966, + "step": 8307 + }, + { + "epoch": 2.210750399148483, + "grad_norm": 0.2812097668647766, + "learning_rate": 1.1982985391257043e-07, + "loss": 0.185, + "step": 8308 + }, + { + "epoch": 2.2110164981373073, + "grad_norm": 0.28640636801719666, + "learning_rate": 1.198133005395224e-07, + "loss": 0.1691, + "step": 8309 + }, + { + "epoch": 2.211282597126131, + "grad_norm": 0.42436203360557556, + "learning_rate": 1.197967466013588e-07, + "loss": 0.1991, + "step": 8310 + }, + { + "epoch": 2.2115486961149546, + "grad_norm": 0.37640810012817383, + "learning_rate": 1.1978019209855171e-07, + "loss": 0.2096, + "step": 8311 + }, + { + "epoch": 2.2118147951037788, + "grad_norm": 0.37651196122169495, + "learning_rate": 1.1976363703157338e-07, + "loss": 0.1745, + "step": 8312 + }, + { + "epoch": 2.2120808940926024, + "grad_norm": 0.29114213585853577, + "learning_rate": 1.1974708140089598e-07, + "loss": 0.1864, + "step": 8313 + }, + { + "epoch": 2.212346993081426, + "grad_norm": 0.28139498829841614, + "learning_rate": 1.1973052520699164e-07, + "loss": 0.1824, + "step": 8314 + }, + { + "epoch": 2.2126130920702503, + "grad_norm": 0.31386712193489075, + "learning_rate": 1.197139684503327e-07, + "loss": 0.2043, + "step": 8315 + }, + { + "epoch": 2.212879191059074, + "grad_norm": 0.27253150939941406, + "learning_rate": 1.1969741113139127e-07, + "loss": 0.1905, + "step": 8316 + }, + { + "epoch": 2.2131452900478976, + "grad_norm": 0.30789878964424133, + "learning_rate": 1.1968085325063964e-07, + "loss": 0.193, + "step": 8317 + }, + { + "epoch": 2.2134113890367217, + "grad_norm": 0.35479000210762024, + "learning_rate": 1.1966429480855008e-07, + "loss": 0.1995, + "step": 8318 + }, + { + "epoch": 2.2136774880255454, + "grad_norm": 0.3178027272224426, + "learning_rate": 1.196477358055949e-07, + "loss": 0.1763, + "step": 8319 + }, + { + "epoch": 2.2139435870143696, + "grad_norm": 0.4154800474643707, + "learning_rate": 1.196311762422464e-07, + "loss": 0.1773, + "step": 8320 + }, + { + "epoch": 2.2142096860031932, + "grad_norm": 0.27777329087257385, + "learning_rate": 1.1961461611897678e-07, + "loss": 0.1657, + "step": 8321 + }, + { + "epoch": 2.214475784992017, + "grad_norm": 0.3873506784439087, + "learning_rate": 1.195980554362585e-07, + "loss": 0.2079, + "step": 8322 + }, + { + "epoch": 2.214741883980841, + "grad_norm": 0.3098411560058594, + "learning_rate": 1.1958149419456385e-07, + "loss": 0.1601, + "step": 8323 + }, + { + "epoch": 2.2150079829696647, + "grad_norm": 0.34478840231895447, + "learning_rate": 1.195649323943652e-07, + "loss": 0.1668, + "step": 8324 + }, + { + "epoch": 2.2152740819584884, + "grad_norm": 0.25792843103408813, + "learning_rate": 1.1954837003613487e-07, + "loss": 0.1724, + "step": 8325 + }, + { + "epoch": 2.2155401809473125, + "grad_norm": 0.301384299993515, + "learning_rate": 1.1953180712034536e-07, + "loss": 0.1872, + "step": 8326 + }, + { + "epoch": 2.215806279936136, + "grad_norm": 0.27779969573020935, + "learning_rate": 1.1951524364746902e-07, + "loss": 0.162, + "step": 8327 + }, + { + "epoch": 2.21607237892496, + "grad_norm": 0.6696370840072632, + "learning_rate": 1.1949867961797822e-07, + "loss": 0.1765, + "step": 8328 + }, + { + "epoch": 2.216338477913784, + "grad_norm": 0.35970786213874817, + "learning_rate": 1.1948211503234552e-07, + "loss": 0.1924, + "step": 8329 + }, + { + "epoch": 2.2166045769026077, + "grad_norm": 0.26977330446243286, + "learning_rate": 1.1946554989104324e-07, + "loss": 0.1712, + "step": 8330 + }, + { + "epoch": 2.216870675891432, + "grad_norm": 0.2427521049976349, + "learning_rate": 1.1944898419454395e-07, + "loss": 0.1586, + "step": 8331 + }, + { + "epoch": 2.2171367748802555, + "grad_norm": 0.3173087537288666, + "learning_rate": 1.1943241794332012e-07, + "loss": 0.1771, + "step": 8332 + }, + { + "epoch": 2.217402873869079, + "grad_norm": 0.3500659167766571, + "learning_rate": 1.1941585113784424e-07, + "loss": 0.1871, + "step": 8333 + }, + { + "epoch": 2.2176689728579033, + "grad_norm": 0.3910253345966339, + "learning_rate": 1.1939928377858884e-07, + "loss": 0.1932, + "step": 8334 + }, + { + "epoch": 2.217935071846727, + "grad_norm": 0.27011623978614807, + "learning_rate": 1.1938271586602642e-07, + "loss": 0.1809, + "step": 8335 + }, + { + "epoch": 2.2182011708355507, + "grad_norm": 0.2667098045349121, + "learning_rate": 1.1936614740062958e-07, + "loss": 0.1915, + "step": 8336 + }, + { + "epoch": 2.218467269824375, + "grad_norm": 0.37625226378440857, + "learning_rate": 1.1934957838287085e-07, + "loss": 0.1828, + "step": 8337 + }, + { + "epoch": 2.2187333688131985, + "grad_norm": 0.31793829798698425, + "learning_rate": 1.1933300881322284e-07, + "loss": 0.1734, + "step": 8338 + }, + { + "epoch": 2.218999467802022, + "grad_norm": 0.28040072321891785, + "learning_rate": 1.1931643869215812e-07, + "loss": 0.1907, + "step": 8339 + }, + { + "epoch": 2.2192655667908463, + "grad_norm": 0.4255233407020569, + "learning_rate": 1.192998680201493e-07, + "loss": 0.1968, + "step": 8340 + }, + { + "epoch": 2.21953166577967, + "grad_norm": 0.29601141810417175, + "learning_rate": 1.1928329679766907e-07, + "loss": 0.1918, + "step": 8341 + }, + { + "epoch": 2.2197977647684937, + "grad_norm": 0.2805015444755554, + "learning_rate": 1.1926672502518998e-07, + "loss": 0.1748, + "step": 8342 + }, + { + "epoch": 2.220063863757318, + "grad_norm": 0.2550438940525055, + "learning_rate": 1.1925015270318474e-07, + "loss": 0.1665, + "step": 8343 + }, + { + "epoch": 2.2203299627461415, + "grad_norm": 0.3021949529647827, + "learning_rate": 1.1923357983212607e-07, + "loss": 0.1745, + "step": 8344 + }, + { + "epoch": 2.2205960617349656, + "grad_norm": 0.2587788701057434, + "learning_rate": 1.192170064124866e-07, + "loss": 0.1721, + "step": 8345 + }, + { + "epoch": 2.2208621607237893, + "grad_norm": 0.2353103905916214, + "learning_rate": 1.1920043244473906e-07, + "loss": 0.1575, + "step": 8346 + }, + { + "epoch": 2.221128259712613, + "grad_norm": 0.39135828614234924, + "learning_rate": 1.1918385792935615e-07, + "loss": 0.1867, + "step": 8347 + }, + { + "epoch": 2.221394358701437, + "grad_norm": 0.2592528462409973, + "learning_rate": 1.1916728286681061e-07, + "loss": 0.1818, + "step": 8348 + }, + { + "epoch": 2.2216604576902608, + "grad_norm": 0.3489689528942108, + "learning_rate": 1.1915070725757526e-07, + "loss": 0.1756, + "step": 8349 + }, + { + "epoch": 2.2219265566790845, + "grad_norm": 0.29146808385849, + "learning_rate": 1.191341311021228e-07, + "loss": 0.1592, + "step": 8350 + }, + { + "epoch": 2.2221926556679086, + "grad_norm": 0.27005356550216675, + "learning_rate": 1.1911755440092606e-07, + "loss": 0.1714, + "step": 8351 + }, + { + "epoch": 2.2224587546567323, + "grad_norm": 0.27588778734207153, + "learning_rate": 1.1910097715445779e-07, + "loss": 0.1793, + "step": 8352 + }, + { + "epoch": 2.222724853645556, + "grad_norm": 0.2619694471359253, + "learning_rate": 1.1908439936319084e-07, + "loss": 0.1785, + "step": 8353 + }, + { + "epoch": 2.22299095263438, + "grad_norm": 0.3490706980228424, + "learning_rate": 1.1906782102759807e-07, + "loss": 0.2005, + "step": 8354 + }, + { + "epoch": 2.2232570516232038, + "grad_norm": 0.24900180101394653, + "learning_rate": 1.1905124214815227e-07, + "loss": 0.1596, + "step": 8355 + }, + { + "epoch": 2.223523150612028, + "grad_norm": 0.3709152340888977, + "learning_rate": 1.1903466272532634e-07, + "loss": 0.1954, + "step": 8356 + }, + { + "epoch": 2.2237892496008516, + "grad_norm": 0.25456947088241577, + "learning_rate": 1.1901808275959313e-07, + "loss": 0.1668, + "step": 8357 + }, + { + "epoch": 2.2240553485896752, + "grad_norm": 0.27430176734924316, + "learning_rate": 1.1900150225142557e-07, + "loss": 0.1722, + "step": 8358 + }, + { + "epoch": 2.2243214475784994, + "grad_norm": 0.35720109939575195, + "learning_rate": 1.1898492120129657e-07, + "loss": 0.1926, + "step": 8359 + }, + { + "epoch": 2.224587546567323, + "grad_norm": 0.2679441273212433, + "learning_rate": 1.18968339609679e-07, + "loss": 0.175, + "step": 8360 + }, + { + "epoch": 2.2248536455561467, + "grad_norm": 0.29624029994010925, + "learning_rate": 1.1895175747704587e-07, + "loss": 0.1927, + "step": 8361 + }, + { + "epoch": 2.225119744544971, + "grad_norm": 0.3486998379230499, + "learning_rate": 1.189351748038701e-07, + "loss": 0.1939, + "step": 8362 + }, + { + "epoch": 2.2253858435337945, + "grad_norm": 0.34551623463630676, + "learning_rate": 1.1891859159062465e-07, + "loss": 0.1773, + "step": 8363 + }, + { + "epoch": 2.225651942522618, + "grad_norm": 0.27764275670051575, + "learning_rate": 1.1890200783778253e-07, + "loss": 0.1863, + "step": 8364 + }, + { + "epoch": 2.2259180415114423, + "grad_norm": 0.2569279372692108, + "learning_rate": 1.1888542354581675e-07, + "loss": 0.1686, + "step": 8365 + }, + { + "epoch": 2.226184140500266, + "grad_norm": 0.7054750323295593, + "learning_rate": 1.188688387152003e-07, + "loss": 0.1762, + "step": 8366 + }, + { + "epoch": 2.2264502394890897, + "grad_norm": 0.3349066972732544, + "learning_rate": 1.1885225334640621e-07, + "loss": 0.1747, + "step": 8367 + }, + { + "epoch": 2.226716338477914, + "grad_norm": 0.3140943944454193, + "learning_rate": 1.1883566743990754e-07, + "loss": 0.1797, + "step": 8368 + }, + { + "epoch": 2.2269824374667375, + "grad_norm": 0.45934340357780457, + "learning_rate": 1.1881908099617741e-07, + "loss": 0.2113, + "step": 8369 + }, + { + "epoch": 2.2272485364555616, + "grad_norm": 0.3695533573627472, + "learning_rate": 1.1880249401568882e-07, + "loss": 0.1797, + "step": 8370 + }, + { + "epoch": 2.2275146354443853, + "grad_norm": 0.2755018472671509, + "learning_rate": 1.1878590649891489e-07, + "loss": 0.1918, + "step": 8371 + }, + { + "epoch": 2.227780734433209, + "grad_norm": 0.2743418514728546, + "learning_rate": 1.1876931844632871e-07, + "loss": 0.166, + "step": 8372 + }, + { + "epoch": 2.228046833422033, + "grad_norm": 0.2957782745361328, + "learning_rate": 1.1875272985840348e-07, + "loss": 0.1832, + "step": 8373 + }, + { + "epoch": 2.228312932410857, + "grad_norm": 0.30033937096595764, + "learning_rate": 1.1873614073561226e-07, + "loss": 0.1868, + "step": 8374 + }, + { + "epoch": 2.2285790313996805, + "grad_norm": 0.2840522229671478, + "learning_rate": 1.1871955107842822e-07, + "loss": 0.1873, + "step": 8375 + }, + { + "epoch": 2.2288451303885046, + "grad_norm": 0.3885285258293152, + "learning_rate": 1.1870296088732455e-07, + "loss": 0.1924, + "step": 8376 + }, + { + "epoch": 2.2291112293773283, + "grad_norm": 0.2772749662399292, + "learning_rate": 1.1868637016277445e-07, + "loss": 0.1724, + "step": 8377 + }, + { + "epoch": 2.2293773283661524, + "grad_norm": 0.2754572629928589, + "learning_rate": 1.1866977890525108e-07, + "loss": 0.1838, + "step": 8378 + }, + { + "epoch": 2.229643427354976, + "grad_norm": 0.30705130100250244, + "learning_rate": 1.1865318711522768e-07, + "loss": 0.1767, + "step": 8379 + }, + { + "epoch": 2.2299095263438, + "grad_norm": 0.3104782700538635, + "learning_rate": 1.1863659479317747e-07, + "loss": 0.189, + "step": 8380 + }, + { + "epoch": 2.230175625332624, + "grad_norm": 0.255932480096817, + "learning_rate": 1.1862000193957371e-07, + "loss": 0.1635, + "step": 8381 + }, + { + "epoch": 2.2304417243214476, + "grad_norm": 0.3235965669155121, + "learning_rate": 1.1860340855488966e-07, + "loss": 0.1646, + "step": 8382 + }, + { + "epoch": 2.2307078233102713, + "grad_norm": 0.27983471751213074, + "learning_rate": 1.185868146395986e-07, + "loss": 0.1858, + "step": 8383 + }, + { + "epoch": 2.2309739222990954, + "grad_norm": 0.26597678661346436, + "learning_rate": 1.1857022019417382e-07, + "loss": 0.1835, + "step": 8384 + }, + { + "epoch": 2.231240021287919, + "grad_norm": 0.3177364468574524, + "learning_rate": 1.1855362521908859e-07, + "loss": 0.1855, + "step": 8385 + }, + { + "epoch": 2.2315061202767428, + "grad_norm": 0.28569507598876953, + "learning_rate": 1.1853702971481628e-07, + "loss": 0.1848, + "step": 8386 + }, + { + "epoch": 2.231772219265567, + "grad_norm": 0.3174077272415161, + "learning_rate": 1.1852043368183022e-07, + "loss": 0.1971, + "step": 8387 + }, + { + "epoch": 2.2320383182543906, + "grad_norm": 0.34650689363479614, + "learning_rate": 1.1850383712060378e-07, + "loss": 0.1809, + "step": 8388 + }, + { + "epoch": 2.2323044172432143, + "grad_norm": 0.3940359354019165, + "learning_rate": 1.1848724003161028e-07, + "loss": 0.1882, + "step": 8389 + }, + { + "epoch": 2.2325705162320384, + "grad_norm": 0.274114727973938, + "learning_rate": 1.1847064241532312e-07, + "loss": 0.1783, + "step": 8390 + }, + { + "epoch": 2.232836615220862, + "grad_norm": 0.45328256487846375, + "learning_rate": 1.1845404427221573e-07, + "loss": 0.21, + "step": 8391 + }, + { + "epoch": 2.233102714209686, + "grad_norm": 0.3113136887550354, + "learning_rate": 1.1843744560276147e-07, + "loss": 0.1943, + "step": 8392 + }, + { + "epoch": 2.23336881319851, + "grad_norm": 0.26544809341430664, + "learning_rate": 1.1842084640743379e-07, + "loss": 0.1751, + "step": 8393 + }, + { + "epoch": 2.2336349121873336, + "grad_norm": 0.29130569100379944, + "learning_rate": 1.1840424668670617e-07, + "loss": 0.2106, + "step": 8394 + }, + { + "epoch": 2.2339010111761577, + "grad_norm": 0.26897767186164856, + "learning_rate": 1.1838764644105204e-07, + "loss": 0.169, + "step": 8395 + }, + { + "epoch": 2.2341671101649814, + "grad_norm": 0.37381091713905334, + "learning_rate": 1.1837104567094484e-07, + "loss": 0.1818, + "step": 8396 + }, + { + "epoch": 2.234433209153805, + "grad_norm": 0.2855515778064728, + "learning_rate": 1.1835444437685809e-07, + "loss": 0.1995, + "step": 8397 + }, + { + "epoch": 2.234699308142629, + "grad_norm": 0.2943529188632965, + "learning_rate": 1.1833784255926529e-07, + "loss": 0.18, + "step": 8398 + }, + { + "epoch": 2.234965407131453, + "grad_norm": 0.2555397152900696, + "learning_rate": 1.1832124021863996e-07, + "loss": 0.1754, + "step": 8399 + }, + { + "epoch": 2.2352315061202765, + "grad_norm": 0.24186396598815918, + "learning_rate": 1.1830463735545562e-07, + "loss": 0.1573, + "step": 8400 + }, + { + "epoch": 2.2354976051091007, + "grad_norm": 0.3629745841026306, + "learning_rate": 1.1828803397018583e-07, + "loss": 0.2053, + "step": 8401 + }, + { + "epoch": 2.2357637040979244, + "grad_norm": 0.3081674873828888, + "learning_rate": 1.1827143006330416e-07, + "loss": 0.2014, + "step": 8402 + }, + { + "epoch": 2.2360298030867485, + "grad_norm": 0.27774086594581604, + "learning_rate": 1.1825482563528416e-07, + "loss": 0.1843, + "step": 8403 + }, + { + "epoch": 2.236295902075572, + "grad_norm": 0.270455002784729, + "learning_rate": 1.1823822068659941e-07, + "loss": 0.1659, + "step": 8404 + }, + { + "epoch": 2.236562001064396, + "grad_norm": 0.3179023563861847, + "learning_rate": 1.1822161521772358e-07, + "loss": 0.1935, + "step": 8405 + }, + { + "epoch": 2.23682810005322, + "grad_norm": 0.3156891465187073, + "learning_rate": 1.1820500922913026e-07, + "loss": 0.1601, + "step": 8406 + }, + { + "epoch": 2.2370941990420437, + "grad_norm": 0.2541488707065582, + "learning_rate": 1.1818840272129309e-07, + "loss": 0.174, + "step": 8407 + }, + { + "epoch": 2.2373602980308673, + "grad_norm": 0.2887955605983734, + "learning_rate": 1.1817179569468567e-07, + "loss": 0.1846, + "step": 8408 + }, + { + "epoch": 2.2376263970196915, + "grad_norm": 0.27841559052467346, + "learning_rate": 1.1815518814978176e-07, + "loss": 0.1717, + "step": 8409 + }, + { + "epoch": 2.237892496008515, + "grad_norm": 0.32740476727485657, + "learning_rate": 1.1813858008705497e-07, + "loss": 0.1898, + "step": 8410 + }, + { + "epoch": 2.238158594997339, + "grad_norm": 0.39512166380882263, + "learning_rate": 1.18121971506979e-07, + "loss": 0.183, + "step": 8411 + }, + { + "epoch": 2.238424693986163, + "grad_norm": 0.2916029393672943, + "learning_rate": 1.181053624100276e-07, + "loss": 0.1914, + "step": 8412 + }, + { + "epoch": 2.2386907929749866, + "grad_norm": 0.34670498967170715, + "learning_rate": 1.1808875279667448e-07, + "loss": 0.1882, + "step": 8413 + }, + { + "epoch": 2.2389568919638103, + "grad_norm": 0.4101775884628296, + "learning_rate": 1.1807214266739337e-07, + "loss": 0.1934, + "step": 8414 + }, + { + "epoch": 2.2392229909526344, + "grad_norm": 0.30501314997673035, + "learning_rate": 1.1805553202265799e-07, + "loss": 0.1862, + "step": 8415 + }, + { + "epoch": 2.239489089941458, + "grad_norm": 0.26604464650154114, + "learning_rate": 1.1803892086294216e-07, + "loss": 0.1744, + "step": 8416 + }, + { + "epoch": 2.2397551889302822, + "grad_norm": 0.40350091457366943, + "learning_rate": 1.1802230918871967e-07, + "loss": 0.1895, + "step": 8417 + }, + { + "epoch": 2.240021287919106, + "grad_norm": 0.40050387382507324, + "learning_rate": 1.1800569700046427e-07, + "loss": 0.2065, + "step": 8418 + }, + { + "epoch": 2.2402873869079296, + "grad_norm": 0.2610015273094177, + "learning_rate": 1.1798908429864983e-07, + "loss": 0.1916, + "step": 8419 + }, + { + "epoch": 2.2405534858967537, + "grad_norm": 0.28515467047691345, + "learning_rate": 1.1797247108375015e-07, + "loss": 0.1867, + "step": 8420 + }, + { + "epoch": 2.2408195848855774, + "grad_norm": 0.2563939392566681, + "learning_rate": 1.1795585735623908e-07, + "loss": 0.1768, + "step": 8421 + }, + { + "epoch": 2.241085683874401, + "grad_norm": 0.2741544246673584, + "learning_rate": 1.1793924311659042e-07, + "loss": 0.1713, + "step": 8422 + }, + { + "epoch": 2.2413517828632252, + "grad_norm": 0.2738354802131653, + "learning_rate": 1.179226283652781e-07, + "loss": 0.1968, + "step": 8423 + }, + { + "epoch": 2.241617881852049, + "grad_norm": 0.3926587998867035, + "learning_rate": 1.1790601310277606e-07, + "loss": 0.1752, + "step": 8424 + }, + { + "epoch": 2.241883980840873, + "grad_norm": 0.2546834945678711, + "learning_rate": 1.1788939732955809e-07, + "loss": 0.1612, + "step": 8425 + }, + { + "epoch": 2.2421500798296967, + "grad_norm": 0.3235659599304199, + "learning_rate": 1.1787278104609817e-07, + "loss": 0.1848, + "step": 8426 + }, + { + "epoch": 2.2424161788185204, + "grad_norm": 0.3009030520915985, + "learning_rate": 1.1785616425287022e-07, + "loss": 0.1985, + "step": 8427 + }, + { + "epoch": 2.2426822778073445, + "grad_norm": 0.3498101234436035, + "learning_rate": 1.1783954695034819e-07, + "loss": 0.1888, + "step": 8428 + }, + { + "epoch": 2.242948376796168, + "grad_norm": 0.25937095284461975, + "learning_rate": 1.17822929139006e-07, + "loss": 0.1755, + "step": 8429 + }, + { + "epoch": 2.243214475784992, + "grad_norm": 0.3804304897785187, + "learning_rate": 1.1780631081931766e-07, + "loss": 0.2042, + "step": 8430 + }, + { + "epoch": 2.243480574773816, + "grad_norm": 0.2584676742553711, + "learning_rate": 1.1778969199175717e-07, + "loss": 0.1722, + "step": 8431 + }, + { + "epoch": 2.2437466737626397, + "grad_norm": 0.2551520764827728, + "learning_rate": 1.177730726567985e-07, + "loss": 0.167, + "step": 8432 + }, + { + "epoch": 2.2440127727514634, + "grad_norm": 0.26171210408210754, + "learning_rate": 1.1775645281491568e-07, + "loss": 0.1856, + "step": 8433 + }, + { + "epoch": 2.2442788717402875, + "grad_norm": 0.27087703347206116, + "learning_rate": 1.1773983246658275e-07, + "loss": 0.1729, + "step": 8434 + }, + { + "epoch": 2.244544970729111, + "grad_norm": 0.28924936056137085, + "learning_rate": 1.1772321161227374e-07, + "loss": 0.1851, + "step": 8435 + }, + { + "epoch": 2.244811069717935, + "grad_norm": 0.270408570766449, + "learning_rate": 1.1770659025246273e-07, + "loss": 0.1828, + "step": 8436 + }, + { + "epoch": 2.245077168706759, + "grad_norm": 0.37525153160095215, + "learning_rate": 1.1768996838762378e-07, + "loss": 0.1962, + "step": 8437 + }, + { + "epoch": 2.2453432676955827, + "grad_norm": 0.33427348732948303, + "learning_rate": 1.1767334601823098e-07, + "loss": 0.1932, + "step": 8438 + }, + { + "epoch": 2.245609366684407, + "grad_norm": 0.367697536945343, + "learning_rate": 1.1765672314475845e-07, + "loss": 0.1984, + "step": 8439 + }, + { + "epoch": 2.2458754656732305, + "grad_norm": 0.2892516851425171, + "learning_rate": 1.1764009976768028e-07, + "loss": 0.1904, + "step": 8440 + }, + { + "epoch": 2.246141564662054, + "grad_norm": 0.3945678174495697, + "learning_rate": 1.1762347588747061e-07, + "loss": 0.2012, + "step": 8441 + }, + { + "epoch": 2.2464076636508783, + "grad_norm": 0.27439069747924805, + "learning_rate": 1.176068515046036e-07, + "loss": 0.1824, + "step": 8442 + }, + { + "epoch": 2.246673762639702, + "grad_norm": 0.2590024173259735, + "learning_rate": 1.1759022661955344e-07, + "loss": 0.1736, + "step": 8443 + }, + { + "epoch": 2.2469398616285257, + "grad_norm": 0.3465474843978882, + "learning_rate": 1.1757360123279423e-07, + "loss": 0.1935, + "step": 8444 + }, + { + "epoch": 2.24720596061735, + "grad_norm": 0.2679542303085327, + "learning_rate": 1.1755697534480022e-07, + "loss": 0.1687, + "step": 8445 + }, + { + "epoch": 2.2474720596061735, + "grad_norm": 0.28397008776664734, + "learning_rate": 1.1754034895604561e-07, + "loss": 0.1867, + "step": 8446 + }, + { + "epoch": 2.247738158594997, + "grad_norm": 0.2888154685497284, + "learning_rate": 1.1752372206700458e-07, + "loss": 0.177, + "step": 8447 + }, + { + "epoch": 2.2480042575838213, + "grad_norm": 0.46573206782341003, + "learning_rate": 1.1750709467815139e-07, + "loss": 0.2038, + "step": 8448 + }, + { + "epoch": 2.248270356572645, + "grad_norm": 0.27998071908950806, + "learning_rate": 1.174904667899603e-07, + "loss": 0.1926, + "step": 8449 + }, + { + "epoch": 2.248536455561469, + "grad_norm": 0.26847919821739197, + "learning_rate": 1.1747383840290555e-07, + "loss": 0.1814, + "step": 8450 + }, + { + "epoch": 2.2488025545502928, + "grad_norm": 0.26108700037002563, + "learning_rate": 1.1745720951746141e-07, + "loss": 0.1772, + "step": 8451 + }, + { + "epoch": 2.2490686535391164, + "grad_norm": 0.26982882618904114, + "learning_rate": 1.1744058013410219e-07, + "loss": 0.1728, + "step": 8452 + }, + { + "epoch": 2.2493347525279406, + "grad_norm": 0.357109010219574, + "learning_rate": 1.1742395025330218e-07, + "loss": 0.1966, + "step": 8453 + }, + { + "epoch": 2.2496008515167643, + "grad_norm": 0.3936213254928589, + "learning_rate": 1.1740731987553569e-07, + "loss": 0.1973, + "step": 8454 + }, + { + "epoch": 2.249866950505588, + "grad_norm": 0.2661115825176239, + "learning_rate": 1.1739068900127707e-07, + "loss": 0.171, + "step": 8455 + }, + { + "epoch": 2.250133049494412, + "grad_norm": 0.2965637445449829, + "learning_rate": 1.1737405763100067e-07, + "loss": 0.1893, + "step": 8456 + }, + { + "epoch": 2.2503991484832357, + "grad_norm": 0.27709099650382996, + "learning_rate": 1.1735742576518083e-07, + "loss": 0.1655, + "step": 8457 + }, + { + "epoch": 2.2506652474720594, + "grad_norm": 0.34505099058151245, + "learning_rate": 1.1734079340429194e-07, + "loss": 0.1954, + "step": 8458 + }, + { + "epoch": 2.2509313464608836, + "grad_norm": 0.3816555142402649, + "learning_rate": 1.1732416054880835e-07, + "loss": 0.2046, + "step": 8459 + }, + { + "epoch": 2.2511974454497072, + "grad_norm": 0.2731430232524872, + "learning_rate": 1.1730752719920456e-07, + "loss": 0.1907, + "step": 8460 + }, + { + "epoch": 2.251463544438531, + "grad_norm": 0.26875266432762146, + "learning_rate": 1.172908933559549e-07, + "loss": 0.1818, + "step": 8461 + }, + { + "epoch": 2.251729643427355, + "grad_norm": 0.25790607929229736, + "learning_rate": 1.172742590195338e-07, + "loss": 0.1698, + "step": 8462 + }, + { + "epoch": 2.2519957424161787, + "grad_norm": 0.30109602212905884, + "learning_rate": 1.1725762419041576e-07, + "loss": 0.1953, + "step": 8463 + }, + { + "epoch": 2.252261841405003, + "grad_norm": 0.30501872301101685, + "learning_rate": 1.1724098886907521e-07, + "loss": 0.1905, + "step": 8464 + }, + { + "epoch": 2.2525279403938265, + "grad_norm": 0.26884275674819946, + "learning_rate": 1.1722435305598662e-07, + "loss": 0.1756, + "step": 8465 + }, + { + "epoch": 2.25279403938265, + "grad_norm": 3.0355794429779053, + "learning_rate": 1.1720771675162444e-07, + "loss": 0.1941, + "step": 8466 + }, + { + "epoch": 2.2530601383714743, + "grad_norm": 0.25448745489120483, + "learning_rate": 1.1719107995646324e-07, + "loss": 0.166, + "step": 8467 + }, + { + "epoch": 2.253326237360298, + "grad_norm": 0.3054084777832031, + "learning_rate": 1.1717444267097751e-07, + "loss": 0.1836, + "step": 8468 + }, + { + "epoch": 2.2535923363491217, + "grad_norm": 0.3142651319503784, + "learning_rate": 1.1715780489564176e-07, + "loss": 0.1823, + "step": 8469 + }, + { + "epoch": 2.253858435337946, + "grad_norm": 0.280953973531723, + "learning_rate": 1.1714116663093057e-07, + "loss": 0.1855, + "step": 8470 + }, + { + "epoch": 2.2541245343267695, + "grad_norm": 0.277662068605423, + "learning_rate": 1.1712452787731845e-07, + "loss": 0.1769, + "step": 8471 + }, + { + "epoch": 2.2543906333155936, + "grad_norm": 0.5793594121932983, + "learning_rate": 1.1710788863528003e-07, + "loss": 0.1938, + "step": 8472 + }, + { + "epoch": 2.2546567323044173, + "grad_norm": 0.27783116698265076, + "learning_rate": 1.1709124890528983e-07, + "loss": 0.1784, + "step": 8473 + }, + { + "epoch": 2.254922831293241, + "grad_norm": 0.3513178527355194, + "learning_rate": 1.1707460868782248e-07, + "loss": 0.182, + "step": 8474 + }, + { + "epoch": 2.255188930282065, + "grad_norm": 0.2689623236656189, + "learning_rate": 1.1705796798335262e-07, + "loss": 0.194, + "step": 8475 + }, + { + "epoch": 2.255455029270889, + "grad_norm": 0.289730429649353, + "learning_rate": 1.1704132679235481e-07, + "loss": 0.1863, + "step": 8476 + }, + { + "epoch": 2.2557211282597125, + "grad_norm": 0.27921244502067566, + "learning_rate": 1.1702468511530375e-07, + "loss": 0.1867, + "step": 8477 + }, + { + "epoch": 2.2559872272485366, + "grad_norm": 0.2804962992668152, + "learning_rate": 1.1700804295267408e-07, + "loss": 0.1917, + "step": 8478 + }, + { + "epoch": 2.2562533262373603, + "grad_norm": 0.2937197685241699, + "learning_rate": 1.1699140030494048e-07, + "loss": 0.1857, + "step": 8479 + }, + { + "epoch": 2.256519425226184, + "grad_norm": 0.26846638321876526, + "learning_rate": 1.1697475717257757e-07, + "loss": 0.1804, + "step": 8480 + }, + { + "epoch": 2.256785524215008, + "grad_norm": 0.33831489086151123, + "learning_rate": 1.1695811355606013e-07, + "loss": 0.1766, + "step": 8481 + }, + { + "epoch": 2.257051623203832, + "grad_norm": 0.27996882796287537, + "learning_rate": 1.1694146945586283e-07, + "loss": 0.194, + "step": 8482 + }, + { + "epoch": 2.2573177221926555, + "grad_norm": 0.2661234438419342, + "learning_rate": 1.1692482487246038e-07, + "loss": 0.19, + "step": 8483 + }, + { + "epoch": 2.2575838211814796, + "grad_norm": 0.2906492054462433, + "learning_rate": 1.1690817980632753e-07, + "loss": 0.1816, + "step": 8484 + }, + { + "epoch": 2.2578499201703033, + "grad_norm": 0.2951718866825104, + "learning_rate": 1.1689153425793903e-07, + "loss": 0.1866, + "step": 8485 + }, + { + "epoch": 2.258116019159127, + "grad_norm": 0.2641099691390991, + "learning_rate": 1.1687488822776967e-07, + "loss": 0.187, + "step": 8486 + }, + { + "epoch": 2.258382118147951, + "grad_norm": 0.26437506079673767, + "learning_rate": 1.1685824171629417e-07, + "loss": 0.1633, + "step": 8487 + }, + { + "epoch": 2.2586482171367748, + "grad_norm": 0.6008486747741699, + "learning_rate": 1.1684159472398737e-07, + "loss": 0.1868, + "step": 8488 + }, + { + "epoch": 2.258914316125599, + "grad_norm": 0.32377156615257263, + "learning_rate": 1.1682494725132411e-07, + "loss": 0.1901, + "step": 8489 + }, + { + "epoch": 2.2591804151144226, + "grad_norm": 0.3358038067817688, + "learning_rate": 1.1680829929877913e-07, + "loss": 0.1749, + "step": 8490 + }, + { + "epoch": 2.2594465141032463, + "grad_norm": 0.27242332696914673, + "learning_rate": 1.1679165086682728e-07, + "loss": 0.1625, + "step": 8491 + }, + { + "epoch": 2.2597126130920704, + "grad_norm": 0.28744879364967346, + "learning_rate": 1.1677500195594345e-07, + "loss": 0.193, + "step": 8492 + }, + { + "epoch": 2.259978712080894, + "grad_norm": 0.24314294755458832, + "learning_rate": 1.1675835256660247e-07, + "loss": 0.1614, + "step": 8493 + }, + { + "epoch": 2.2602448110697178, + "grad_norm": 0.3374466896057129, + "learning_rate": 1.1674170269927921e-07, + "loss": 0.2029, + "step": 8494 + }, + { + "epoch": 2.260510910058542, + "grad_norm": 0.32379209995269775, + "learning_rate": 1.1672505235444856e-07, + "loss": 0.1817, + "step": 8495 + }, + { + "epoch": 2.2607770090473656, + "grad_norm": 0.30442509055137634, + "learning_rate": 1.1670840153258545e-07, + "loss": 0.1833, + "step": 8496 + }, + { + "epoch": 2.2610431080361897, + "grad_norm": 0.41789859533309937, + "learning_rate": 1.1669175023416478e-07, + "loss": 0.1839, + "step": 8497 + }, + { + "epoch": 2.2613092070250134, + "grad_norm": 0.2947314977645874, + "learning_rate": 1.1667509845966146e-07, + "loss": 0.1835, + "step": 8498 + }, + { + "epoch": 2.261575306013837, + "grad_norm": 0.27146458625793457, + "learning_rate": 1.1665844620955043e-07, + "loss": 0.1856, + "step": 8499 + }, + { + "epoch": 2.261841405002661, + "grad_norm": 0.2718522548675537, + "learning_rate": 1.1664179348430669e-07, + "loss": 0.1807, + "step": 8500 + }, + { + "epoch": 2.262107503991485, + "grad_norm": 0.33332985639572144, + "learning_rate": 1.166251402844052e-07, + "loss": 0.1941, + "step": 8501 + }, + { + "epoch": 2.2623736029803085, + "grad_norm": 0.27427151799201965, + "learning_rate": 1.1660848661032088e-07, + "loss": 0.1677, + "step": 8502 + }, + { + "epoch": 2.2626397019691327, + "grad_norm": 0.31080830097198486, + "learning_rate": 1.1659183246252879e-07, + "loss": 0.2076, + "step": 8503 + }, + { + "epoch": 2.2629058009579563, + "grad_norm": 0.2730584144592285, + "learning_rate": 1.1657517784150394e-07, + "loss": 0.1833, + "step": 8504 + }, + { + "epoch": 2.26317189994678, + "grad_norm": 0.2613949477672577, + "learning_rate": 1.1655852274772133e-07, + "loss": 0.1571, + "step": 8505 + }, + { + "epoch": 2.263437998935604, + "grad_norm": 0.270123153924942, + "learning_rate": 1.1654186718165599e-07, + "loss": 0.1917, + "step": 8506 + }, + { + "epoch": 2.263704097924428, + "grad_norm": 0.2785778343677521, + "learning_rate": 1.1652521114378297e-07, + "loss": 0.1911, + "step": 8507 + }, + { + "epoch": 2.2639701969132515, + "grad_norm": 0.38489988446235657, + "learning_rate": 1.1650855463457739e-07, + "loss": 0.1843, + "step": 8508 + }, + { + "epoch": 2.2642362959020756, + "grad_norm": 0.280405193567276, + "learning_rate": 1.1649189765451424e-07, + "loss": 0.1727, + "step": 8509 + }, + { + "epoch": 2.2645023948908993, + "grad_norm": 0.28349578380584717, + "learning_rate": 1.1647524020406869e-07, + "loss": 0.1731, + "step": 8510 + }, + { + "epoch": 2.264768493879723, + "grad_norm": 0.32460591197013855, + "learning_rate": 1.1645858228371579e-07, + "loss": 0.1938, + "step": 8511 + }, + { + "epoch": 2.265034592868547, + "grad_norm": 0.3374892473220825, + "learning_rate": 1.164419238939307e-07, + "loss": 0.2129, + "step": 8512 + }, + { + "epoch": 2.265300691857371, + "grad_norm": 0.25734445452690125, + "learning_rate": 1.1642526503518848e-07, + "loss": 0.1698, + "step": 8513 + }, + { + "epoch": 2.265566790846195, + "grad_norm": 0.3075920641422272, + "learning_rate": 1.1640860570796436e-07, + "loss": 0.2056, + "step": 8514 + }, + { + "epoch": 2.2658328898350186, + "grad_norm": 0.43896040320396423, + "learning_rate": 1.1639194591273347e-07, + "loss": 0.211, + "step": 8515 + }, + { + "epoch": 2.2660989888238423, + "grad_norm": 0.317148894071579, + "learning_rate": 1.1637528564997094e-07, + "loss": 0.1867, + "step": 8516 + }, + { + "epoch": 2.2663650878126664, + "grad_norm": 0.26352041959762573, + "learning_rate": 1.1635862492015201e-07, + "loss": 0.1798, + "step": 8517 + }, + { + "epoch": 2.26663118680149, + "grad_norm": 0.33516913652420044, + "learning_rate": 1.1634196372375185e-07, + "loss": 0.1816, + "step": 8518 + }, + { + "epoch": 2.2668972857903142, + "grad_norm": 0.27742794156074524, + "learning_rate": 1.1632530206124568e-07, + "loss": 0.1824, + "step": 8519 + }, + { + "epoch": 2.267163384779138, + "grad_norm": 0.362773597240448, + "learning_rate": 1.1630863993310871e-07, + "loss": 0.1902, + "step": 8520 + }, + { + "epoch": 2.2674294837679616, + "grad_norm": 0.3526199758052826, + "learning_rate": 1.1629197733981619e-07, + "loss": 0.2006, + "step": 8521 + }, + { + "epoch": 2.2676955827567857, + "grad_norm": 0.36239662766456604, + "learning_rate": 1.1627531428184339e-07, + "loss": 0.184, + "step": 8522 + }, + { + "epoch": 2.2679616817456094, + "grad_norm": 0.3354708254337311, + "learning_rate": 1.1625865075966555e-07, + "loss": 0.1808, + "step": 8523 + }, + { + "epoch": 2.268227780734433, + "grad_norm": 0.38621342182159424, + "learning_rate": 1.1624198677375793e-07, + "loss": 0.1901, + "step": 8524 + }, + { + "epoch": 2.2684938797232572, + "grad_norm": 0.37713515758514404, + "learning_rate": 1.1622532232459585e-07, + "loss": 0.2086, + "step": 8525 + }, + { + "epoch": 2.268759978712081, + "grad_norm": 0.24686992168426514, + "learning_rate": 1.1620865741265462e-07, + "loss": 0.1629, + "step": 8526 + }, + { + "epoch": 2.2690260777009046, + "grad_norm": 0.24711668491363525, + "learning_rate": 1.1619199203840953e-07, + "loss": 0.1593, + "step": 8527 + }, + { + "epoch": 2.2692921766897287, + "grad_norm": 0.4439961016178131, + "learning_rate": 1.1617532620233592e-07, + "loss": 0.2073, + "step": 8528 + }, + { + "epoch": 2.2695582756785524, + "grad_norm": 0.27458032965660095, + "learning_rate": 1.1615865990490914e-07, + "loss": 0.1698, + "step": 8529 + }, + { + "epoch": 2.269824374667376, + "grad_norm": 0.3025190234184265, + "learning_rate": 1.1614199314660455e-07, + "loss": 0.1868, + "step": 8530 + }, + { + "epoch": 2.2700904736562, + "grad_norm": 0.36978691816329956, + "learning_rate": 1.1612532592789748e-07, + "loss": 0.1925, + "step": 8531 + }, + { + "epoch": 2.270356572645024, + "grad_norm": 0.2807401418685913, + "learning_rate": 1.1610865824926337e-07, + "loss": 0.1766, + "step": 8532 + }, + { + "epoch": 2.2706226716338476, + "grad_norm": 0.4516507089138031, + "learning_rate": 1.1609199011117761e-07, + "loss": 0.1999, + "step": 8533 + }, + { + "epoch": 2.2708887706226717, + "grad_norm": 0.2749243378639221, + "learning_rate": 1.1607532151411559e-07, + "loss": 0.1818, + "step": 8534 + }, + { + "epoch": 2.2711548696114954, + "grad_norm": 0.2706201374530792, + "learning_rate": 1.1605865245855271e-07, + "loss": 0.1865, + "step": 8535 + }, + { + "epoch": 2.2714209686003195, + "grad_norm": 0.273859441280365, + "learning_rate": 1.1604198294496445e-07, + "loss": 0.1725, + "step": 8536 + }, + { + "epoch": 2.271687067589143, + "grad_norm": 0.31335684657096863, + "learning_rate": 1.1602531297382622e-07, + "loss": 0.1694, + "step": 8537 + }, + { + "epoch": 2.271953166577967, + "grad_norm": 0.27337872982025146, + "learning_rate": 1.1600864254561351e-07, + "loss": 0.1773, + "step": 8538 + }, + { + "epoch": 2.272219265566791, + "grad_norm": 0.2805534601211548, + "learning_rate": 1.1599197166080179e-07, + "loss": 0.1785, + "step": 8539 + }, + { + "epoch": 2.2724853645556147, + "grad_norm": 0.3359350860118866, + "learning_rate": 1.1597530031986653e-07, + "loss": 0.1734, + "step": 8540 + }, + { + "epoch": 2.2727514635444384, + "grad_norm": 0.33235985040664673, + "learning_rate": 1.1595862852328327e-07, + "loss": 0.1741, + "step": 8541 + }, + { + "epoch": 2.2730175625332625, + "grad_norm": 0.27415692806243896, + "learning_rate": 1.1594195627152743e-07, + "loss": 0.176, + "step": 8542 + }, + { + "epoch": 2.273283661522086, + "grad_norm": 0.41639912128448486, + "learning_rate": 1.1592528356507465e-07, + "loss": 0.1881, + "step": 8543 + }, + { + "epoch": 2.2735497605109103, + "grad_norm": 0.4182523787021637, + "learning_rate": 1.1590861040440043e-07, + "loss": 0.1841, + "step": 8544 + }, + { + "epoch": 2.273815859499734, + "grad_norm": 0.2897970676422119, + "learning_rate": 1.1589193678998028e-07, + "loss": 0.1901, + "step": 8545 + }, + { + "epoch": 2.2740819584885577, + "grad_norm": 0.35377129912376404, + "learning_rate": 1.1587526272228982e-07, + "loss": 0.1591, + "step": 8546 + }, + { + "epoch": 2.274348057477382, + "grad_norm": 0.27387872338294983, + "learning_rate": 1.1585858820180461e-07, + "loss": 0.1822, + "step": 8547 + }, + { + "epoch": 2.2746141564662055, + "grad_norm": 0.37211742997169495, + "learning_rate": 1.1584191322900023e-07, + "loss": 0.1839, + "step": 8548 + }, + { + "epoch": 2.274880255455029, + "grad_norm": 0.3029589056968689, + "learning_rate": 1.1582523780435227e-07, + "loss": 0.1818, + "step": 8549 + }, + { + "epoch": 2.2751463544438533, + "grad_norm": 0.3420413136482239, + "learning_rate": 1.158085619283364e-07, + "loss": 0.1998, + "step": 8550 + }, + { + "epoch": 2.275412453432677, + "grad_norm": 0.25327515602111816, + "learning_rate": 1.157918856014282e-07, + "loss": 0.1672, + "step": 8551 + }, + { + "epoch": 2.2756785524215006, + "grad_norm": 0.38086140155792236, + "learning_rate": 1.1577520882410334e-07, + "loss": 0.1924, + "step": 8552 + }, + { + "epoch": 2.2759446514103248, + "grad_norm": 0.25864648818969727, + "learning_rate": 1.1575853159683746e-07, + "loss": 0.1733, + "step": 8553 + }, + { + "epoch": 2.2762107503991484, + "grad_norm": 0.2654057443141937, + "learning_rate": 1.1574185392010624e-07, + "loss": 0.1751, + "step": 8554 + }, + { + "epoch": 2.276476849387972, + "grad_norm": 0.26384681463241577, + "learning_rate": 1.1572517579438539e-07, + "loss": 0.1787, + "step": 8555 + }, + { + "epoch": 2.2767429483767962, + "grad_norm": 0.2832886874675751, + "learning_rate": 1.1570849722015053e-07, + "loss": 0.1891, + "step": 8556 + }, + { + "epoch": 2.27700904736562, + "grad_norm": 0.3586408197879791, + "learning_rate": 1.1569181819787743e-07, + "loss": 0.1947, + "step": 8557 + }, + { + "epoch": 2.2772751463544436, + "grad_norm": 0.261176735162735, + "learning_rate": 1.1567513872804178e-07, + "loss": 0.1744, + "step": 8558 + }, + { + "epoch": 2.2775412453432677, + "grad_norm": 0.2978181540966034, + "learning_rate": 1.1565845881111933e-07, + "loss": 0.178, + "step": 8559 + }, + { + "epoch": 2.2778073443320914, + "grad_norm": 0.3591451644897461, + "learning_rate": 1.1564177844758577e-07, + "loss": 0.2111, + "step": 8560 + }, + { + "epoch": 2.2780734433209155, + "grad_norm": 0.3520423173904419, + "learning_rate": 1.1562509763791696e-07, + "loss": 0.1721, + "step": 8561 + }, + { + "epoch": 2.2783395423097392, + "grad_norm": 0.308118999004364, + "learning_rate": 1.1560841638258861e-07, + "loss": 0.192, + "step": 8562 + }, + { + "epoch": 2.278605641298563, + "grad_norm": 0.26406848430633545, + "learning_rate": 1.1559173468207645e-07, + "loss": 0.1777, + "step": 8563 + }, + { + "epoch": 2.278871740287387, + "grad_norm": 0.28686878085136414, + "learning_rate": 1.155750525368564e-07, + "loss": 0.1932, + "step": 8564 + }, + { + "epoch": 2.2791378392762107, + "grad_norm": 0.2680739462375641, + "learning_rate": 1.1555836994740418e-07, + "loss": 0.1782, + "step": 8565 + }, + { + "epoch": 2.2794039382650344, + "grad_norm": 0.3312866985797882, + "learning_rate": 1.1554168691419563e-07, + "loss": 0.1882, + "step": 8566 + }, + { + "epoch": 2.2796700372538585, + "grad_norm": 0.5011308193206787, + "learning_rate": 1.1552500343770657e-07, + "loss": 0.1858, + "step": 8567 + }, + { + "epoch": 2.279936136242682, + "grad_norm": 0.36331748962402344, + "learning_rate": 1.1550831951841287e-07, + "loss": 0.1887, + "step": 8568 + }, + { + "epoch": 2.2802022352315063, + "grad_norm": 0.2731981575489044, + "learning_rate": 1.1549163515679039e-07, + "loss": 0.19, + "step": 8569 + }, + { + "epoch": 2.28046833422033, + "grad_norm": 0.2879418730735779, + "learning_rate": 1.15474950353315e-07, + "loss": 0.175, + "step": 8570 + }, + { + "epoch": 2.2807344332091537, + "grad_norm": 0.33100584149360657, + "learning_rate": 1.1545826510846258e-07, + "loss": 0.1818, + "step": 8571 + }, + { + "epoch": 2.281000532197978, + "grad_norm": 0.5017268657684326, + "learning_rate": 1.15441579422709e-07, + "loss": 0.1959, + "step": 8572 + }, + { + "epoch": 2.2812666311868015, + "grad_norm": 0.4032279849052429, + "learning_rate": 1.1542489329653022e-07, + "loss": 0.1967, + "step": 8573 + }, + { + "epoch": 2.281532730175625, + "grad_norm": 0.280240923166275, + "learning_rate": 1.1540820673040215e-07, + "loss": 0.1863, + "step": 8574 + }, + { + "epoch": 2.2817988291644493, + "grad_norm": 0.27397674322128296, + "learning_rate": 1.1539151972480067e-07, + "loss": 0.1786, + "step": 8575 + }, + { + "epoch": 2.282064928153273, + "grad_norm": 0.3415622115135193, + "learning_rate": 1.1537483228020182e-07, + "loss": 0.1745, + "step": 8576 + }, + { + "epoch": 2.2823310271420967, + "grad_norm": 0.36317354440689087, + "learning_rate": 1.1535814439708149e-07, + "loss": 0.186, + "step": 8577 + }, + { + "epoch": 2.282597126130921, + "grad_norm": 0.29513904452323914, + "learning_rate": 1.1534145607591566e-07, + "loss": 0.1707, + "step": 8578 + }, + { + "epoch": 2.2828632251197445, + "grad_norm": 0.3687809109687805, + "learning_rate": 1.1532476731718035e-07, + "loss": 0.1828, + "step": 8579 + }, + { + "epoch": 2.283129324108568, + "grad_norm": 0.3551202118396759, + "learning_rate": 1.1530807812135153e-07, + "loss": 0.1871, + "step": 8580 + }, + { + "epoch": 2.2833954230973923, + "grad_norm": 0.3251189887523651, + "learning_rate": 1.1529138848890521e-07, + "loss": 0.1802, + "step": 8581 + }, + { + "epoch": 2.283661522086216, + "grad_norm": 0.41751357913017273, + "learning_rate": 1.1527469842031741e-07, + "loss": 0.2156, + "step": 8582 + }, + { + "epoch": 2.28392762107504, + "grad_norm": 0.27300921082496643, + "learning_rate": 1.1525800791606417e-07, + "loss": 0.1762, + "step": 8583 + }, + { + "epoch": 2.284193720063864, + "grad_norm": 0.3772154748439789, + "learning_rate": 1.1524131697662156e-07, + "loss": 0.197, + "step": 8584 + }, + { + "epoch": 2.2844598190526875, + "grad_norm": 0.2571784555912018, + "learning_rate": 1.152246256024656e-07, + "loss": 0.1798, + "step": 8585 + }, + { + "epoch": 2.2847259180415116, + "grad_norm": 0.3136603534221649, + "learning_rate": 1.1520793379407235e-07, + "loss": 0.188, + "step": 8586 + }, + { + "epoch": 2.2849920170303353, + "grad_norm": 0.3004150092601776, + "learning_rate": 1.1519124155191798e-07, + "loss": 0.1853, + "step": 8587 + }, + { + "epoch": 2.285258116019159, + "grad_norm": 0.26190680265426636, + "learning_rate": 1.1517454887647851e-07, + "loss": 0.1649, + "step": 8588 + }, + { + "epoch": 2.285524215007983, + "grad_norm": 0.2584821283817291, + "learning_rate": 1.1515785576823005e-07, + "loss": 0.1823, + "step": 8589 + }, + { + "epoch": 2.2857903139968068, + "grad_norm": 0.35154008865356445, + "learning_rate": 1.1514116222764876e-07, + "loss": 0.203, + "step": 8590 + }, + { + "epoch": 2.286056412985631, + "grad_norm": 0.2902315855026245, + "learning_rate": 1.1512446825521076e-07, + "loss": 0.1988, + "step": 8591 + }, + { + "epoch": 2.2863225119744546, + "grad_norm": 0.2715817093849182, + "learning_rate": 1.1510777385139219e-07, + "loss": 0.1769, + "step": 8592 + }, + { + "epoch": 2.2865886109632783, + "grad_norm": 0.3995974361896515, + "learning_rate": 1.1509107901666919e-07, + "loss": 0.1892, + "step": 8593 + }, + { + "epoch": 2.2868547099521024, + "grad_norm": 0.44253969192504883, + "learning_rate": 1.1507438375151797e-07, + "loss": 0.1924, + "step": 8594 + }, + { + "epoch": 2.287120808940926, + "grad_norm": 0.2751571536064148, + "learning_rate": 1.1505768805641469e-07, + "loss": 0.1868, + "step": 8595 + }, + { + "epoch": 2.2873869079297497, + "grad_norm": 0.2633489966392517, + "learning_rate": 1.1504099193183554e-07, + "loss": 0.1942, + "step": 8596 + }, + { + "epoch": 2.287653006918574, + "grad_norm": 0.30342596769332886, + "learning_rate": 1.1502429537825673e-07, + "loss": 0.1821, + "step": 8597 + }, + { + "epoch": 2.2879191059073976, + "grad_norm": 0.3472881615161896, + "learning_rate": 1.1500759839615451e-07, + "loss": 0.1954, + "step": 8598 + }, + { + "epoch": 2.2881852048962212, + "grad_norm": 0.3489226698875427, + "learning_rate": 1.1499090098600508e-07, + "loss": 0.1805, + "step": 8599 + }, + { + "epoch": 2.2884513038850454, + "grad_norm": 0.44300246238708496, + "learning_rate": 1.1497420314828467e-07, + "loss": 0.2001, + "step": 8600 + }, + { + "epoch": 2.288717402873869, + "grad_norm": 0.27875640988349915, + "learning_rate": 1.1495750488346958e-07, + "loss": 0.1914, + "step": 8601 + }, + { + "epoch": 2.2889835018626927, + "grad_norm": 0.27104854583740234, + "learning_rate": 1.1494080619203605e-07, + "loss": 0.1749, + "step": 8602 + }, + { + "epoch": 2.289249600851517, + "grad_norm": 0.3490144908428192, + "learning_rate": 1.1492410707446035e-07, + "loss": 0.1726, + "step": 8603 + }, + { + "epoch": 2.2895156998403405, + "grad_norm": 0.27658751606941223, + "learning_rate": 1.1490740753121877e-07, + "loss": 0.1719, + "step": 8604 + }, + { + "epoch": 2.289781798829164, + "grad_norm": 0.45679906010627747, + "learning_rate": 1.1489070756278767e-07, + "loss": 0.2261, + "step": 8605 + }, + { + "epoch": 2.2900478978179883, + "grad_norm": 0.24815818667411804, + "learning_rate": 1.1487400716964335e-07, + "loss": 0.1647, + "step": 8606 + }, + { + "epoch": 2.290313996806812, + "grad_norm": 0.34502890706062317, + "learning_rate": 1.1485730635226207e-07, + "loss": 0.176, + "step": 8607 + }, + { + "epoch": 2.290580095795636, + "grad_norm": 0.37544986605644226, + "learning_rate": 1.1484060511112025e-07, + "loss": 0.186, + "step": 8608 + }, + { + "epoch": 2.29084619478446, + "grad_norm": 0.3705320954322815, + "learning_rate": 1.1482390344669421e-07, + "loss": 0.1962, + "step": 8609 + }, + { + "epoch": 2.2911122937732835, + "grad_norm": 0.3935069143772125, + "learning_rate": 1.1480720135946034e-07, + "loss": 0.1906, + "step": 8610 + }, + { + "epoch": 2.2913783927621076, + "grad_norm": 0.3679819703102112, + "learning_rate": 1.1479049884989496e-07, + "loss": 0.1898, + "step": 8611 + }, + { + "epoch": 2.2916444917509313, + "grad_norm": 0.28418710827827454, + "learning_rate": 1.1477379591847454e-07, + "loss": 0.1896, + "step": 8612 + }, + { + "epoch": 2.291910590739755, + "grad_norm": 0.24399533867835999, + "learning_rate": 1.147570925656754e-07, + "loss": 0.1815, + "step": 8613 + }, + { + "epoch": 2.292176689728579, + "grad_norm": 0.26368847489356995, + "learning_rate": 1.14740388791974e-07, + "loss": 0.1715, + "step": 8614 + }, + { + "epoch": 2.292442788717403, + "grad_norm": 0.3904073238372803, + "learning_rate": 1.1472368459784678e-07, + "loss": 0.2076, + "step": 8615 + }, + { + "epoch": 2.292708887706227, + "grad_norm": 0.3900543451309204, + "learning_rate": 1.1470697998377015e-07, + "loss": 0.1965, + "step": 8616 + }, + { + "epoch": 2.2929749866950506, + "grad_norm": 0.36464613676071167, + "learning_rate": 1.1469027495022054e-07, + "loss": 0.1803, + "step": 8617 + }, + { + "epoch": 2.2932410856838743, + "grad_norm": 0.26254144310951233, + "learning_rate": 1.1467356949767446e-07, + "loss": 0.1707, + "step": 8618 + }, + { + "epoch": 2.2935071846726984, + "grad_norm": 0.6493503451347351, + "learning_rate": 1.1465686362660836e-07, + "loss": 0.1931, + "step": 8619 + }, + { + "epoch": 2.293773283661522, + "grad_norm": 0.2880952060222626, + "learning_rate": 1.146401573374987e-07, + "loss": 0.1705, + "step": 8620 + }, + { + "epoch": 2.294039382650346, + "grad_norm": 0.2968619465827942, + "learning_rate": 1.1462345063082202e-07, + "loss": 0.1726, + "step": 8621 + }, + { + "epoch": 2.29430548163917, + "grad_norm": 0.4106273353099823, + "learning_rate": 1.1460674350705482e-07, + "loss": 0.1992, + "step": 8622 + }, + { + "epoch": 2.2945715806279936, + "grad_norm": 0.29243531823158264, + "learning_rate": 1.1459003596667358e-07, + "loss": 0.1858, + "step": 8623 + }, + { + "epoch": 2.2948376796168173, + "grad_norm": 0.3823285400867462, + "learning_rate": 1.145733280101549e-07, + "loss": 0.1881, + "step": 8624 + }, + { + "epoch": 2.2951037786056414, + "grad_norm": 0.303983211517334, + "learning_rate": 1.145566196379753e-07, + "loss": 0.1778, + "step": 8625 + }, + { + "epoch": 2.295369877594465, + "grad_norm": 0.27813422679901123, + "learning_rate": 1.1453991085061127e-07, + "loss": 0.1822, + "step": 8626 + }, + { + "epoch": 2.2956359765832888, + "grad_norm": 0.298435240983963, + "learning_rate": 1.1452320164853949e-07, + "loss": 0.1901, + "step": 8627 + }, + { + "epoch": 2.295902075572113, + "grad_norm": 0.5132699012756348, + "learning_rate": 1.1450649203223648e-07, + "loss": 0.2056, + "step": 8628 + }, + { + "epoch": 2.2961681745609366, + "grad_norm": 0.3458424210548401, + "learning_rate": 1.144897820021788e-07, + "loss": 0.1784, + "step": 8629 + }, + { + "epoch": 2.2964342735497603, + "grad_norm": 0.26071903109550476, + "learning_rate": 1.1447307155884313e-07, + "loss": 0.1767, + "step": 8630 + }, + { + "epoch": 2.2967003725385844, + "grad_norm": 0.34066569805145264, + "learning_rate": 1.1445636070270606e-07, + "loss": 0.1791, + "step": 8631 + }, + { + "epoch": 2.296966471527408, + "grad_norm": 0.35904955863952637, + "learning_rate": 1.1443964943424418e-07, + "loss": 0.191, + "step": 8632 + }, + { + "epoch": 2.297232570516232, + "grad_norm": 0.2863462269306183, + "learning_rate": 1.1442293775393414e-07, + "loss": 0.1913, + "step": 8633 + }, + { + "epoch": 2.297498669505056, + "grad_norm": 0.27448877692222595, + "learning_rate": 1.1440622566225264e-07, + "loss": 0.1658, + "step": 8634 + }, + { + "epoch": 2.2977647684938796, + "grad_norm": 0.3313087522983551, + "learning_rate": 1.1438951315967631e-07, + "loss": 0.1741, + "step": 8635 + }, + { + "epoch": 2.2980308674827037, + "grad_norm": 0.3379512429237366, + "learning_rate": 1.1437280024668179e-07, + "loss": 0.1865, + "step": 8636 + }, + { + "epoch": 2.2982969664715274, + "grad_norm": 0.35444822907447815, + "learning_rate": 1.1435608692374583e-07, + "loss": 0.1868, + "step": 8637 + }, + { + "epoch": 2.2985630654603515, + "grad_norm": 0.2716233730316162, + "learning_rate": 1.143393731913451e-07, + "loss": 0.1749, + "step": 8638 + }, + { + "epoch": 2.298829164449175, + "grad_norm": 0.28072938323020935, + "learning_rate": 1.143226590499563e-07, + "loss": 0.1795, + "step": 8639 + }, + { + "epoch": 2.299095263437999, + "grad_norm": 0.39617154002189636, + "learning_rate": 1.1430594450005613e-07, + "loss": 0.1781, + "step": 8640 + }, + { + "epoch": 2.299361362426823, + "grad_norm": 0.27683404088020325, + "learning_rate": 1.1428922954212137e-07, + "loss": 0.1752, + "step": 8641 + }, + { + "epoch": 2.2996274614156467, + "grad_norm": 0.2907712161540985, + "learning_rate": 1.1427251417662875e-07, + "loss": 0.1853, + "step": 8642 + }, + { + "epoch": 2.2998935604044703, + "grad_norm": 0.29655829071998596, + "learning_rate": 1.1425579840405504e-07, + "loss": 0.1853, + "step": 8643 + }, + { + "epoch": 2.3001596593932945, + "grad_norm": 0.432037353515625, + "learning_rate": 1.1423908222487697e-07, + "loss": 0.1821, + "step": 8644 + }, + { + "epoch": 2.300425758382118, + "grad_norm": 0.2842944264411926, + "learning_rate": 1.1422236563957135e-07, + "loss": 0.1833, + "step": 8645 + }, + { + "epoch": 2.300691857370942, + "grad_norm": 0.28328582644462585, + "learning_rate": 1.1420564864861495e-07, + "loss": 0.1758, + "step": 8646 + }, + { + "epoch": 2.300957956359766, + "grad_norm": 0.2667887508869171, + "learning_rate": 1.1418893125248459e-07, + "loss": 0.1672, + "step": 8647 + }, + { + "epoch": 2.3012240553485896, + "grad_norm": 0.32429808378219604, + "learning_rate": 1.1417221345165708e-07, + "loss": 0.1792, + "step": 8648 + }, + { + "epoch": 2.3014901543374133, + "grad_norm": 0.32447952032089233, + "learning_rate": 1.1415549524660924e-07, + "loss": 0.1905, + "step": 8649 + }, + { + "epoch": 2.3017562533262375, + "grad_norm": 0.42415520548820496, + "learning_rate": 1.1413877663781792e-07, + "loss": 0.2025, + "step": 8650 + }, + { + "epoch": 2.302022352315061, + "grad_norm": 0.28139713406562805, + "learning_rate": 1.1412205762575993e-07, + "loss": 0.177, + "step": 8651 + }, + { + "epoch": 2.302288451303885, + "grad_norm": 0.3447434604167938, + "learning_rate": 1.141053382109122e-07, + "loss": 0.2013, + "step": 8652 + }, + { + "epoch": 2.302554550292709, + "grad_norm": 0.43214523792266846, + "learning_rate": 1.1408861839375155e-07, + "loss": 0.1928, + "step": 8653 + }, + { + "epoch": 2.3028206492815326, + "grad_norm": 0.32589805126190186, + "learning_rate": 1.1407189817475484e-07, + "loss": 0.1697, + "step": 8654 + }, + { + "epoch": 2.3030867482703568, + "grad_norm": 0.3322322368621826, + "learning_rate": 1.1405517755439902e-07, + "loss": 0.2004, + "step": 8655 + }, + { + "epoch": 2.3033528472591804, + "grad_norm": 0.2467334121465683, + "learning_rate": 1.14038456533161e-07, + "loss": 0.1731, + "step": 8656 + }, + { + "epoch": 2.303618946248004, + "grad_norm": 0.24740558862686157, + "learning_rate": 1.1402173511151765e-07, + "loss": 0.1525, + "step": 8657 + }, + { + "epoch": 2.3038850452368282, + "grad_norm": 0.37280213832855225, + "learning_rate": 1.1400501328994589e-07, + "loss": 0.1746, + "step": 8658 + }, + { + "epoch": 2.304151144225652, + "grad_norm": 0.291014701128006, + "learning_rate": 1.1398829106892271e-07, + "loss": 0.19, + "step": 8659 + }, + { + "epoch": 2.3044172432144756, + "grad_norm": 0.24815574288368225, + "learning_rate": 1.1397156844892506e-07, + "loss": 0.1729, + "step": 8660 + }, + { + "epoch": 2.3046833422032997, + "grad_norm": 0.27318209409713745, + "learning_rate": 1.139548454304299e-07, + "loss": 0.1801, + "step": 8661 + }, + { + "epoch": 2.3049494411921234, + "grad_norm": 0.3847340941429138, + "learning_rate": 1.1393812201391415e-07, + "loss": 0.1848, + "step": 8662 + }, + { + "epoch": 2.3052155401809475, + "grad_norm": 0.38061758875846863, + "learning_rate": 1.1392139819985486e-07, + "loss": 0.1864, + "step": 8663 + }, + { + "epoch": 2.3054816391697712, + "grad_norm": 0.36878737807273865, + "learning_rate": 1.13904673988729e-07, + "loss": 0.2003, + "step": 8664 + }, + { + "epoch": 2.305747738158595, + "grad_norm": 0.3773016333580017, + "learning_rate": 1.1388794938101359e-07, + "loss": 0.1672, + "step": 8665 + }, + { + "epoch": 2.306013837147419, + "grad_norm": 0.27906933426856995, + "learning_rate": 1.1387122437718564e-07, + "loss": 0.1771, + "step": 8666 + }, + { + "epoch": 2.3062799361362427, + "grad_norm": 0.2898276448249817, + "learning_rate": 1.1385449897772217e-07, + "loss": 0.169, + "step": 8667 + }, + { + "epoch": 2.3065460351250664, + "grad_norm": 0.3206211030483246, + "learning_rate": 1.1383777318310027e-07, + "loss": 0.1601, + "step": 8668 + }, + { + "epoch": 2.3068121341138905, + "grad_norm": 0.31625673174858093, + "learning_rate": 1.1382104699379691e-07, + "loss": 0.1652, + "step": 8669 + }, + { + "epoch": 2.307078233102714, + "grad_norm": 0.38978976011276245, + "learning_rate": 1.1380432041028922e-07, + "loss": 0.1842, + "step": 8670 + }, + { + "epoch": 2.307344332091538, + "grad_norm": 0.2728292644023895, + "learning_rate": 1.1378759343305428e-07, + "loss": 0.182, + "step": 8671 + }, + { + "epoch": 2.307610431080362, + "grad_norm": 0.27640071511268616, + "learning_rate": 1.1377086606256914e-07, + "loss": 0.1951, + "step": 8672 + }, + { + "epoch": 2.3078765300691857, + "grad_norm": 0.35159075260162354, + "learning_rate": 1.1375413829931094e-07, + "loss": 0.1729, + "step": 8673 + }, + { + "epoch": 2.3081426290580094, + "grad_norm": 0.2882465720176697, + "learning_rate": 1.1373741014375672e-07, + "loss": 0.1932, + "step": 8674 + }, + { + "epoch": 2.3084087280468335, + "grad_norm": 0.2974115014076233, + "learning_rate": 1.137206815963837e-07, + "loss": 0.1812, + "step": 8675 + }, + { + "epoch": 2.308674827035657, + "grad_norm": 0.2604026794433594, + "learning_rate": 1.1370395265766892e-07, + "loss": 0.1862, + "step": 8676 + }, + { + "epoch": 2.308940926024481, + "grad_norm": 0.2654649615287781, + "learning_rate": 1.1368722332808955e-07, + "loss": 0.1794, + "step": 8677 + }, + { + "epoch": 2.309207025013305, + "grad_norm": 0.297340452671051, + "learning_rate": 1.1367049360812279e-07, + "loss": 0.2112, + "step": 8678 + }, + { + "epoch": 2.3094731240021287, + "grad_norm": 0.4506112039089203, + "learning_rate": 1.1365376349824578e-07, + "loss": 0.19, + "step": 8679 + }, + { + "epoch": 2.309739222990953, + "grad_norm": 0.34522750973701477, + "learning_rate": 1.1363703299893565e-07, + "loss": 0.1849, + "step": 8680 + }, + { + "epoch": 2.3100053219797765, + "grad_norm": 0.25956448912620544, + "learning_rate": 1.1362030211066966e-07, + "loss": 0.1573, + "step": 8681 + }, + { + "epoch": 2.3102714209686, + "grad_norm": 0.3480064272880554, + "learning_rate": 1.1360357083392497e-07, + "loss": 0.1995, + "step": 8682 + }, + { + "epoch": 2.3105375199574243, + "grad_norm": 0.2694728970527649, + "learning_rate": 1.1358683916917881e-07, + "loss": 0.1772, + "step": 8683 + }, + { + "epoch": 2.310803618946248, + "grad_norm": 0.33670833706855774, + "learning_rate": 1.1357010711690836e-07, + "loss": 0.1884, + "step": 8684 + }, + { + "epoch": 2.3110697179350717, + "grad_norm": 0.2803281545639038, + "learning_rate": 1.1355337467759089e-07, + "loss": 0.1806, + "step": 8685 + }, + { + "epoch": 2.311335816923896, + "grad_norm": 0.31452053785324097, + "learning_rate": 1.1353664185170365e-07, + "loss": 0.1685, + "step": 8686 + }, + { + "epoch": 2.3116019159127195, + "grad_norm": 0.27207139134407043, + "learning_rate": 1.1351990863972383e-07, + "loss": 0.1815, + "step": 8687 + }, + { + "epoch": 2.3118680149015436, + "grad_norm": 0.2535879611968994, + "learning_rate": 1.1350317504212878e-07, + "loss": 0.1537, + "step": 8688 + }, + { + "epoch": 2.3121341138903673, + "grad_norm": 0.3084001839160919, + "learning_rate": 1.1348644105939574e-07, + "loss": 0.1862, + "step": 8689 + }, + { + "epoch": 2.312400212879191, + "grad_norm": 0.25773200392723083, + "learning_rate": 1.1346970669200198e-07, + "loss": 0.1578, + "step": 8690 + }, + { + "epoch": 2.312666311868015, + "grad_norm": 0.2580617368221283, + "learning_rate": 1.1345297194042482e-07, + "loss": 0.1647, + "step": 8691 + }, + { + "epoch": 2.3129324108568388, + "grad_norm": 0.35710838437080383, + "learning_rate": 1.1343623680514156e-07, + "loss": 0.2003, + "step": 8692 + }, + { + "epoch": 2.3131985098456624, + "grad_norm": 0.2689075469970703, + "learning_rate": 1.1341950128662954e-07, + "loss": 0.1671, + "step": 8693 + }, + { + "epoch": 2.3134646088344866, + "grad_norm": 0.2855844497680664, + "learning_rate": 1.1340276538536604e-07, + "loss": 0.1762, + "step": 8694 + }, + { + "epoch": 2.3137307078233103, + "grad_norm": 0.33472880721092224, + "learning_rate": 1.1338602910182844e-07, + "loss": 0.1923, + "step": 8695 + }, + { + "epoch": 2.313996806812134, + "grad_norm": 0.29470357298851013, + "learning_rate": 1.1336929243649409e-07, + "loss": 0.1942, + "step": 8696 + }, + { + "epoch": 2.314262905800958, + "grad_norm": 0.28725236654281616, + "learning_rate": 1.1335255538984036e-07, + "loss": 0.1819, + "step": 8697 + }, + { + "epoch": 2.3145290047897817, + "grad_norm": 0.3190770447254181, + "learning_rate": 1.1333581796234461e-07, + "loss": 0.2182, + "step": 8698 + }, + { + "epoch": 2.3147951037786054, + "grad_norm": 0.2787568271160126, + "learning_rate": 1.1331908015448426e-07, + "loss": 0.175, + "step": 8699 + }, + { + "epoch": 2.3150612027674295, + "grad_norm": 0.26847541332244873, + "learning_rate": 1.1330234196673667e-07, + "loss": 0.1717, + "step": 8700 + }, + { + "epoch": 2.3153273017562532, + "grad_norm": 0.35417330265045166, + "learning_rate": 1.1328560339957925e-07, + "loss": 0.1906, + "step": 8701 + }, + { + "epoch": 2.3155934007450774, + "grad_norm": 0.3099732995033264, + "learning_rate": 1.132688644534894e-07, + "loss": 0.1618, + "step": 8702 + }, + { + "epoch": 2.315859499733901, + "grad_norm": 0.25976452231407166, + "learning_rate": 1.132521251289446e-07, + "loss": 0.1596, + "step": 8703 + }, + { + "epoch": 2.3161255987227247, + "grad_norm": 0.2551976442337036, + "learning_rate": 1.1323538542642226e-07, + "loss": 0.1779, + "step": 8704 + }, + { + "epoch": 2.316391697711549, + "grad_norm": 0.34194880723953247, + "learning_rate": 1.132186453463998e-07, + "loss": 0.1908, + "step": 8705 + }, + { + "epoch": 2.3166577967003725, + "grad_norm": 0.26449084281921387, + "learning_rate": 1.1320190488935474e-07, + "loss": 0.1927, + "step": 8706 + }, + { + "epoch": 2.316923895689196, + "grad_norm": 0.25859081745147705, + "learning_rate": 1.1318516405576452e-07, + "loss": 0.1749, + "step": 8707 + }, + { + "epoch": 2.3171899946780203, + "grad_norm": 0.35462814569473267, + "learning_rate": 1.1316842284610663e-07, + "loss": 0.1958, + "step": 8708 + }, + { + "epoch": 2.317456093666844, + "grad_norm": 0.44344401359558105, + "learning_rate": 1.1315168126085857e-07, + "loss": 0.2009, + "step": 8709 + }, + { + "epoch": 2.317722192655668, + "grad_norm": 0.26072368025779724, + "learning_rate": 1.1313493930049782e-07, + "loss": 0.1691, + "step": 8710 + }, + { + "epoch": 2.317988291644492, + "grad_norm": 0.2729509770870209, + "learning_rate": 1.1311819696550193e-07, + "loss": 0.1866, + "step": 8711 + }, + { + "epoch": 2.3182543906333155, + "grad_norm": 0.2902708351612091, + "learning_rate": 1.1310145425634839e-07, + "loss": 0.1828, + "step": 8712 + }, + { + "epoch": 2.3185204896221396, + "grad_norm": 0.2671089172363281, + "learning_rate": 1.1308471117351475e-07, + "loss": 0.1716, + "step": 8713 + }, + { + "epoch": 2.3187865886109633, + "grad_norm": 0.2666877210140228, + "learning_rate": 1.1306796771747853e-07, + "loss": 0.1763, + "step": 8714 + }, + { + "epoch": 2.319052687599787, + "grad_norm": 0.27314797043800354, + "learning_rate": 1.1305122388871736e-07, + "loss": 0.1852, + "step": 8715 + }, + { + "epoch": 2.319318786588611, + "grad_norm": 0.32774990797042847, + "learning_rate": 1.1303447968770875e-07, + "loss": 0.184, + "step": 8716 + }, + { + "epoch": 2.319584885577435, + "grad_norm": 0.24991632997989655, + "learning_rate": 1.1301773511493027e-07, + "loss": 0.1663, + "step": 8717 + }, + { + "epoch": 2.3198509845662585, + "grad_norm": 0.26687273383140564, + "learning_rate": 1.1300099017085955e-07, + "loss": 0.1749, + "step": 8718 + }, + { + "epoch": 2.3201170835550826, + "grad_norm": 0.23706704378128052, + "learning_rate": 1.129842448559742e-07, + "loss": 0.1574, + "step": 8719 + }, + { + "epoch": 2.3203831825439063, + "grad_norm": 0.2869854271411896, + "learning_rate": 1.1296749917075175e-07, + "loss": 0.1771, + "step": 8720 + }, + { + "epoch": 2.32064928153273, + "grad_norm": 0.3420490026473999, + "learning_rate": 1.129507531156699e-07, + "loss": 0.1904, + "step": 8721 + }, + { + "epoch": 2.320915380521554, + "grad_norm": 0.26992571353912354, + "learning_rate": 1.1293400669120625e-07, + "loss": 0.1675, + "step": 8722 + }, + { + "epoch": 2.321181479510378, + "grad_norm": 0.3277239501476288, + "learning_rate": 1.1291725989783845e-07, + "loss": 0.1676, + "step": 8723 + }, + { + "epoch": 2.3214475784992015, + "grad_norm": 0.29147642850875854, + "learning_rate": 1.1290051273604415e-07, + "loss": 0.1834, + "step": 8724 + }, + { + "epoch": 2.3217136774880256, + "grad_norm": 0.3436546325683594, + "learning_rate": 1.12883765206301e-07, + "loss": 0.1731, + "step": 8725 + }, + { + "epoch": 2.3219797764768493, + "grad_norm": 0.33864569664001465, + "learning_rate": 1.128670173090867e-07, + "loss": 0.1847, + "step": 8726 + }, + { + "epoch": 2.3222458754656734, + "grad_norm": 0.3990749716758728, + "learning_rate": 1.128502690448789e-07, + "loss": 0.2194, + "step": 8727 + }, + { + "epoch": 2.322511974454497, + "grad_norm": 0.2704707086086273, + "learning_rate": 1.1283352041415533e-07, + "loss": 0.183, + "step": 8728 + }, + { + "epoch": 2.3227780734433208, + "grad_norm": 0.27676713466644287, + "learning_rate": 1.1281677141739367e-07, + "loss": 0.1735, + "step": 8729 + }, + { + "epoch": 2.323044172432145, + "grad_norm": 0.26138144731521606, + "learning_rate": 1.1280002205507164e-07, + "loss": 0.1857, + "step": 8730 + }, + { + "epoch": 2.3233102714209686, + "grad_norm": 0.2709175646305084, + "learning_rate": 1.1278327232766697e-07, + "loss": 0.1765, + "step": 8731 + }, + { + "epoch": 2.3235763704097923, + "grad_norm": 0.2613798677921295, + "learning_rate": 1.1276652223565741e-07, + "loss": 0.1763, + "step": 8732 + }, + { + "epoch": 2.3238424693986164, + "grad_norm": 0.2654668390750885, + "learning_rate": 1.1274977177952069e-07, + "loss": 0.1709, + "step": 8733 + }, + { + "epoch": 2.32410856838744, + "grad_norm": 0.34326937794685364, + "learning_rate": 1.1273302095973456e-07, + "loss": 0.1706, + "step": 8734 + }, + { + "epoch": 2.324374667376264, + "grad_norm": 0.43296390771865845, + "learning_rate": 1.1271626977677679e-07, + "loss": 0.1777, + "step": 8735 + }, + { + "epoch": 2.324640766365088, + "grad_norm": 0.3284471929073334, + "learning_rate": 1.126995182311252e-07, + "loss": 0.1864, + "step": 8736 + }, + { + "epoch": 2.3249068653539116, + "grad_norm": 0.27499961853027344, + "learning_rate": 1.1268276632325753e-07, + "loss": 0.1869, + "step": 8737 + }, + { + "epoch": 2.3251729643427357, + "grad_norm": 0.41961121559143066, + "learning_rate": 1.1266601405365157e-07, + "loss": 0.179, + "step": 8738 + }, + { + "epoch": 2.3254390633315594, + "grad_norm": 0.2842872142791748, + "learning_rate": 1.1264926142278517e-07, + "loss": 0.1868, + "step": 8739 + }, + { + "epoch": 2.325705162320383, + "grad_norm": 0.28811517357826233, + "learning_rate": 1.1263250843113615e-07, + "loss": 0.2049, + "step": 8740 + }, + { + "epoch": 2.325971261309207, + "grad_norm": 0.2611024081707001, + "learning_rate": 1.1261575507918226e-07, + "loss": 0.1802, + "step": 8741 + }, + { + "epoch": 2.326237360298031, + "grad_norm": 0.29731324315071106, + "learning_rate": 1.1259900136740146e-07, + "loss": 0.1764, + "step": 8742 + }, + { + "epoch": 2.3265034592868545, + "grad_norm": 0.3956061601638794, + "learning_rate": 1.1258224729627151e-07, + "loss": 0.1991, + "step": 8743 + }, + { + "epoch": 2.3267695582756787, + "grad_norm": 0.4523504674434662, + "learning_rate": 1.1256549286627031e-07, + "loss": 0.1746, + "step": 8744 + }, + { + "epoch": 2.3270356572645023, + "grad_norm": 0.27910315990448, + "learning_rate": 1.1254873807787568e-07, + "loss": 0.19, + "step": 8745 + }, + { + "epoch": 2.327301756253326, + "grad_norm": 0.2785157561302185, + "learning_rate": 1.1253198293156558e-07, + "loss": 0.1886, + "step": 8746 + }, + { + "epoch": 2.32756785524215, + "grad_norm": 0.29235994815826416, + "learning_rate": 1.1251522742781787e-07, + "loss": 0.1739, + "step": 8747 + }, + { + "epoch": 2.327833954230974, + "grad_norm": 0.4026066064834595, + "learning_rate": 1.124984715671104e-07, + "loss": 0.2066, + "step": 8748 + }, + { + "epoch": 2.3281000532197975, + "grad_norm": 0.29826170206069946, + "learning_rate": 1.1248171534992114e-07, + "loss": 0.1895, + "step": 8749 + }, + { + "epoch": 2.3283661522086216, + "grad_norm": 0.4469897449016571, + "learning_rate": 1.12464958776728e-07, + "loss": 0.2003, + "step": 8750 + }, + { + "epoch": 2.3286322511974453, + "grad_norm": 0.2893070876598358, + "learning_rate": 1.1244820184800889e-07, + "loss": 0.1671, + "step": 8751 + }, + { + "epoch": 2.3288983501862695, + "grad_norm": 0.5055080056190491, + "learning_rate": 1.124314445642418e-07, + "loss": 0.1928, + "step": 8752 + }, + { + "epoch": 2.329164449175093, + "grad_norm": 0.386476993560791, + "learning_rate": 1.1241468692590461e-07, + "loss": 0.1902, + "step": 8753 + }, + { + "epoch": 2.329430548163917, + "grad_norm": 0.2544551491737366, + "learning_rate": 1.1239792893347535e-07, + "loss": 0.1586, + "step": 8754 + }, + { + "epoch": 2.329696647152741, + "grad_norm": 0.26776981353759766, + "learning_rate": 1.1238117058743197e-07, + "loss": 0.1792, + "step": 8755 + }, + { + "epoch": 2.3299627461415646, + "grad_norm": 0.24696232378482819, + "learning_rate": 1.1236441188825241e-07, + "loss": 0.1706, + "step": 8756 + }, + { + "epoch": 2.3302288451303887, + "grad_norm": 0.2921820282936096, + "learning_rate": 1.1234765283641472e-07, + "loss": 0.1862, + "step": 8757 + }, + { + "epoch": 2.3304949441192124, + "grad_norm": 0.3714800477027893, + "learning_rate": 1.123308934323969e-07, + "loss": 0.1995, + "step": 8758 + }, + { + "epoch": 2.330761043108036, + "grad_norm": 0.2970832288265228, + "learning_rate": 1.1231413367667693e-07, + "loss": 0.1796, + "step": 8759 + }, + { + "epoch": 2.3310271420968602, + "grad_norm": 0.2593066096305847, + "learning_rate": 1.1229737356973282e-07, + "loss": 0.1774, + "step": 8760 + }, + { + "epoch": 2.331293241085684, + "grad_norm": 0.25297266244888306, + "learning_rate": 1.1228061311204262e-07, + "loss": 0.1566, + "step": 8761 + }, + { + "epoch": 2.3315593400745076, + "grad_norm": 0.3464639186859131, + "learning_rate": 1.1226385230408442e-07, + "loss": 0.1871, + "step": 8762 + }, + { + "epoch": 2.3318254390633317, + "grad_norm": 0.3575940430164337, + "learning_rate": 1.1224709114633619e-07, + "loss": 0.2014, + "step": 8763 + }, + { + "epoch": 2.3320915380521554, + "grad_norm": 0.2999721169471741, + "learning_rate": 1.1223032963927605e-07, + "loss": 0.1962, + "step": 8764 + }, + { + "epoch": 2.332357637040979, + "grad_norm": 0.27553239464759827, + "learning_rate": 1.1221356778338207e-07, + "loss": 0.1624, + "step": 8765 + }, + { + "epoch": 2.332623736029803, + "grad_norm": 0.3366338908672333, + "learning_rate": 1.1219680557913231e-07, + "loss": 0.2025, + "step": 8766 + }, + { + "epoch": 2.332889835018627, + "grad_norm": 0.2581539750099182, + "learning_rate": 1.1218004302700485e-07, + "loss": 0.1779, + "step": 8767 + }, + { + "epoch": 2.3331559340074506, + "grad_norm": 0.2724931240081787, + "learning_rate": 1.1216328012747784e-07, + "loss": 0.1784, + "step": 8768 + }, + { + "epoch": 2.3334220329962747, + "grad_norm": 0.33992740511894226, + "learning_rate": 1.1214651688102934e-07, + "loss": 0.1781, + "step": 8769 + }, + { + "epoch": 2.3336881319850984, + "grad_norm": 0.27167949080467224, + "learning_rate": 1.1212975328813749e-07, + "loss": 0.1644, + "step": 8770 + }, + { + "epoch": 2.333954230973922, + "grad_norm": 0.34406784176826477, + "learning_rate": 1.1211298934928043e-07, + "loss": 0.1786, + "step": 8771 + }, + { + "epoch": 2.334220329962746, + "grad_norm": 0.27094632387161255, + "learning_rate": 1.1209622506493632e-07, + "loss": 0.1756, + "step": 8772 + }, + { + "epoch": 2.33448642895157, + "grad_norm": 0.31152379512786865, + "learning_rate": 1.120794604355833e-07, + "loss": 0.1835, + "step": 8773 + }, + { + "epoch": 2.334752527940394, + "grad_norm": 0.2776585817337036, + "learning_rate": 1.1206269546169948e-07, + "loss": 0.1835, + "step": 8774 + }, + { + "epoch": 2.3350186269292177, + "grad_norm": 0.3385573625564575, + "learning_rate": 1.1204593014376311e-07, + "loss": 0.1971, + "step": 8775 + }, + { + "epoch": 2.3352847259180414, + "grad_norm": 0.2633352279663086, + "learning_rate": 1.1202916448225232e-07, + "loss": 0.1787, + "step": 8776 + }, + { + "epoch": 2.3355508249068655, + "grad_norm": 0.3064967095851898, + "learning_rate": 1.1201239847764535e-07, + "loss": 0.1992, + "step": 8777 + }, + { + "epoch": 2.335816923895689, + "grad_norm": 0.280228853225708, + "learning_rate": 1.1199563213042029e-07, + "loss": 0.184, + "step": 8778 + }, + { + "epoch": 2.336083022884513, + "grad_norm": 0.33282148838043213, + "learning_rate": 1.1197886544105548e-07, + "loss": 0.184, + "step": 8779 + }, + { + "epoch": 2.336349121873337, + "grad_norm": 0.27574387192726135, + "learning_rate": 1.1196209841002908e-07, + "loss": 0.1847, + "step": 8780 + }, + { + "epoch": 2.3366152208621607, + "grad_norm": 0.30617210268974304, + "learning_rate": 1.1194533103781932e-07, + "loss": 0.1894, + "step": 8781 + }, + { + "epoch": 2.336881319850985, + "grad_norm": 0.3126690983772278, + "learning_rate": 1.1192856332490443e-07, + "loss": 0.1893, + "step": 8782 + }, + { + "epoch": 2.3371474188398085, + "grad_norm": 0.29126888513565063, + "learning_rate": 1.1191179527176269e-07, + "loss": 0.1968, + "step": 8783 + }, + { + "epoch": 2.337413517828632, + "grad_norm": 0.2641806900501251, + "learning_rate": 1.1189502687887236e-07, + "loss": 0.1696, + "step": 8784 + }, + { + "epoch": 2.3376796168174563, + "grad_norm": 0.2688051164150238, + "learning_rate": 1.1187825814671164e-07, + "loss": 0.1731, + "step": 8785 + }, + { + "epoch": 2.33794571580628, + "grad_norm": 0.2541799545288086, + "learning_rate": 1.118614890757589e-07, + "loss": 0.1712, + "step": 8786 + }, + { + "epoch": 2.3382118147951036, + "grad_norm": 0.2763603925704956, + "learning_rate": 1.1184471966649235e-07, + "loss": 0.1802, + "step": 8787 + }, + { + "epoch": 2.3384779137839278, + "grad_norm": 0.29660987854003906, + "learning_rate": 1.1182794991939038e-07, + "loss": 0.1921, + "step": 8788 + }, + { + "epoch": 2.3387440127727515, + "grad_norm": 0.2801758050918579, + "learning_rate": 1.118111798349312e-07, + "loss": 0.172, + "step": 8789 + }, + { + "epoch": 2.339010111761575, + "grad_norm": 0.2752925753593445, + "learning_rate": 1.117944094135932e-07, + "loss": 0.1795, + "step": 8790 + }, + { + "epoch": 2.3392762107503993, + "grad_norm": 0.26510050892829895, + "learning_rate": 1.1177763865585465e-07, + "loss": 0.1756, + "step": 8791 + }, + { + "epoch": 2.339542309739223, + "grad_norm": 1.0958389043807983, + "learning_rate": 1.1176086756219392e-07, + "loss": 0.1953, + "step": 8792 + }, + { + "epoch": 2.3398084087280466, + "grad_norm": 0.4370810091495514, + "learning_rate": 1.1174409613308935e-07, + "loss": 0.1722, + "step": 8793 + }, + { + "epoch": 2.3400745077168708, + "grad_norm": 0.46446213126182556, + "learning_rate": 1.117273243690193e-07, + "loss": 0.1971, + "step": 8794 + }, + { + "epoch": 2.3403406067056944, + "grad_norm": 0.2677917778491974, + "learning_rate": 1.1171055227046214e-07, + "loss": 0.1689, + "step": 8795 + }, + { + "epoch": 2.340606705694518, + "grad_norm": 0.3901643753051758, + "learning_rate": 1.1169377983789619e-07, + "loss": 0.2036, + "step": 8796 + }, + { + "epoch": 2.3408728046833422, + "grad_norm": 0.2781224846839905, + "learning_rate": 1.116770070717999e-07, + "loss": 0.1763, + "step": 8797 + }, + { + "epoch": 2.341138903672166, + "grad_norm": 0.3060893416404724, + "learning_rate": 1.1166023397265165e-07, + "loss": 0.1954, + "step": 8798 + }, + { + "epoch": 2.34140500266099, + "grad_norm": 0.33937618136405945, + "learning_rate": 1.1164346054092982e-07, + "loss": 0.1913, + "step": 8799 + }, + { + "epoch": 2.3416711016498137, + "grad_norm": 0.2686882019042969, + "learning_rate": 1.1162668677711285e-07, + "loss": 0.1632, + "step": 8800 + }, + { + "epoch": 2.3419372006386374, + "grad_norm": 0.29626739025115967, + "learning_rate": 1.1160991268167914e-07, + "loss": 0.1718, + "step": 8801 + }, + { + "epoch": 2.3422032996274615, + "grad_norm": 0.322529137134552, + "learning_rate": 1.1159313825510712e-07, + "loss": 0.18, + "step": 8802 + }, + { + "epoch": 2.3424693986162852, + "grad_norm": 0.34626615047454834, + "learning_rate": 1.1157636349787522e-07, + "loss": 0.202, + "step": 8803 + }, + { + "epoch": 2.342735497605109, + "grad_norm": 0.2726106345653534, + "learning_rate": 1.1155958841046194e-07, + "loss": 0.186, + "step": 8804 + }, + { + "epoch": 2.343001596593933, + "grad_norm": 0.28677138686180115, + "learning_rate": 1.1154281299334569e-07, + "loss": 0.1746, + "step": 8805 + }, + { + "epoch": 2.3432676955827567, + "grad_norm": 0.42904624342918396, + "learning_rate": 1.1152603724700495e-07, + "loss": 0.1996, + "step": 8806 + }, + { + "epoch": 2.343533794571581, + "grad_norm": 0.3527756631374359, + "learning_rate": 1.1150926117191822e-07, + "loss": 0.1683, + "step": 8807 + }, + { + "epoch": 2.3437998935604045, + "grad_norm": 0.30435335636138916, + "learning_rate": 1.1149248476856399e-07, + "loss": 0.1765, + "step": 8808 + }, + { + "epoch": 2.344065992549228, + "grad_norm": 0.24697643518447876, + "learning_rate": 1.1147570803742072e-07, + "loss": 0.1617, + "step": 8809 + }, + { + "epoch": 2.3443320915380523, + "grad_norm": 0.266170471906662, + "learning_rate": 1.1145893097896695e-07, + "loss": 0.1625, + "step": 8810 + }, + { + "epoch": 2.344598190526876, + "grad_norm": 0.3296646475791931, + "learning_rate": 1.1144215359368117e-07, + "loss": 0.1737, + "step": 8811 + }, + { + "epoch": 2.3448642895156997, + "grad_norm": 0.36056339740753174, + "learning_rate": 1.1142537588204192e-07, + "loss": 0.188, + "step": 8812 + }, + { + "epoch": 2.345130388504524, + "grad_norm": 0.44867488741874695, + "learning_rate": 1.1140859784452774e-07, + "loss": 0.2106, + "step": 8813 + }, + { + "epoch": 2.3453964874933475, + "grad_norm": 0.35868459939956665, + "learning_rate": 1.1139181948161715e-07, + "loss": 0.1796, + "step": 8814 + }, + { + "epoch": 2.345662586482171, + "grad_norm": 0.3477521240711212, + "learning_rate": 1.1137504079378875e-07, + "loss": 0.1891, + "step": 8815 + }, + { + "epoch": 2.3459286854709953, + "grad_norm": 0.27602383494377136, + "learning_rate": 1.1135826178152105e-07, + "loss": 0.1762, + "step": 8816 + }, + { + "epoch": 2.346194784459819, + "grad_norm": 0.326239675283432, + "learning_rate": 1.1134148244529262e-07, + "loss": 0.2025, + "step": 8817 + }, + { + "epoch": 2.3464608834486427, + "grad_norm": 0.25947269797325134, + "learning_rate": 1.113247027855821e-07, + "loss": 0.1667, + "step": 8818 + }, + { + "epoch": 2.346726982437467, + "grad_norm": 0.2819582223892212, + "learning_rate": 1.1130792280286804e-07, + "loss": 0.1967, + "step": 8819 + }, + { + "epoch": 2.3469930814262905, + "grad_norm": 0.30959367752075195, + "learning_rate": 1.1129114249762904e-07, + "loss": 0.2073, + "step": 8820 + }, + { + "epoch": 2.3472591804151146, + "grad_norm": 0.270082950592041, + "learning_rate": 1.112743618703437e-07, + "loss": 0.1704, + "step": 8821 + }, + { + "epoch": 2.3475252794039383, + "grad_norm": 0.35077595710754395, + "learning_rate": 1.1125758092149065e-07, + "loss": 0.1774, + "step": 8822 + }, + { + "epoch": 2.347791378392762, + "grad_norm": 0.27428138256073, + "learning_rate": 1.1124079965154854e-07, + "loss": 0.1727, + "step": 8823 + }, + { + "epoch": 2.348057477381586, + "grad_norm": 0.2629137635231018, + "learning_rate": 1.1122401806099593e-07, + "loss": 0.1665, + "step": 8824 + }, + { + "epoch": 2.34832357637041, + "grad_norm": 0.2610733211040497, + "learning_rate": 1.1120723615031154e-07, + "loss": 0.1676, + "step": 8825 + }, + { + "epoch": 2.3485896753592335, + "grad_norm": 0.2771182358264923, + "learning_rate": 1.1119045391997402e-07, + "loss": 0.1693, + "step": 8826 + }, + { + "epoch": 2.3488557743480576, + "grad_norm": 0.3550879657268524, + "learning_rate": 1.1117367137046201e-07, + "loss": 0.1851, + "step": 8827 + }, + { + "epoch": 2.3491218733368813, + "grad_norm": 0.27591514587402344, + "learning_rate": 1.1115688850225418e-07, + "loss": 0.1704, + "step": 8828 + }, + { + "epoch": 2.3493879723257054, + "grad_norm": 0.25631964206695557, + "learning_rate": 1.111401053158292e-07, + "loss": 0.17, + "step": 8829 + }, + { + "epoch": 2.349654071314529, + "grad_norm": 0.38631927967071533, + "learning_rate": 1.1112332181166581e-07, + "loss": 0.1959, + "step": 8830 + }, + { + "epoch": 2.3499201703033528, + "grad_norm": 0.3743244409561157, + "learning_rate": 1.1110653799024268e-07, + "loss": 0.1963, + "step": 8831 + }, + { + "epoch": 2.350186269292177, + "grad_norm": 0.34325700998306274, + "learning_rate": 1.1108975385203848e-07, + "loss": 0.1877, + "step": 8832 + }, + { + "epoch": 2.3504523682810006, + "grad_norm": 0.27966633439064026, + "learning_rate": 1.1107296939753199e-07, + "loss": 0.1613, + "step": 8833 + }, + { + "epoch": 2.3507184672698243, + "grad_norm": 0.3245508372783661, + "learning_rate": 1.1105618462720192e-07, + "loss": 0.1761, + "step": 8834 + }, + { + "epoch": 2.3509845662586484, + "grad_norm": 0.3412483036518097, + "learning_rate": 1.11039399541527e-07, + "loss": 0.1875, + "step": 8835 + }, + { + "epoch": 2.351250665247472, + "grad_norm": 0.2719053030014038, + "learning_rate": 1.1102261414098592e-07, + "loss": 0.1816, + "step": 8836 + }, + { + "epoch": 2.3515167642362957, + "grad_norm": 0.29798808693885803, + "learning_rate": 1.1100582842605755e-07, + "loss": 0.1911, + "step": 8837 + }, + { + "epoch": 2.35178286322512, + "grad_norm": 0.25876909494400024, + "learning_rate": 1.1098904239722056e-07, + "loss": 0.1742, + "step": 8838 + }, + { + "epoch": 2.3520489622139436, + "grad_norm": 0.2718377411365509, + "learning_rate": 1.1097225605495372e-07, + "loss": 0.1723, + "step": 8839 + }, + { + "epoch": 2.3523150612027672, + "grad_norm": 0.3509431481361389, + "learning_rate": 1.109554693997359e-07, + "loss": 0.1903, + "step": 8840 + }, + { + "epoch": 2.3525811601915914, + "grad_norm": 0.3070494830608368, + "learning_rate": 1.109386824320458e-07, + "loss": 0.1889, + "step": 8841 + }, + { + "epoch": 2.352847259180415, + "grad_norm": 0.2640112638473511, + "learning_rate": 1.1092189515236223e-07, + "loss": 0.1683, + "step": 8842 + }, + { + "epoch": 2.3531133581692387, + "grad_norm": 0.374549925327301, + "learning_rate": 1.1090510756116403e-07, + "loss": 0.193, + "step": 8843 + }, + { + "epoch": 2.353379457158063, + "grad_norm": 0.3254058361053467, + "learning_rate": 1.1088831965893002e-07, + "loss": 0.1847, + "step": 8844 + }, + { + "epoch": 2.3536455561468865, + "grad_norm": 0.26631903648376465, + "learning_rate": 1.1087153144613898e-07, + "loss": 0.1784, + "step": 8845 + }, + { + "epoch": 2.3539116551357107, + "grad_norm": 0.2618035078048706, + "learning_rate": 1.1085474292326982e-07, + "loss": 0.1657, + "step": 8846 + }, + { + "epoch": 2.3541777541245343, + "grad_norm": 0.2821303606033325, + "learning_rate": 1.1083795409080128e-07, + "loss": 0.1871, + "step": 8847 + }, + { + "epoch": 2.354443853113358, + "grad_norm": 0.2952011227607727, + "learning_rate": 1.108211649492123e-07, + "loss": 0.1846, + "step": 8848 + }, + { + "epoch": 2.354709952102182, + "grad_norm": 0.33343368768692017, + "learning_rate": 1.108043754989817e-07, + "loss": 0.1812, + "step": 8849 + }, + { + "epoch": 2.354976051091006, + "grad_norm": 0.3587806224822998, + "learning_rate": 1.1078758574058833e-07, + "loss": 0.2015, + "step": 8850 + }, + { + "epoch": 2.3552421500798295, + "grad_norm": 0.3936798572540283, + "learning_rate": 1.1077079567451111e-07, + "loss": 0.1892, + "step": 8851 + }, + { + "epoch": 2.3555082490686536, + "grad_norm": 0.29595622420310974, + "learning_rate": 1.1075400530122892e-07, + "loss": 0.1868, + "step": 8852 + }, + { + "epoch": 2.3557743480574773, + "grad_norm": 0.26551806926727295, + "learning_rate": 1.1073721462122064e-07, + "loss": 0.1762, + "step": 8853 + }, + { + "epoch": 2.3560404470463014, + "grad_norm": 0.2609638273715973, + "learning_rate": 1.1072042363496516e-07, + "loss": 0.1734, + "step": 8854 + }, + { + "epoch": 2.356306546035125, + "grad_norm": 0.3230355381965637, + "learning_rate": 1.1070363234294144e-07, + "loss": 0.1907, + "step": 8855 + }, + { + "epoch": 2.356572645023949, + "grad_norm": 0.28604772686958313, + "learning_rate": 1.1068684074562838e-07, + "loss": 0.1748, + "step": 8856 + }, + { + "epoch": 2.356838744012773, + "grad_norm": 0.31770309805870056, + "learning_rate": 1.1067004884350486e-07, + "loss": 0.1907, + "step": 8857 + }, + { + "epoch": 2.3571048430015966, + "grad_norm": 0.3404553532600403, + "learning_rate": 1.106532566370499e-07, + "loss": 0.1927, + "step": 8858 + }, + { + "epoch": 2.3573709419904203, + "grad_norm": 0.2934116721153259, + "learning_rate": 1.1063646412674241e-07, + "loss": 0.1868, + "step": 8859 + }, + { + "epoch": 2.3576370409792444, + "grad_norm": 0.2610779404640198, + "learning_rate": 1.1061967131306133e-07, + "loss": 0.1594, + "step": 8860 + }, + { + "epoch": 2.357903139968068, + "grad_norm": 0.42196449637413025, + "learning_rate": 1.1060287819648566e-07, + "loss": 0.2098, + "step": 8861 + }, + { + "epoch": 2.358169238956892, + "grad_norm": 0.2674003839492798, + "learning_rate": 1.1058608477749436e-07, + "loss": 0.1726, + "step": 8862 + }, + { + "epoch": 2.358435337945716, + "grad_norm": 0.27657827734947205, + "learning_rate": 1.1056929105656641e-07, + "loss": 0.1855, + "step": 8863 + }, + { + "epoch": 2.3587014369345396, + "grad_norm": 0.2740563154220581, + "learning_rate": 1.105524970341808e-07, + "loss": 0.172, + "step": 8864 + }, + { + "epoch": 2.3589675359233633, + "grad_norm": 0.26548174023628235, + "learning_rate": 1.1053570271081652e-07, + "loss": 0.1746, + "step": 8865 + }, + { + "epoch": 2.3592336349121874, + "grad_norm": 0.25355616211891174, + "learning_rate": 1.1051890808695259e-07, + "loss": 0.1822, + "step": 8866 + }, + { + "epoch": 2.359499733901011, + "grad_norm": 1.0120439529418945, + "learning_rate": 1.1050211316306806e-07, + "loss": 0.1871, + "step": 8867 + }, + { + "epoch": 2.3597658328898348, + "grad_norm": 0.2644795775413513, + "learning_rate": 1.1048531793964186e-07, + "loss": 0.1642, + "step": 8868 + }, + { + "epoch": 2.360031931878659, + "grad_norm": 0.27585020661354065, + "learning_rate": 1.1046852241715315e-07, + "loss": 0.1741, + "step": 8869 + }, + { + "epoch": 2.3602980308674826, + "grad_norm": 0.27695223689079285, + "learning_rate": 1.1045172659608087e-07, + "loss": 0.1776, + "step": 8870 + }, + { + "epoch": 2.3605641298563067, + "grad_norm": 0.3662376403808594, + "learning_rate": 1.1043493047690413e-07, + "loss": 0.1706, + "step": 8871 + }, + { + "epoch": 2.3608302288451304, + "grad_norm": 0.26169613003730774, + "learning_rate": 1.1041813406010194e-07, + "loss": 0.1736, + "step": 8872 + }, + { + "epoch": 2.361096327833954, + "grad_norm": 0.36093804240226746, + "learning_rate": 1.1040133734615342e-07, + "loss": 0.1947, + "step": 8873 + }, + { + "epoch": 2.361362426822778, + "grad_norm": 1.038516640663147, + "learning_rate": 1.1038454033553761e-07, + "loss": 0.1648, + "step": 8874 + }, + { + "epoch": 2.361628525811602, + "grad_norm": 0.26320603489875793, + "learning_rate": 1.103677430287336e-07, + "loss": 0.1706, + "step": 8875 + }, + { + "epoch": 2.361894624800426, + "grad_norm": 0.26707568764686584, + "learning_rate": 1.103509454262205e-07, + "loss": 0.1889, + "step": 8876 + }, + { + "epoch": 2.3621607237892497, + "grad_norm": 0.27550557255744934, + "learning_rate": 1.103341475284774e-07, + "loss": 0.1839, + "step": 8877 + }, + { + "epoch": 2.3624268227780734, + "grad_norm": 0.32755327224731445, + "learning_rate": 1.1031734933598342e-07, + "loss": 0.1889, + "step": 8878 + }, + { + "epoch": 2.3626929217668975, + "grad_norm": 0.36863332986831665, + "learning_rate": 1.1030055084921768e-07, + "loss": 0.1758, + "step": 8879 + }, + { + "epoch": 2.362959020755721, + "grad_norm": 0.3464549779891968, + "learning_rate": 1.1028375206865926e-07, + "loss": 0.2052, + "step": 8880 + }, + { + "epoch": 2.363225119744545, + "grad_norm": 0.3758781850337982, + "learning_rate": 1.1026695299478736e-07, + "loss": 0.1865, + "step": 8881 + }, + { + "epoch": 2.363491218733369, + "grad_norm": 0.25165605545043945, + "learning_rate": 1.102501536280811e-07, + "loss": 0.1785, + "step": 8882 + }, + { + "epoch": 2.3637573177221927, + "grad_norm": 0.30479350686073303, + "learning_rate": 1.102333539690196e-07, + "loss": 0.1719, + "step": 8883 + }, + { + "epoch": 2.3640234167110163, + "grad_norm": 0.2605089545249939, + "learning_rate": 1.102165540180821e-07, + "loss": 0.1757, + "step": 8884 + }, + { + "epoch": 2.3642895156998405, + "grad_norm": 0.3805619776248932, + "learning_rate": 1.1019975377574768e-07, + "loss": 0.187, + "step": 8885 + }, + { + "epoch": 2.364555614688664, + "grad_norm": 0.35024264454841614, + "learning_rate": 1.1018295324249558e-07, + "loss": 0.1761, + "step": 8886 + }, + { + "epoch": 2.364821713677488, + "grad_norm": 0.3580847382545471, + "learning_rate": 1.1016615241880494e-07, + "loss": 0.1834, + "step": 8887 + }, + { + "epoch": 2.365087812666312, + "grad_norm": 0.31623873114585876, + "learning_rate": 1.1014935130515499e-07, + "loss": 0.1802, + "step": 8888 + }, + { + "epoch": 2.3653539116551356, + "grad_norm": 0.33294424414634705, + "learning_rate": 1.1013254990202492e-07, + "loss": 0.1887, + "step": 8889 + }, + { + "epoch": 2.3656200106439593, + "grad_norm": 0.2877526581287384, + "learning_rate": 1.1011574820989391e-07, + "loss": 0.1822, + "step": 8890 + }, + { + "epoch": 2.3658861096327835, + "grad_norm": 0.257635235786438, + "learning_rate": 1.1009894622924122e-07, + "loss": 0.1614, + "step": 8891 + }, + { + "epoch": 2.366152208621607, + "grad_norm": 0.31031548976898193, + "learning_rate": 1.1008214396054606e-07, + "loss": 0.1796, + "step": 8892 + }, + { + "epoch": 2.3664183076104313, + "grad_norm": 0.4285481870174408, + "learning_rate": 1.100653414042877e-07, + "loss": 0.1772, + "step": 8893 + }, + { + "epoch": 2.366684406599255, + "grad_norm": 0.31884944438934326, + "learning_rate": 1.1004853856094529e-07, + "loss": 0.1805, + "step": 8894 + }, + { + "epoch": 2.3669505055880786, + "grad_norm": 0.2814730405807495, + "learning_rate": 1.1003173543099818e-07, + "loss": 0.1945, + "step": 8895 + }, + { + "epoch": 2.3672166045769027, + "grad_norm": 0.27077922224998474, + "learning_rate": 1.1001493201492558e-07, + "loss": 0.1696, + "step": 8896 + }, + { + "epoch": 2.3674827035657264, + "grad_norm": 0.31527334451675415, + "learning_rate": 1.0999812831320676e-07, + "loss": 0.1861, + "step": 8897 + }, + { + "epoch": 2.36774880255455, + "grad_norm": 0.3550083637237549, + "learning_rate": 1.0998132432632099e-07, + "loss": 0.1736, + "step": 8898 + }, + { + "epoch": 2.3680149015433742, + "grad_norm": 0.2459515929222107, + "learning_rate": 1.0996452005474759e-07, + "loss": 0.1638, + "step": 8899 + }, + { + "epoch": 2.368281000532198, + "grad_norm": 0.29247474670410156, + "learning_rate": 1.0994771549896584e-07, + "loss": 0.1781, + "step": 8900 + }, + { + "epoch": 2.368547099521022, + "grad_norm": 0.26907864212989807, + "learning_rate": 1.0993091065945498e-07, + "loss": 0.1697, + "step": 8901 + }, + { + "epoch": 2.3688131985098457, + "grad_norm": 0.2834838926792145, + "learning_rate": 1.0991410553669442e-07, + "loss": 0.166, + "step": 8902 + }, + { + "epoch": 2.3690792974986694, + "grad_norm": 0.30778828263282776, + "learning_rate": 1.0989730013116343e-07, + "loss": 0.1742, + "step": 8903 + }, + { + "epoch": 2.3693453964874935, + "grad_norm": 0.28565871715545654, + "learning_rate": 1.0988049444334131e-07, + "loss": 0.1733, + "step": 8904 + }, + { + "epoch": 2.369611495476317, + "grad_norm": 0.33985912799835205, + "learning_rate": 1.0986368847370737e-07, + "loss": 0.1792, + "step": 8905 + }, + { + "epoch": 2.369877594465141, + "grad_norm": 0.26376423239707947, + "learning_rate": 1.0984688222274104e-07, + "loss": 0.1836, + "step": 8906 + }, + { + "epoch": 2.370143693453965, + "grad_norm": 0.37252166867256165, + "learning_rate": 1.0983007569092162e-07, + "loss": 0.2052, + "step": 8907 + }, + { + "epoch": 2.3704097924427887, + "grad_norm": 0.2796190679073334, + "learning_rate": 1.0981326887872842e-07, + "loss": 0.1649, + "step": 8908 + }, + { + "epoch": 2.3706758914316124, + "grad_norm": 0.2801375389099121, + "learning_rate": 1.0979646178664088e-07, + "loss": 0.193, + "step": 8909 + }, + { + "epoch": 2.3709419904204365, + "grad_norm": 0.25320738554000854, + "learning_rate": 1.0977965441513836e-07, + "loss": 0.1769, + "step": 8910 + }, + { + "epoch": 2.37120808940926, + "grad_norm": 0.44844767451286316, + "learning_rate": 1.0976284676470022e-07, + "loss": 0.2017, + "step": 8911 + }, + { + "epoch": 2.371474188398084, + "grad_norm": 0.29779568314552307, + "learning_rate": 1.0974603883580582e-07, + "loss": 0.168, + "step": 8912 + }, + { + "epoch": 2.371740287386908, + "grad_norm": 0.2655664086341858, + "learning_rate": 1.0972923062893459e-07, + "loss": 0.1669, + "step": 8913 + }, + { + "epoch": 2.3720063863757317, + "grad_norm": 0.27210721373558044, + "learning_rate": 1.0971242214456598e-07, + "loss": 0.1702, + "step": 8914 + }, + { + "epoch": 2.3722724853645554, + "grad_norm": 0.27522557973861694, + "learning_rate": 1.096956133831793e-07, + "loss": 0.1766, + "step": 8915 + }, + { + "epoch": 2.3725385843533795, + "grad_norm": 0.47741571068763733, + "learning_rate": 1.0967880434525403e-07, + "loss": 0.1821, + "step": 8916 + }, + { + "epoch": 2.372804683342203, + "grad_norm": 0.3454221785068512, + "learning_rate": 1.096619950312696e-07, + "loss": 0.204, + "step": 8917 + }, + { + "epoch": 2.3730707823310273, + "grad_norm": 0.45683375000953674, + "learning_rate": 1.0964518544170544e-07, + "loss": 0.1955, + "step": 8918 + }, + { + "epoch": 2.373336881319851, + "grad_norm": 0.2879493832588196, + "learning_rate": 1.0962837557704099e-07, + "loss": 0.1916, + "step": 8919 + }, + { + "epoch": 2.3736029803086747, + "grad_norm": 0.34327149391174316, + "learning_rate": 1.0961156543775572e-07, + "loss": 0.1886, + "step": 8920 + }, + { + "epoch": 2.373869079297499, + "grad_norm": 0.25536149740219116, + "learning_rate": 1.0959475502432907e-07, + "loss": 0.1807, + "step": 8921 + }, + { + "epoch": 2.3741351782863225, + "grad_norm": 0.36572718620300293, + "learning_rate": 1.095779443372405e-07, + "loss": 0.1899, + "step": 8922 + }, + { + "epoch": 2.374401277275146, + "grad_norm": 0.3143775463104248, + "learning_rate": 1.0956113337696949e-07, + "loss": 0.176, + "step": 8923 + }, + { + "epoch": 2.3746673762639703, + "grad_norm": 0.31198981404304504, + "learning_rate": 1.0954432214399552e-07, + "loss": 0.2069, + "step": 8924 + }, + { + "epoch": 2.374933475252794, + "grad_norm": 0.3098054528236389, + "learning_rate": 1.095275106387981e-07, + "loss": 0.166, + "step": 8925 + }, + { + "epoch": 2.375199574241618, + "grad_norm": 0.2738151252269745, + "learning_rate": 1.0951069886185671e-07, + "loss": 0.1769, + "step": 8926 + }, + { + "epoch": 2.3754656732304418, + "grad_norm": 0.3044242560863495, + "learning_rate": 1.0949388681365086e-07, + "loss": 0.1839, + "step": 8927 + }, + { + "epoch": 2.3757317722192655, + "grad_norm": 0.2823222577571869, + "learning_rate": 1.0947707449466008e-07, + "loss": 0.1791, + "step": 8928 + }, + { + "epoch": 2.3759978712080896, + "grad_norm": 0.28558143973350525, + "learning_rate": 1.0946026190536388e-07, + "loss": 0.1808, + "step": 8929 + }, + { + "epoch": 2.3762639701969133, + "grad_norm": 0.32473278045654297, + "learning_rate": 1.0944344904624176e-07, + "loss": 0.1841, + "step": 8930 + }, + { + "epoch": 2.376530069185737, + "grad_norm": 0.2850419580936432, + "learning_rate": 1.094266359177733e-07, + "loss": 0.1832, + "step": 8931 + }, + { + "epoch": 2.376796168174561, + "grad_norm": 0.2471289336681366, + "learning_rate": 1.0940982252043802e-07, + "loss": 0.1716, + "step": 8932 + }, + { + "epoch": 2.3770622671633848, + "grad_norm": 0.304907888174057, + "learning_rate": 1.0939300885471548e-07, + "loss": 0.1973, + "step": 8933 + }, + { + "epoch": 2.3773283661522084, + "grad_norm": 0.27504536509513855, + "learning_rate": 1.0937619492108523e-07, + "loss": 0.172, + "step": 8934 + }, + { + "epoch": 2.3775944651410326, + "grad_norm": 0.2754216194152832, + "learning_rate": 1.0935938072002686e-07, + "loss": 0.1744, + "step": 8935 + }, + { + "epoch": 2.3778605641298562, + "grad_norm": 0.378745973110199, + "learning_rate": 1.0934256625201994e-07, + "loss": 0.1946, + "step": 8936 + }, + { + "epoch": 2.37812666311868, + "grad_norm": 0.32221922278404236, + "learning_rate": 1.0932575151754406e-07, + "loss": 0.1827, + "step": 8937 + }, + { + "epoch": 2.378392762107504, + "grad_norm": 0.43157416582107544, + "learning_rate": 1.0930893651707875e-07, + "loss": 0.1699, + "step": 8938 + }, + { + "epoch": 2.3786588610963277, + "grad_norm": 0.3566056191921234, + "learning_rate": 1.092921212511037e-07, + "loss": 0.1877, + "step": 8939 + }, + { + "epoch": 2.378924960085152, + "grad_norm": 0.31074094772338867, + "learning_rate": 1.0927530572009844e-07, + "loss": 0.1769, + "step": 8940 + }, + { + "epoch": 2.3791910590739755, + "grad_norm": 0.3040381073951721, + "learning_rate": 1.092584899245426e-07, + "loss": 0.1784, + "step": 8941 + }, + { + "epoch": 2.3794571580627992, + "grad_norm": 0.2814216911792755, + "learning_rate": 1.0924167386491585e-07, + "loss": 0.1797, + "step": 8942 + }, + { + "epoch": 2.3797232570516234, + "grad_norm": 0.2762245833873749, + "learning_rate": 1.0922485754169779e-07, + "loss": 0.1784, + "step": 8943 + }, + { + "epoch": 2.379989356040447, + "grad_norm": 0.24835136532783508, + "learning_rate": 1.0920804095536799e-07, + "loss": 0.1658, + "step": 8944 + }, + { + "epoch": 2.3802554550292707, + "grad_norm": 0.3111089766025543, + "learning_rate": 1.091912241064062e-07, + "loss": 0.1782, + "step": 8945 + }, + { + "epoch": 2.380521554018095, + "grad_norm": 0.24634404480457306, + "learning_rate": 1.0917440699529201e-07, + "loss": 0.1778, + "step": 8946 + }, + { + "epoch": 2.3807876530069185, + "grad_norm": 0.3406279981136322, + "learning_rate": 1.091575896225051e-07, + "loss": 0.1758, + "step": 8947 + }, + { + "epoch": 2.3810537519957427, + "grad_norm": 0.26337963342666626, + "learning_rate": 1.0914077198852509e-07, + "loss": 0.1771, + "step": 8948 + }, + { + "epoch": 2.3813198509845663, + "grad_norm": 0.26425495743751526, + "learning_rate": 1.091239540938317e-07, + "loss": 0.18, + "step": 8949 + }, + { + "epoch": 2.38158594997339, + "grad_norm": 0.3046528100967407, + "learning_rate": 1.0910713593890461e-07, + "loss": 0.1719, + "step": 8950 + }, + { + "epoch": 2.381852048962214, + "grad_norm": 0.2710542678833008, + "learning_rate": 1.0909031752422348e-07, + "loss": 0.1768, + "step": 8951 + }, + { + "epoch": 2.382118147951038, + "grad_norm": 0.3148633539676666, + "learning_rate": 1.0907349885026802e-07, + "loss": 0.1988, + "step": 8952 + }, + { + "epoch": 2.3823842469398615, + "grad_norm": 0.34186047315597534, + "learning_rate": 1.0905667991751797e-07, + "loss": 0.1909, + "step": 8953 + }, + { + "epoch": 2.3826503459286856, + "grad_norm": 0.2758556604385376, + "learning_rate": 1.0903986072645299e-07, + "loss": 0.1865, + "step": 8954 + }, + { + "epoch": 2.3829164449175093, + "grad_norm": 0.30060628056526184, + "learning_rate": 1.0902304127755279e-07, + "loss": 0.2082, + "step": 8955 + }, + { + "epoch": 2.383182543906333, + "grad_norm": 0.2781277000904083, + "learning_rate": 1.0900622157129713e-07, + "loss": 0.1666, + "step": 8956 + }, + { + "epoch": 2.383448642895157, + "grad_norm": 0.30685368180274963, + "learning_rate": 1.0898940160816575e-07, + "loss": 0.1789, + "step": 8957 + }, + { + "epoch": 2.383714741883981, + "grad_norm": 0.32709792256355286, + "learning_rate": 1.0897258138863834e-07, + "loss": 0.1914, + "step": 8958 + }, + { + "epoch": 2.3839808408728045, + "grad_norm": 0.3489883542060852, + "learning_rate": 1.0895576091319467e-07, + "loss": 0.1703, + "step": 8959 + }, + { + "epoch": 2.3842469398616286, + "grad_norm": 0.2664475739002228, + "learning_rate": 1.0893894018231453e-07, + "loss": 0.1884, + "step": 8960 + }, + { + "epoch": 2.3845130388504523, + "grad_norm": 0.42507022619247437, + "learning_rate": 1.0892211919647765e-07, + "loss": 0.2087, + "step": 8961 + }, + { + "epoch": 2.384779137839276, + "grad_norm": 0.2644398510456085, + "learning_rate": 1.0890529795616379e-07, + "loss": 0.189, + "step": 8962 + }, + { + "epoch": 2.3850452368281, + "grad_norm": 0.32102856040000916, + "learning_rate": 1.088884764618527e-07, + "loss": 0.1982, + "step": 8963 + }, + { + "epoch": 2.385311335816924, + "grad_norm": 0.27622249722480774, + "learning_rate": 1.0887165471402426e-07, + "loss": 0.1862, + "step": 8964 + }, + { + "epoch": 2.385577434805748, + "grad_norm": 0.4092581272125244, + "learning_rate": 1.0885483271315816e-07, + "loss": 0.1889, + "step": 8965 + }, + { + "epoch": 2.3858435337945716, + "grad_norm": 0.44211727380752563, + "learning_rate": 1.0883801045973425e-07, + "loss": 0.1999, + "step": 8966 + }, + { + "epoch": 2.3861096327833953, + "grad_norm": 0.28477269411087036, + "learning_rate": 1.0882118795423232e-07, + "loss": 0.1914, + "step": 8967 + }, + { + "epoch": 2.3863757317722194, + "grad_norm": 0.3063855767250061, + "learning_rate": 1.0880436519713219e-07, + "loss": 0.1808, + "step": 8968 + }, + { + "epoch": 2.386641830761043, + "grad_norm": 0.3392769396305084, + "learning_rate": 1.0878754218891368e-07, + "loss": 0.1698, + "step": 8969 + }, + { + "epoch": 2.3869079297498668, + "grad_norm": 0.2517843246459961, + "learning_rate": 1.0877071893005656e-07, + "loss": 0.1713, + "step": 8970 + }, + { + "epoch": 2.387174028738691, + "grad_norm": 0.35026586055755615, + "learning_rate": 1.0875389542104077e-07, + "loss": 0.1843, + "step": 8971 + }, + { + "epoch": 2.3874401277275146, + "grad_norm": 0.2803477644920349, + "learning_rate": 1.0873707166234606e-07, + "loss": 0.1825, + "step": 8972 + }, + { + "epoch": 2.3877062267163387, + "grad_norm": 0.36178529262542725, + "learning_rate": 1.0872024765445234e-07, + "loss": 0.1775, + "step": 8973 + }, + { + "epoch": 2.3879723257051624, + "grad_norm": 0.30490490794181824, + "learning_rate": 1.087034233978394e-07, + "loss": 0.1862, + "step": 8974 + }, + { + "epoch": 2.388238424693986, + "grad_norm": 0.2893596589565277, + "learning_rate": 1.0868659889298717e-07, + "loss": 0.1884, + "step": 8975 + }, + { + "epoch": 2.38850452368281, + "grad_norm": 0.27979588508605957, + "learning_rate": 1.0866977414037548e-07, + "loss": 0.1807, + "step": 8976 + }, + { + "epoch": 2.388770622671634, + "grad_norm": 0.31290099024772644, + "learning_rate": 1.0865294914048421e-07, + "loss": 0.1797, + "step": 8977 + }, + { + "epoch": 2.3890367216604576, + "grad_norm": 0.362003356218338, + "learning_rate": 1.0863612389379326e-07, + "loss": 0.1809, + "step": 8978 + }, + { + "epoch": 2.3893028206492817, + "grad_norm": 0.3134409487247467, + "learning_rate": 1.086192984007825e-07, + "loss": 0.1747, + "step": 8979 + }, + { + "epoch": 2.3895689196381054, + "grad_norm": 0.2857058346271515, + "learning_rate": 1.0860247266193183e-07, + "loss": 0.1957, + "step": 8980 + }, + { + "epoch": 2.389835018626929, + "grad_norm": 0.4383140504360199, + "learning_rate": 1.0858564667772115e-07, + "loss": 0.2082, + "step": 8981 + }, + { + "epoch": 2.390101117615753, + "grad_norm": 0.28957319259643555, + "learning_rate": 1.0856882044863038e-07, + "loss": 0.1936, + "step": 8982 + }, + { + "epoch": 2.390367216604577, + "grad_norm": 0.4340592622756958, + "learning_rate": 1.0855199397513946e-07, + "loss": 0.1849, + "step": 8983 + }, + { + "epoch": 2.3906333155934005, + "grad_norm": 0.3109321594238281, + "learning_rate": 1.0853516725772827e-07, + "loss": 0.1983, + "step": 8984 + }, + { + "epoch": 2.3908994145822247, + "grad_norm": 0.27428051829338074, + "learning_rate": 1.0851834029687678e-07, + "loss": 0.1732, + "step": 8985 + }, + { + "epoch": 2.3911655135710483, + "grad_norm": 0.2721122205257416, + "learning_rate": 1.0850151309306492e-07, + "loss": 0.1833, + "step": 8986 + }, + { + "epoch": 2.3914316125598725, + "grad_norm": 0.3237120807170868, + "learning_rate": 1.0848468564677262e-07, + "loss": 0.188, + "step": 8987 + }, + { + "epoch": 2.391697711548696, + "grad_norm": 0.28450390696525574, + "learning_rate": 1.0846785795847981e-07, + "loss": 0.1682, + "step": 8988 + }, + { + "epoch": 2.39196381053752, + "grad_norm": 0.2694593369960785, + "learning_rate": 1.0845103002866653e-07, + "loss": 0.1728, + "step": 8989 + }, + { + "epoch": 2.392229909526344, + "grad_norm": 0.2866186797618866, + "learning_rate": 1.0843420185781269e-07, + "loss": 0.1943, + "step": 8990 + }, + { + "epoch": 2.3924960085151676, + "grad_norm": 0.40048184990882874, + "learning_rate": 1.084173734463983e-07, + "loss": 0.1808, + "step": 8991 + }, + { + "epoch": 2.3927621075039913, + "grad_norm": 0.35915255546569824, + "learning_rate": 1.0840054479490326e-07, + "loss": 0.1841, + "step": 8992 + }, + { + "epoch": 2.3930282064928154, + "grad_norm": 0.35461631417274475, + "learning_rate": 1.0838371590380763e-07, + "loss": 0.186, + "step": 8993 + }, + { + "epoch": 2.393294305481639, + "grad_norm": 0.29031902551651, + "learning_rate": 1.0836688677359142e-07, + "loss": 0.1747, + "step": 8994 + }, + { + "epoch": 2.3935604044704633, + "grad_norm": 0.3510974943637848, + "learning_rate": 1.0835005740473455e-07, + "loss": 0.1785, + "step": 8995 + }, + { + "epoch": 2.393826503459287, + "grad_norm": 0.2668175995349884, + "learning_rate": 1.083332277977171e-07, + "loss": 0.194, + "step": 8996 + }, + { + "epoch": 2.3940926024481106, + "grad_norm": 0.40308475494384766, + "learning_rate": 1.0831639795301906e-07, + "loss": 0.196, + "step": 8997 + }, + { + "epoch": 2.3943587014369347, + "grad_norm": 0.4038389027118683, + "learning_rate": 1.0829956787112046e-07, + "loss": 0.1781, + "step": 8998 + }, + { + "epoch": 2.3946248004257584, + "grad_norm": 0.3432294428348541, + "learning_rate": 1.0828273755250128e-07, + "loss": 0.2001, + "step": 8999 + }, + { + "epoch": 2.394890899414582, + "grad_norm": 0.34779247641563416, + "learning_rate": 1.0826590699764164e-07, + "loss": 0.1788, + "step": 9000 + }, + { + "epoch": 2.3951569984034062, + "grad_norm": 0.37768933176994324, + "learning_rate": 1.082490762070215e-07, + "loss": 0.1832, + "step": 9001 + }, + { + "epoch": 2.39542309739223, + "grad_norm": 0.4004638195037842, + "learning_rate": 1.0823224518112095e-07, + "loss": 0.1982, + "step": 9002 + }, + { + "epoch": 2.3956891963810536, + "grad_norm": 0.26082101464271545, + "learning_rate": 1.0821541392042006e-07, + "loss": 0.1753, + "step": 9003 + }, + { + "epoch": 2.3959552953698777, + "grad_norm": 0.3374905288219452, + "learning_rate": 1.0819858242539885e-07, + "loss": 0.1955, + "step": 9004 + }, + { + "epoch": 2.3962213943587014, + "grad_norm": 0.2908842861652374, + "learning_rate": 1.0818175069653743e-07, + "loss": 0.1945, + "step": 9005 + }, + { + "epoch": 2.396487493347525, + "grad_norm": 0.27223050594329834, + "learning_rate": 1.0816491873431583e-07, + "loss": 0.1742, + "step": 9006 + }, + { + "epoch": 2.396753592336349, + "grad_norm": 0.2769252359867096, + "learning_rate": 1.0814808653921415e-07, + "loss": 0.1708, + "step": 9007 + }, + { + "epoch": 2.397019691325173, + "grad_norm": 0.29190871119499207, + "learning_rate": 1.0813125411171251e-07, + "loss": 0.1684, + "step": 9008 + }, + { + "epoch": 2.3972857903139966, + "grad_norm": 0.2768833339214325, + "learning_rate": 1.0811442145229097e-07, + "loss": 0.1813, + "step": 9009 + }, + { + "epoch": 2.3975518893028207, + "grad_norm": 0.2889501750469208, + "learning_rate": 1.0809758856142963e-07, + "loss": 0.1784, + "step": 9010 + }, + { + "epoch": 2.3978179882916444, + "grad_norm": 0.2963598072528839, + "learning_rate": 1.0808075543960862e-07, + "loss": 0.1845, + "step": 9011 + }, + { + "epoch": 2.3980840872804685, + "grad_norm": 0.2656811773777008, + "learning_rate": 1.0806392208730806e-07, + "loss": 0.1676, + "step": 9012 + }, + { + "epoch": 2.398350186269292, + "grad_norm": 0.30551859736442566, + "learning_rate": 1.0804708850500803e-07, + "loss": 0.1925, + "step": 9013 + }, + { + "epoch": 2.398616285258116, + "grad_norm": 0.4524086117744446, + "learning_rate": 1.080302546931887e-07, + "loss": 0.1956, + "step": 9014 + }, + { + "epoch": 2.39888238424694, + "grad_norm": 0.28540733456611633, + "learning_rate": 1.0801342065233017e-07, + "loss": 0.1825, + "step": 9015 + }, + { + "epoch": 2.3991484832357637, + "grad_norm": 0.28018298745155334, + "learning_rate": 1.0799658638291263e-07, + "loss": 0.1821, + "step": 9016 + }, + { + "epoch": 2.3994145822245874, + "grad_norm": 0.28310999274253845, + "learning_rate": 1.0797975188541615e-07, + "loss": 0.1793, + "step": 9017 + }, + { + "epoch": 2.3996806812134115, + "grad_norm": 0.2713719606399536, + "learning_rate": 1.0796291716032096e-07, + "loss": 0.1808, + "step": 9018 + }, + { + "epoch": 2.399946780202235, + "grad_norm": 0.3838622272014618, + "learning_rate": 1.0794608220810719e-07, + "loss": 0.1937, + "step": 9019 + }, + { + "epoch": 2.4002128791910593, + "grad_norm": 0.2653614580631256, + "learning_rate": 1.07929247029255e-07, + "loss": 0.1769, + "step": 9020 + }, + { + "epoch": 2.400478978179883, + "grad_norm": 0.3114367425441742, + "learning_rate": 1.0791241162424458e-07, + "loss": 0.1931, + "step": 9021 + }, + { + "epoch": 2.4007450771687067, + "grad_norm": 0.2957669198513031, + "learning_rate": 1.078955759935561e-07, + "loss": 0.1962, + "step": 9022 + }, + { + "epoch": 2.401011176157531, + "grad_norm": 0.28209996223449707, + "learning_rate": 1.0787874013766975e-07, + "loss": 0.1853, + "step": 9023 + }, + { + "epoch": 2.4012772751463545, + "grad_norm": 0.3134882152080536, + "learning_rate": 1.078619040570657e-07, + "loss": 0.181, + "step": 9024 + }, + { + "epoch": 2.401543374135178, + "grad_norm": 0.27229025959968567, + "learning_rate": 1.0784506775222417e-07, + "loss": 0.1737, + "step": 9025 + }, + { + "epoch": 2.4018094731240023, + "grad_norm": 0.6258665323257446, + "learning_rate": 1.0782823122362539e-07, + "loss": 0.1864, + "step": 9026 + }, + { + "epoch": 2.402075572112826, + "grad_norm": 0.3330075740814209, + "learning_rate": 1.0781139447174956e-07, + "loss": 0.1766, + "step": 9027 + }, + { + "epoch": 2.4023416711016496, + "grad_norm": 0.4453677535057068, + "learning_rate": 1.0779455749707684e-07, + "loss": 0.1917, + "step": 9028 + }, + { + "epoch": 2.4026077700904738, + "grad_norm": 0.2948370575904846, + "learning_rate": 1.0777772030008754e-07, + "loss": 0.1775, + "step": 9029 + }, + { + "epoch": 2.4028738690792975, + "grad_norm": 0.2844383418560028, + "learning_rate": 1.0776088288126186e-07, + "loss": 0.1788, + "step": 9030 + }, + { + "epoch": 2.403139968068121, + "grad_norm": 0.2793814539909363, + "learning_rate": 1.0774404524108001e-07, + "loss": 0.1838, + "step": 9031 + }, + { + "epoch": 2.4034060670569453, + "grad_norm": 0.31540900468826294, + "learning_rate": 1.0772720738002224e-07, + "loss": 0.1736, + "step": 9032 + }, + { + "epoch": 2.403672166045769, + "grad_norm": 0.27868780493736267, + "learning_rate": 1.0771036929856882e-07, + "loss": 0.176, + "step": 9033 + }, + { + "epoch": 2.4039382650345926, + "grad_norm": 0.23903238773345947, + "learning_rate": 1.0769353099720002e-07, + "loss": 0.1551, + "step": 9034 + }, + { + "epoch": 2.4042043640234168, + "grad_norm": 0.32880741357803345, + "learning_rate": 1.0767669247639605e-07, + "loss": 0.2015, + "step": 9035 + }, + { + "epoch": 2.4044704630122404, + "grad_norm": 3.299130439758301, + "learning_rate": 1.0765985373663725e-07, + "loss": 0.2002, + "step": 9036 + }, + { + "epoch": 2.4047365620010646, + "grad_norm": 0.373521625995636, + "learning_rate": 1.0764301477840383e-07, + "loss": 0.2067, + "step": 9037 + }, + { + "epoch": 2.4050026609898882, + "grad_norm": 0.29202648997306824, + "learning_rate": 1.076261756021761e-07, + "loss": 0.1769, + "step": 9038 + }, + { + "epoch": 2.405268759978712, + "grad_norm": 0.2800842821598053, + "learning_rate": 1.0760933620843433e-07, + "loss": 0.1798, + "step": 9039 + }, + { + "epoch": 2.405534858967536, + "grad_norm": 0.2722086012363434, + "learning_rate": 1.0759249659765885e-07, + "loss": 0.165, + "step": 9040 + }, + { + "epoch": 2.4058009579563597, + "grad_norm": 0.3197363615036011, + "learning_rate": 1.0757565677032994e-07, + "loss": 0.1835, + "step": 9041 + }, + { + "epoch": 2.406067056945184, + "grad_norm": 0.31292086839675903, + "learning_rate": 1.0755881672692788e-07, + "loss": 0.1891, + "step": 9042 + }, + { + "epoch": 2.4063331559340075, + "grad_norm": 0.2706109583377838, + "learning_rate": 1.0754197646793302e-07, + "loss": 0.1883, + "step": 9043 + }, + { + "epoch": 2.406599254922831, + "grad_norm": 0.3170577585697174, + "learning_rate": 1.0752513599382567e-07, + "loss": 0.1625, + "step": 9044 + }, + { + "epoch": 2.4068653539116553, + "grad_norm": 0.5064786076545715, + "learning_rate": 1.0750829530508616e-07, + "loss": 0.1937, + "step": 9045 + }, + { + "epoch": 2.407131452900479, + "grad_norm": 0.36702361702919006, + "learning_rate": 1.0749145440219479e-07, + "loss": 0.1843, + "step": 9046 + }, + { + "epoch": 2.4073975518893027, + "grad_norm": 0.27878063917160034, + "learning_rate": 1.0747461328563192e-07, + "loss": 0.1879, + "step": 9047 + }, + { + "epoch": 2.407663650878127, + "grad_norm": 0.2723178565502167, + "learning_rate": 1.0745777195587791e-07, + "loss": 0.1721, + "step": 9048 + }, + { + "epoch": 2.4079297498669505, + "grad_norm": 0.3812156915664673, + "learning_rate": 1.0744093041341306e-07, + "loss": 0.1907, + "step": 9049 + }, + { + "epoch": 2.408195848855774, + "grad_norm": 0.3628903031349182, + "learning_rate": 1.0742408865871777e-07, + "loss": 0.1912, + "step": 9050 + }, + { + "epoch": 2.4084619478445983, + "grad_norm": 0.2843943238258362, + "learning_rate": 1.0740724669227239e-07, + "loss": 0.1865, + "step": 9051 + }, + { + "epoch": 2.408728046833422, + "grad_norm": 0.3018209934234619, + "learning_rate": 1.0739040451455728e-07, + "loss": 0.1711, + "step": 9052 + }, + { + "epoch": 2.4089941458222457, + "grad_norm": 0.415920227766037, + "learning_rate": 1.073735621260528e-07, + "loss": 0.1872, + "step": 9053 + }, + { + "epoch": 2.40926024481107, + "grad_norm": 0.36719855666160583, + "learning_rate": 1.0735671952723935e-07, + "loss": 0.195, + "step": 9054 + }, + { + "epoch": 2.4095263437998935, + "grad_norm": 0.26126599311828613, + "learning_rate": 1.0733987671859732e-07, + "loss": 0.189, + "step": 9055 + }, + { + "epoch": 2.409792442788717, + "grad_norm": 0.35965171456336975, + "learning_rate": 1.0732303370060709e-07, + "loss": 0.1913, + "step": 9056 + }, + { + "epoch": 2.4100585417775413, + "grad_norm": 0.3342043161392212, + "learning_rate": 1.0730619047374904e-07, + "loss": 0.192, + "step": 9057 + }, + { + "epoch": 2.410324640766365, + "grad_norm": 0.33450326323509216, + "learning_rate": 1.0728934703850361e-07, + "loss": 0.1846, + "step": 9058 + }, + { + "epoch": 2.410590739755189, + "grad_norm": 0.38361918926239014, + "learning_rate": 1.0727250339535119e-07, + "loss": 0.1839, + "step": 9059 + }, + { + "epoch": 2.410856838744013, + "grad_norm": 0.34413543343544006, + "learning_rate": 1.0725565954477218e-07, + "loss": 0.176, + "step": 9060 + }, + { + "epoch": 2.4111229377328365, + "grad_norm": 0.2652321457862854, + "learning_rate": 1.07238815487247e-07, + "loss": 0.1687, + "step": 9061 + }, + { + "epoch": 2.4113890367216606, + "grad_norm": 0.33014941215515137, + "learning_rate": 1.0722197122325613e-07, + "loss": 0.1896, + "step": 9062 + }, + { + "epoch": 2.4116551357104843, + "grad_norm": 0.28239870071411133, + "learning_rate": 1.0720512675327995e-07, + "loss": 0.1752, + "step": 9063 + }, + { + "epoch": 2.411921234699308, + "grad_norm": 0.26794302463531494, + "learning_rate": 1.0718828207779893e-07, + "loss": 0.1692, + "step": 9064 + }, + { + "epoch": 2.412187333688132, + "grad_norm": 0.28505054116249084, + "learning_rate": 1.0717143719729347e-07, + "loss": 0.1943, + "step": 9065 + }, + { + "epoch": 2.4124534326769558, + "grad_norm": 0.2878556549549103, + "learning_rate": 1.0715459211224407e-07, + "loss": 0.1733, + "step": 9066 + }, + { + "epoch": 2.41271953166578, + "grad_norm": 0.2753622829914093, + "learning_rate": 1.0713774682313114e-07, + "loss": 0.1746, + "step": 9067 + }, + { + "epoch": 2.4129856306546036, + "grad_norm": 0.277876615524292, + "learning_rate": 1.0712090133043518e-07, + "loss": 0.1685, + "step": 9068 + }, + { + "epoch": 2.4132517296434273, + "grad_norm": 0.25625649094581604, + "learning_rate": 1.0710405563463665e-07, + "loss": 0.1839, + "step": 9069 + }, + { + "epoch": 2.4135178286322514, + "grad_norm": 0.2556103765964508, + "learning_rate": 1.0708720973621601e-07, + "loss": 0.1721, + "step": 9070 + }, + { + "epoch": 2.413783927621075, + "grad_norm": 0.279752641916275, + "learning_rate": 1.0707036363565374e-07, + "loss": 0.1831, + "step": 9071 + }, + { + "epoch": 2.4140500266098988, + "grad_norm": 0.36261388659477234, + "learning_rate": 1.0705351733343036e-07, + "loss": 0.1754, + "step": 9072 + }, + { + "epoch": 2.414316125598723, + "grad_norm": 0.31493133306503296, + "learning_rate": 1.070366708300263e-07, + "loss": 0.1915, + "step": 9073 + }, + { + "epoch": 2.4145822245875466, + "grad_norm": 0.355320543050766, + "learning_rate": 1.0701982412592213e-07, + "loss": 0.178, + "step": 9074 + }, + { + "epoch": 2.4148483235763702, + "grad_norm": 0.2686184346675873, + "learning_rate": 1.0700297722159825e-07, + "loss": 0.1814, + "step": 9075 + }, + { + "epoch": 2.4151144225651944, + "grad_norm": 0.4864693880081177, + "learning_rate": 1.0698613011753526e-07, + "loss": 0.2083, + "step": 9076 + }, + { + "epoch": 2.415380521554018, + "grad_norm": 0.31188252568244934, + "learning_rate": 1.0696928281421365e-07, + "loss": 0.1941, + "step": 9077 + }, + { + "epoch": 2.4156466205428417, + "grad_norm": 0.32694804668426514, + "learning_rate": 1.0695243531211391e-07, + "loss": 0.1991, + "step": 9078 + }, + { + "epoch": 2.415912719531666, + "grad_norm": 0.28685450553894043, + "learning_rate": 1.0693558761171659e-07, + "loss": 0.1728, + "step": 9079 + }, + { + "epoch": 2.4161788185204895, + "grad_norm": 0.3029080033302307, + "learning_rate": 1.0691873971350223e-07, + "loss": 0.1783, + "step": 9080 + }, + { + "epoch": 2.4164449175093132, + "grad_norm": 0.2819245159626007, + "learning_rate": 1.0690189161795135e-07, + "loss": 0.1759, + "step": 9081 + }, + { + "epoch": 2.4167110164981374, + "grad_norm": 0.2599683701992035, + "learning_rate": 1.068850433255445e-07, + "loss": 0.1632, + "step": 9082 + }, + { + "epoch": 2.416977115486961, + "grad_norm": 0.26019254326820374, + "learning_rate": 1.0686819483676224e-07, + "loss": 0.1845, + "step": 9083 + }, + { + "epoch": 2.417243214475785, + "grad_norm": 0.3550504148006439, + "learning_rate": 1.0685134615208509e-07, + "loss": 0.187, + "step": 9084 + }, + { + "epoch": 2.417509313464609, + "grad_norm": 0.25186216831207275, + "learning_rate": 1.0683449727199363e-07, + "loss": 0.1586, + "step": 9085 + }, + { + "epoch": 2.4177754124534325, + "grad_norm": 0.2862148880958557, + "learning_rate": 1.0681764819696838e-07, + "loss": 0.1853, + "step": 9086 + }, + { + "epoch": 2.4180415114422567, + "grad_norm": 0.27940914034843445, + "learning_rate": 1.0680079892748999e-07, + "loss": 0.1731, + "step": 9087 + }, + { + "epoch": 2.4183076104310803, + "grad_norm": 0.29574957489967346, + "learning_rate": 1.0678394946403902e-07, + "loss": 0.1706, + "step": 9088 + }, + { + "epoch": 2.418573709419904, + "grad_norm": 0.27323901653289795, + "learning_rate": 1.0676709980709597e-07, + "loss": 0.1848, + "step": 9089 + }, + { + "epoch": 2.418839808408728, + "grad_norm": 0.36539632081985474, + "learning_rate": 1.067502499571415e-07, + "loss": 0.1898, + "step": 9090 + }, + { + "epoch": 2.419105907397552, + "grad_norm": 0.2847062647342682, + "learning_rate": 1.0673339991465619e-07, + "loss": 0.1752, + "step": 9091 + }, + { + "epoch": 2.419372006386376, + "grad_norm": 0.31525757908821106, + "learning_rate": 1.0671654968012065e-07, + "loss": 0.1888, + "step": 9092 + }, + { + "epoch": 2.4196381053751996, + "grad_norm": 0.307487815618515, + "learning_rate": 1.0669969925401543e-07, + "loss": 0.1922, + "step": 9093 + }, + { + "epoch": 2.4199042043640233, + "grad_norm": 0.2655682861804962, + "learning_rate": 1.0668284863682118e-07, + "loss": 0.1749, + "step": 9094 + }, + { + "epoch": 2.4201703033528474, + "grad_norm": 0.26277947425842285, + "learning_rate": 1.0666599782901853e-07, + "loss": 0.1903, + "step": 9095 + }, + { + "epoch": 2.420436402341671, + "grad_norm": 0.40850701928138733, + "learning_rate": 1.0664914683108806e-07, + "loss": 0.1963, + "step": 9096 + }, + { + "epoch": 2.420702501330495, + "grad_norm": 0.2938707768917084, + "learning_rate": 1.066322956435104e-07, + "loss": 0.1935, + "step": 9097 + }, + { + "epoch": 2.420968600319319, + "grad_norm": 0.2555403709411621, + "learning_rate": 1.0661544426676619e-07, + "loss": 0.1654, + "step": 9098 + }, + { + "epoch": 2.4212346993081426, + "grad_norm": 0.44625213742256165, + "learning_rate": 1.0659859270133608e-07, + "loss": 0.2086, + "step": 9099 + }, + { + "epoch": 2.4215007982969663, + "grad_norm": 0.34020596742630005, + "learning_rate": 1.065817409477007e-07, + "loss": 0.1957, + "step": 9100 + }, + { + "epoch": 2.4217668972857904, + "grad_norm": 0.4461599886417389, + "learning_rate": 1.0656488900634069e-07, + "loss": 0.1982, + "step": 9101 + }, + { + "epoch": 2.422032996274614, + "grad_norm": 0.3469967842102051, + "learning_rate": 1.0654803687773671e-07, + "loss": 0.1882, + "step": 9102 + }, + { + "epoch": 2.422299095263438, + "grad_norm": 0.4149291217327118, + "learning_rate": 1.065311845623694e-07, + "loss": 0.1903, + "step": 9103 + }, + { + "epoch": 2.422565194252262, + "grad_norm": 0.2752256691455841, + "learning_rate": 1.0651433206071942e-07, + "loss": 0.1951, + "step": 9104 + }, + { + "epoch": 2.4228312932410856, + "grad_norm": 0.3490048348903656, + "learning_rate": 1.0649747937326747e-07, + "loss": 0.1946, + "step": 9105 + }, + { + "epoch": 2.4230973922299097, + "grad_norm": 0.5810169577598572, + "learning_rate": 1.0648062650049422e-07, + "loss": 0.1677, + "step": 9106 + }, + { + "epoch": 2.4233634912187334, + "grad_norm": 0.26807713508605957, + "learning_rate": 1.0646377344288034e-07, + "loss": 0.162, + "step": 9107 + }, + { + "epoch": 2.423629590207557, + "grad_norm": 0.2833516299724579, + "learning_rate": 1.0644692020090646e-07, + "loss": 0.1919, + "step": 9108 + }, + { + "epoch": 2.423895689196381, + "grad_norm": 0.25080087780952454, + "learning_rate": 1.0643006677505335e-07, + "loss": 0.1585, + "step": 9109 + }, + { + "epoch": 2.424161788185205, + "grad_norm": 0.27312225103378296, + "learning_rate": 1.0641321316580168e-07, + "loss": 0.1725, + "step": 9110 + }, + { + "epoch": 2.4244278871740286, + "grad_norm": 0.24187074601650238, + "learning_rate": 1.063963593736321e-07, + "loss": 0.1597, + "step": 9111 + }, + { + "epoch": 2.4246939861628527, + "grad_norm": 0.26212167739868164, + "learning_rate": 1.0637950539902536e-07, + "loss": 0.1679, + "step": 9112 + }, + { + "epoch": 2.4249600851516764, + "grad_norm": 0.3723462224006653, + "learning_rate": 1.0636265124246218e-07, + "loss": 0.1831, + "step": 9113 + }, + { + "epoch": 2.4252261841405005, + "grad_norm": 0.4144590497016907, + "learning_rate": 1.0634579690442327e-07, + "loss": 0.1931, + "step": 9114 + }, + { + "epoch": 2.425492283129324, + "grad_norm": 0.37669849395751953, + "learning_rate": 1.063289423853893e-07, + "loss": 0.1952, + "step": 9115 + }, + { + "epoch": 2.425758382118148, + "grad_norm": 0.2556971311569214, + "learning_rate": 1.0631208768584103e-07, + "loss": 0.1662, + "step": 9116 + }, + { + "epoch": 2.426024481106972, + "grad_norm": 0.29608023166656494, + "learning_rate": 1.0629523280625921e-07, + "loss": 0.1661, + "step": 9117 + }, + { + "epoch": 2.4262905800957957, + "grad_norm": 0.2728145122528076, + "learning_rate": 1.0627837774712456e-07, + "loss": 0.1837, + "step": 9118 + }, + { + "epoch": 2.4265566790846194, + "grad_norm": 0.35325339436531067, + "learning_rate": 1.0626152250891782e-07, + "loss": 0.1849, + "step": 9119 + }, + { + "epoch": 2.4268227780734435, + "grad_norm": 0.26948535442352295, + "learning_rate": 1.0624466709211972e-07, + "loss": 0.1759, + "step": 9120 + }, + { + "epoch": 2.427088877062267, + "grad_norm": 0.29510414600372314, + "learning_rate": 1.0622781149721105e-07, + "loss": 0.2, + "step": 9121 + }, + { + "epoch": 2.427354976051091, + "grad_norm": 0.4177214503288269, + "learning_rate": 1.0621095572467252e-07, + "loss": 0.1946, + "step": 9122 + }, + { + "epoch": 2.427621075039915, + "grad_norm": 0.355039119720459, + "learning_rate": 1.0619409977498493e-07, + "loss": 0.1857, + "step": 9123 + }, + { + "epoch": 2.4278871740287387, + "grad_norm": 0.32466384768486023, + "learning_rate": 1.0617724364862901e-07, + "loss": 0.1786, + "step": 9124 + }, + { + "epoch": 2.4281532730175623, + "grad_norm": 0.2801082134246826, + "learning_rate": 1.0616038734608557e-07, + "loss": 0.1903, + "step": 9125 + }, + { + "epoch": 2.4284193720063865, + "grad_norm": 0.3395731449127197, + "learning_rate": 1.0614353086783534e-07, + "loss": 0.1775, + "step": 9126 + }, + { + "epoch": 2.42868547099521, + "grad_norm": 0.32037588953971863, + "learning_rate": 1.0612667421435912e-07, + "loss": 0.1645, + "step": 9127 + }, + { + "epoch": 2.428951569984034, + "grad_norm": 0.33219781517982483, + "learning_rate": 1.0610981738613773e-07, + "loss": 0.1786, + "step": 9128 + }, + { + "epoch": 2.429217668972858, + "grad_norm": 0.27310317754745483, + "learning_rate": 1.060929603836519e-07, + "loss": 0.1822, + "step": 9129 + }, + { + "epoch": 2.4294837679616816, + "grad_norm": 0.3821481764316559, + "learning_rate": 1.060761032073825e-07, + "loss": 0.1904, + "step": 9130 + }, + { + "epoch": 2.4297498669505058, + "grad_norm": 0.30219289660453796, + "learning_rate": 1.0605924585781025e-07, + "loss": 0.1907, + "step": 9131 + }, + { + "epoch": 2.4300159659393294, + "grad_norm": 0.3077714145183563, + "learning_rate": 1.0604238833541603e-07, + "loss": 0.1854, + "step": 9132 + }, + { + "epoch": 2.430282064928153, + "grad_norm": 0.3527337610721588, + "learning_rate": 1.0602553064068058e-07, + "loss": 0.1893, + "step": 9133 + }, + { + "epoch": 2.4305481639169773, + "grad_norm": 0.3667123317718506, + "learning_rate": 1.0600867277408476e-07, + "loss": 0.183, + "step": 9134 + }, + { + "epoch": 2.430814262905801, + "grad_norm": 0.3473729193210602, + "learning_rate": 1.0599181473610939e-07, + "loss": 0.1779, + "step": 9135 + }, + { + "epoch": 2.4310803618946246, + "grad_norm": 0.29169464111328125, + "learning_rate": 1.059749565272353e-07, + "loss": 0.1817, + "step": 9136 + }, + { + "epoch": 2.4313464608834487, + "grad_norm": 0.364149808883667, + "learning_rate": 1.0595809814794328e-07, + "loss": 0.1815, + "step": 9137 + }, + { + "epoch": 2.4316125598722724, + "grad_norm": 0.25892361998558044, + "learning_rate": 1.0594123959871422e-07, + "loss": 0.1652, + "step": 9138 + }, + { + "epoch": 2.4318786588610966, + "grad_norm": 0.3411307632923126, + "learning_rate": 1.0592438088002894e-07, + "loss": 0.176, + "step": 9139 + }, + { + "epoch": 2.4321447578499202, + "grad_norm": 0.2656151056289673, + "learning_rate": 1.0590752199236826e-07, + "loss": 0.1817, + "step": 9140 + }, + { + "epoch": 2.432410856838744, + "grad_norm": 0.3891696333885193, + "learning_rate": 1.0589066293621303e-07, + "loss": 0.18, + "step": 9141 + }, + { + "epoch": 2.432676955827568, + "grad_norm": 0.2795925438404083, + "learning_rate": 1.0587380371204415e-07, + "loss": 0.1829, + "step": 9142 + }, + { + "epoch": 2.4329430548163917, + "grad_norm": 0.36538341641426086, + "learning_rate": 1.0585694432034245e-07, + "loss": 0.1851, + "step": 9143 + }, + { + "epoch": 2.4332091538052154, + "grad_norm": 0.28621426224708557, + "learning_rate": 1.0584008476158876e-07, + "loss": 0.1966, + "step": 9144 + }, + { + "epoch": 2.4334752527940395, + "grad_norm": 0.2763318121433258, + "learning_rate": 1.0582322503626404e-07, + "loss": 0.1722, + "step": 9145 + }, + { + "epoch": 2.433741351782863, + "grad_norm": 0.2642519176006317, + "learning_rate": 1.0580636514484908e-07, + "loss": 0.1821, + "step": 9146 + }, + { + "epoch": 2.434007450771687, + "grad_norm": 0.47540873289108276, + "learning_rate": 1.0578950508782481e-07, + "loss": 0.1861, + "step": 9147 + }, + { + "epoch": 2.434273549760511, + "grad_norm": 0.2590658664703369, + "learning_rate": 1.0577264486567206e-07, + "loss": 0.1633, + "step": 9148 + }, + { + "epoch": 2.4345396487493347, + "grad_norm": 0.24722439050674438, + "learning_rate": 1.0575578447887177e-07, + "loss": 0.1656, + "step": 9149 + }, + { + "epoch": 2.4348057477381584, + "grad_norm": 0.29171332716941833, + "learning_rate": 1.057389239279048e-07, + "loss": 0.177, + "step": 9150 + }, + { + "epoch": 2.4350718467269825, + "grad_norm": 0.41647812724113464, + "learning_rate": 1.0572206321325203e-07, + "loss": 0.1917, + "step": 9151 + }, + { + "epoch": 2.435337945715806, + "grad_norm": 0.3675501346588135, + "learning_rate": 1.0570520233539442e-07, + "loss": 0.1893, + "step": 9152 + }, + { + "epoch": 2.43560404470463, + "grad_norm": 0.3477460741996765, + "learning_rate": 1.0568834129481283e-07, + "loss": 0.189, + "step": 9153 + }, + { + "epoch": 2.435870143693454, + "grad_norm": 0.2822449803352356, + "learning_rate": 1.056714800919882e-07, + "loss": 0.1843, + "step": 9154 + }, + { + "epoch": 2.4361362426822777, + "grad_norm": 0.28075313568115234, + "learning_rate": 1.0565461872740142e-07, + "loss": 0.1861, + "step": 9155 + }, + { + "epoch": 2.436402341671102, + "grad_norm": 0.3235592246055603, + "learning_rate": 1.0563775720153345e-07, + "loss": 0.17, + "step": 9156 + }, + { + "epoch": 2.4366684406599255, + "grad_norm": 0.5169726014137268, + "learning_rate": 1.0562089551486517e-07, + "loss": 0.1883, + "step": 9157 + }, + { + "epoch": 2.436934539648749, + "grad_norm": 0.2786189019680023, + "learning_rate": 1.0560403366787756e-07, + "loss": 0.1865, + "step": 9158 + }, + { + "epoch": 2.4372006386375733, + "grad_norm": 0.35602644085884094, + "learning_rate": 1.0558717166105146e-07, + "loss": 0.1824, + "step": 9159 + }, + { + "epoch": 2.437466737626397, + "grad_norm": 0.28808149695396423, + "learning_rate": 1.0557030949486792e-07, + "loss": 0.1761, + "step": 9160 + }, + { + "epoch": 2.437732836615221, + "grad_norm": 0.3073597550392151, + "learning_rate": 1.0555344716980783e-07, + "loss": 0.1681, + "step": 9161 + }, + { + "epoch": 2.437998935604045, + "grad_norm": 0.39261654019355774, + "learning_rate": 1.0553658468635212e-07, + "loss": 0.1721, + "step": 9162 + }, + { + "epoch": 2.4382650345928685, + "grad_norm": 0.2629826068878174, + "learning_rate": 1.0551972204498176e-07, + "loss": 0.173, + "step": 9163 + }, + { + "epoch": 2.4385311335816926, + "grad_norm": 0.2701939344406128, + "learning_rate": 1.0550285924617774e-07, + "loss": 0.175, + "step": 9164 + }, + { + "epoch": 2.4387972325705163, + "grad_norm": 0.2661333382129669, + "learning_rate": 1.0548599629042099e-07, + "loss": 0.1674, + "step": 9165 + }, + { + "epoch": 2.43906333155934, + "grad_norm": 0.2746449112892151, + "learning_rate": 1.0546913317819243e-07, + "loss": 0.1643, + "step": 9166 + }, + { + "epoch": 2.439329430548164, + "grad_norm": 0.4126463830471039, + "learning_rate": 1.0545226990997311e-07, + "loss": 0.1863, + "step": 9167 + }, + { + "epoch": 2.4395955295369878, + "grad_norm": 0.29806506633758545, + "learning_rate": 1.0543540648624398e-07, + "loss": 0.1849, + "step": 9168 + }, + { + "epoch": 2.4398616285258115, + "grad_norm": 0.2616540491580963, + "learning_rate": 1.05418542907486e-07, + "loss": 0.1738, + "step": 9169 + }, + { + "epoch": 2.4401277275146356, + "grad_norm": 0.4370535910129547, + "learning_rate": 1.054016791741802e-07, + "loss": 0.2004, + "step": 9170 + }, + { + "epoch": 2.4403938265034593, + "grad_norm": 0.24907106161117554, + "learning_rate": 1.0538481528680748e-07, + "loss": 0.1621, + "step": 9171 + }, + { + "epoch": 2.440659925492283, + "grad_norm": 0.272420197725296, + "learning_rate": 1.0536795124584889e-07, + "loss": 0.1775, + "step": 9172 + }, + { + "epoch": 2.440926024481107, + "grad_norm": 0.39607375860214233, + "learning_rate": 1.0535108705178544e-07, + "loss": 0.1902, + "step": 9173 + }, + { + "epoch": 2.4411921234699308, + "grad_norm": 0.35752958059310913, + "learning_rate": 1.0533422270509813e-07, + "loss": 0.1848, + "step": 9174 + }, + { + "epoch": 2.4414582224587544, + "grad_norm": 0.4046279489994049, + "learning_rate": 1.0531735820626792e-07, + "loss": 0.1906, + "step": 9175 + }, + { + "epoch": 2.4417243214475786, + "grad_norm": 0.3328685164451599, + "learning_rate": 1.0530049355577589e-07, + "loss": 0.1894, + "step": 9176 + }, + { + "epoch": 2.4419904204364022, + "grad_norm": 0.28564098477363586, + "learning_rate": 1.0528362875410297e-07, + "loss": 0.1801, + "step": 9177 + }, + { + "epoch": 2.4422565194252264, + "grad_norm": 0.2789086401462555, + "learning_rate": 1.0526676380173022e-07, + "loss": 0.1964, + "step": 9178 + }, + { + "epoch": 2.44252261841405, + "grad_norm": 0.8059722185134888, + "learning_rate": 1.052498986991387e-07, + "loss": 0.2054, + "step": 9179 + }, + { + "epoch": 2.4427887174028737, + "grad_norm": 0.25585755705833435, + "learning_rate": 1.0523303344680936e-07, + "loss": 0.163, + "step": 9180 + }, + { + "epoch": 2.443054816391698, + "grad_norm": 0.27711984515190125, + "learning_rate": 1.0521616804522331e-07, + "loss": 0.1981, + "step": 9181 + }, + { + "epoch": 2.4433209153805215, + "grad_norm": 0.6175658106803894, + "learning_rate": 1.0519930249486153e-07, + "loss": 0.1864, + "step": 9182 + }, + { + "epoch": 2.4435870143693452, + "grad_norm": 0.3515039384365082, + "learning_rate": 1.0518243679620508e-07, + "loss": 0.188, + "step": 9183 + }, + { + "epoch": 2.4438531133581693, + "grad_norm": 0.2667018473148346, + "learning_rate": 1.05165570949735e-07, + "loss": 0.1828, + "step": 9184 + }, + { + "epoch": 2.444119212346993, + "grad_norm": 0.24234716594219208, + "learning_rate": 1.0514870495593233e-07, + "loss": 0.1608, + "step": 9185 + }, + { + "epoch": 2.444385311335817, + "grad_norm": 0.46376004815101624, + "learning_rate": 1.0513183881527816e-07, + "loss": 0.2001, + "step": 9186 + }, + { + "epoch": 2.444651410324641, + "grad_norm": 0.6197704672813416, + "learning_rate": 1.0511497252825348e-07, + "loss": 0.1921, + "step": 9187 + }, + { + "epoch": 2.4449175093134645, + "grad_norm": 0.2841981053352356, + "learning_rate": 1.0509810609533943e-07, + "loss": 0.1957, + "step": 9188 + }, + { + "epoch": 2.4451836083022886, + "grad_norm": 0.25743114948272705, + "learning_rate": 1.0508123951701699e-07, + "loss": 0.1595, + "step": 9189 + }, + { + "epoch": 2.4454497072911123, + "grad_norm": 0.3657349944114685, + "learning_rate": 1.050643727937673e-07, + "loss": 0.1826, + "step": 9190 + }, + { + "epoch": 2.445715806279936, + "grad_norm": 0.3468981981277466, + "learning_rate": 1.0504750592607141e-07, + "loss": 0.1835, + "step": 9191 + }, + { + "epoch": 2.44598190526876, + "grad_norm": 0.25112074613571167, + "learning_rate": 1.0503063891441037e-07, + "loss": 0.1652, + "step": 9192 + }, + { + "epoch": 2.446248004257584, + "grad_norm": 0.370879590511322, + "learning_rate": 1.0501377175926532e-07, + "loss": 0.1767, + "step": 9193 + }, + { + "epoch": 2.4465141032464075, + "grad_norm": 0.34426987171173096, + "learning_rate": 1.0499690446111729e-07, + "loss": 0.2064, + "step": 9194 + }, + { + "epoch": 2.4467802022352316, + "grad_norm": 0.2657172977924347, + "learning_rate": 1.0498003702044738e-07, + "loss": 0.1805, + "step": 9195 + }, + { + "epoch": 2.4470463012240553, + "grad_norm": 0.28037121891975403, + "learning_rate": 1.049631694377367e-07, + "loss": 0.1796, + "step": 9196 + }, + { + "epoch": 2.447312400212879, + "grad_norm": 0.2732643783092499, + "learning_rate": 1.0494630171346637e-07, + "loss": 0.172, + "step": 9197 + }, + { + "epoch": 2.447578499201703, + "grad_norm": 0.3457004725933075, + "learning_rate": 1.0492943384811744e-07, + "loss": 0.1878, + "step": 9198 + }, + { + "epoch": 2.447844598190527, + "grad_norm": 0.2657914161682129, + "learning_rate": 1.0491256584217104e-07, + "loss": 0.1737, + "step": 9199 + }, + { + "epoch": 2.4481106971793505, + "grad_norm": 0.28884458541870117, + "learning_rate": 1.0489569769610829e-07, + "loss": 0.2081, + "step": 9200 + }, + { + "epoch": 2.4483767961681746, + "grad_norm": 0.24465011060237885, + "learning_rate": 1.0487882941041029e-07, + "loss": 0.1601, + "step": 9201 + }, + { + "epoch": 2.4486428951569983, + "grad_norm": 0.2586555480957031, + "learning_rate": 1.0486196098555815e-07, + "loss": 0.1587, + "step": 9202 + }, + { + "epoch": 2.4489089941458224, + "grad_norm": 0.3549834191799164, + "learning_rate": 1.0484509242203301e-07, + "loss": 0.2047, + "step": 9203 + }, + { + "epoch": 2.449175093134646, + "grad_norm": 0.2833446264266968, + "learning_rate": 1.0482822372031599e-07, + "loss": 0.1972, + "step": 9204 + }, + { + "epoch": 2.44944119212347, + "grad_norm": 0.334438294172287, + "learning_rate": 1.0481135488088822e-07, + "loss": 0.1841, + "step": 9205 + }, + { + "epoch": 2.449707291112294, + "grad_norm": 0.33862197399139404, + "learning_rate": 1.0479448590423081e-07, + "loss": 0.2078, + "step": 9206 + }, + { + "epoch": 2.4499733901011176, + "grad_norm": 0.26140615344047546, + "learning_rate": 1.0477761679082494e-07, + "loss": 0.174, + "step": 9207 + }, + { + "epoch": 2.4502394890899413, + "grad_norm": 0.27811679244041443, + "learning_rate": 1.0476074754115171e-07, + "loss": 0.1606, + "step": 9208 + }, + { + "epoch": 2.4505055880787654, + "grad_norm": 0.373708575963974, + "learning_rate": 1.0474387815569233e-07, + "loss": 0.1765, + "step": 9209 + }, + { + "epoch": 2.450771687067589, + "grad_norm": 0.2891259491443634, + "learning_rate": 1.0472700863492785e-07, + "loss": 0.1837, + "step": 9210 + }, + { + "epoch": 2.451037786056413, + "grad_norm": 0.27483755350112915, + "learning_rate": 1.0471013897933951e-07, + "loss": 0.1831, + "step": 9211 + }, + { + "epoch": 2.451303885045237, + "grad_norm": 0.2903304696083069, + "learning_rate": 1.0469326918940846e-07, + "loss": 0.1895, + "step": 9212 + }, + { + "epoch": 2.4515699840340606, + "grad_norm": 0.28100165724754333, + "learning_rate": 1.0467639926561578e-07, + "loss": 0.1783, + "step": 9213 + }, + { + "epoch": 2.4518360830228847, + "grad_norm": 0.35207033157348633, + "learning_rate": 1.0465952920844272e-07, + "loss": 0.1776, + "step": 9214 + }, + { + "epoch": 2.4521021820117084, + "grad_norm": 0.27439919114112854, + "learning_rate": 1.0464265901837044e-07, + "loss": 0.1747, + "step": 9215 + }, + { + "epoch": 2.452368281000532, + "grad_norm": 0.34388262033462524, + "learning_rate": 1.0462578869588006e-07, + "loss": 0.1846, + "step": 9216 + }, + { + "epoch": 2.452634379989356, + "grad_norm": 0.2698422968387604, + "learning_rate": 1.0460891824145276e-07, + "loss": 0.1692, + "step": 9217 + }, + { + "epoch": 2.45290047897818, + "grad_norm": 0.263778418302536, + "learning_rate": 1.0459204765556979e-07, + "loss": 0.1687, + "step": 9218 + }, + { + "epoch": 2.4531665779670035, + "grad_norm": 0.25234565138816833, + "learning_rate": 1.045751769387123e-07, + "loss": 0.1875, + "step": 9219 + }, + { + "epoch": 2.4534326769558277, + "grad_norm": 0.27046290040016174, + "learning_rate": 1.0455830609136142e-07, + "loss": 0.1943, + "step": 9220 + }, + { + "epoch": 2.4536987759446514, + "grad_norm": 0.31335800886154175, + "learning_rate": 1.0454143511399842e-07, + "loss": 0.1999, + "step": 9221 + }, + { + "epoch": 2.453964874933475, + "grad_norm": 0.26846936345100403, + "learning_rate": 1.0452456400710446e-07, + "loss": 0.1853, + "step": 9222 + }, + { + "epoch": 2.454230973922299, + "grad_norm": 0.2844768166542053, + "learning_rate": 1.0450769277116073e-07, + "loss": 0.195, + "step": 9223 + }, + { + "epoch": 2.454497072911123, + "grad_norm": 0.385153591632843, + "learning_rate": 1.0449082140664844e-07, + "loss": 0.1918, + "step": 9224 + }, + { + "epoch": 2.454763171899947, + "grad_norm": 0.3817715346813202, + "learning_rate": 1.044739499140488e-07, + "loss": 0.1998, + "step": 9225 + }, + { + "epoch": 2.4550292708887707, + "grad_norm": 0.2779160141944885, + "learning_rate": 1.0445707829384302e-07, + "loss": 0.1771, + "step": 9226 + }, + { + "epoch": 2.4552953698775943, + "grad_norm": 0.2708698511123657, + "learning_rate": 1.0444020654651231e-07, + "loss": 0.1754, + "step": 9227 + }, + { + "epoch": 2.4555614688664185, + "grad_norm": 0.3319680094718933, + "learning_rate": 1.0442333467253788e-07, + "loss": 0.1878, + "step": 9228 + }, + { + "epoch": 2.455827567855242, + "grad_norm": 0.2562406659126282, + "learning_rate": 1.0440646267240097e-07, + "loss": 0.1789, + "step": 9229 + }, + { + "epoch": 2.456093666844066, + "grad_norm": 0.3279339671134949, + "learning_rate": 1.0438959054658278e-07, + "loss": 0.1923, + "step": 9230 + }, + { + "epoch": 2.45635976583289, + "grad_norm": 0.2669832706451416, + "learning_rate": 1.0437271829556454e-07, + "loss": 0.1877, + "step": 9231 + }, + { + "epoch": 2.4566258648217136, + "grad_norm": 0.34134459495544434, + "learning_rate": 1.043558459198275e-07, + "loss": 0.1918, + "step": 9232 + }, + { + "epoch": 2.4568919638105378, + "grad_norm": 0.28160595893859863, + "learning_rate": 1.043389734198529e-07, + "loss": 0.1785, + "step": 9233 + }, + { + "epoch": 2.4571580627993614, + "grad_norm": 0.34453848004341125, + "learning_rate": 1.0432210079612193e-07, + "loss": 0.1897, + "step": 9234 + }, + { + "epoch": 2.457424161788185, + "grad_norm": 0.2812197208404541, + "learning_rate": 1.0430522804911588e-07, + "loss": 0.1921, + "step": 9235 + }, + { + "epoch": 2.4576902607770093, + "grad_norm": 0.25974878668785095, + "learning_rate": 1.0428835517931598e-07, + "loss": 0.1789, + "step": 9236 + }, + { + "epoch": 2.457956359765833, + "grad_norm": 0.25216591358184814, + "learning_rate": 1.0427148218720348e-07, + "loss": 0.1744, + "step": 9237 + }, + { + "epoch": 2.4582224587546566, + "grad_norm": 0.2700729966163635, + "learning_rate": 1.0425460907325958e-07, + "loss": 0.1721, + "step": 9238 + }, + { + "epoch": 2.4584885577434807, + "grad_norm": 0.261356383562088, + "learning_rate": 1.0423773583796562e-07, + "loss": 0.1678, + "step": 9239 + }, + { + "epoch": 2.4587546567323044, + "grad_norm": 0.2315402776002884, + "learning_rate": 1.0422086248180284e-07, + "loss": 0.1611, + "step": 9240 + }, + { + "epoch": 2.459020755721128, + "grad_norm": 0.2772538661956787, + "learning_rate": 1.0420398900525246e-07, + "loss": 0.1773, + "step": 9241 + }, + { + "epoch": 2.4592868547099522, + "grad_norm": 0.32699814438819885, + "learning_rate": 1.041871154087958e-07, + "loss": 0.1913, + "step": 9242 + }, + { + "epoch": 2.459552953698776, + "grad_norm": 0.4005555212497711, + "learning_rate": 1.0417024169291406e-07, + "loss": 0.1801, + "step": 9243 + }, + { + "epoch": 2.4598190526875996, + "grad_norm": 0.4004446864128113, + "learning_rate": 1.0415336785808854e-07, + "loss": 0.1926, + "step": 9244 + }, + { + "epoch": 2.4600851516764237, + "grad_norm": 0.394351601600647, + "learning_rate": 1.041364939048006e-07, + "loss": 0.1951, + "step": 9245 + }, + { + "epoch": 2.4603512506652474, + "grad_norm": 0.26334112882614136, + "learning_rate": 1.0411961983353136e-07, + "loss": 0.1721, + "step": 9246 + }, + { + "epoch": 2.460617349654071, + "grad_norm": 0.27910804748535156, + "learning_rate": 1.0410274564476226e-07, + "loss": 0.1808, + "step": 9247 + }, + { + "epoch": 2.460883448642895, + "grad_norm": 0.3083896040916443, + "learning_rate": 1.040858713389745e-07, + "loss": 0.1801, + "step": 9248 + }, + { + "epoch": 2.461149547631719, + "grad_norm": 0.2789459824562073, + "learning_rate": 1.0406899691664937e-07, + "loss": 0.1926, + "step": 9249 + }, + { + "epoch": 2.461415646620543, + "grad_norm": 0.31074976921081543, + "learning_rate": 1.0405212237826819e-07, + "loss": 0.1733, + "step": 9250 + }, + { + "epoch": 2.4616817456093667, + "grad_norm": 0.30033254623413086, + "learning_rate": 1.0403524772431225e-07, + "loss": 0.1815, + "step": 9251 + }, + { + "epoch": 2.4619478445981904, + "grad_norm": 0.48929575085639954, + "learning_rate": 1.0401837295526286e-07, + "loss": 0.2004, + "step": 9252 + }, + { + "epoch": 2.4622139435870145, + "grad_norm": 0.32782241702079773, + "learning_rate": 1.0400149807160126e-07, + "loss": 0.1861, + "step": 9253 + }, + { + "epoch": 2.462480042575838, + "grad_norm": 0.3081984221935272, + "learning_rate": 1.0398462307380885e-07, + "loss": 0.2045, + "step": 9254 + }, + { + "epoch": 2.462746141564662, + "grad_norm": 0.2875906527042389, + "learning_rate": 1.0396774796236688e-07, + "loss": 0.1893, + "step": 9255 + }, + { + "epoch": 2.463012240553486, + "grad_norm": 0.2433844357728958, + "learning_rate": 1.0395087273775665e-07, + "loss": 0.1593, + "step": 9256 + }, + { + "epoch": 2.4632783395423097, + "grad_norm": 0.3563707172870636, + "learning_rate": 1.0393399740045953e-07, + "loss": 0.1947, + "step": 9257 + }, + { + "epoch": 2.463544438531134, + "grad_norm": 0.2830277681350708, + "learning_rate": 1.0391712195095681e-07, + "loss": 0.182, + "step": 9258 + }, + { + "epoch": 2.4638105375199575, + "grad_norm": 0.3828220069408417, + "learning_rate": 1.039002463897298e-07, + "loss": 0.179, + "step": 9259 + }, + { + "epoch": 2.464076636508781, + "grad_norm": 0.27911296486854553, + "learning_rate": 1.0388337071725983e-07, + "loss": 0.1807, + "step": 9260 + }, + { + "epoch": 2.4643427354976053, + "grad_norm": 0.2616578936576843, + "learning_rate": 1.0386649493402826e-07, + "loss": 0.1744, + "step": 9261 + }, + { + "epoch": 2.464608834486429, + "grad_norm": 0.2865540683269501, + "learning_rate": 1.0384961904051637e-07, + "loss": 0.1874, + "step": 9262 + }, + { + "epoch": 2.4648749334752527, + "grad_norm": 0.31111499667167664, + "learning_rate": 1.0383274303720553e-07, + "loss": 0.1952, + "step": 9263 + }, + { + "epoch": 2.465141032464077, + "grad_norm": 0.25772467255592346, + "learning_rate": 1.0381586692457706e-07, + "loss": 0.1894, + "step": 9264 + }, + { + "epoch": 2.4654071314529005, + "grad_norm": 0.26864370703697205, + "learning_rate": 1.0379899070311233e-07, + "loss": 0.1745, + "step": 9265 + }, + { + "epoch": 2.465673230441724, + "grad_norm": 0.3010498285293579, + "learning_rate": 1.0378211437329267e-07, + "loss": 0.192, + "step": 9266 + }, + { + "epoch": 2.4659393294305483, + "grad_norm": 0.3567357063293457, + "learning_rate": 1.0376523793559944e-07, + "loss": 0.1773, + "step": 9267 + }, + { + "epoch": 2.466205428419372, + "grad_norm": 0.2687128186225891, + "learning_rate": 1.0374836139051392e-07, + "loss": 0.155, + "step": 9268 + }, + { + "epoch": 2.4664715274081956, + "grad_norm": 0.2678039073944092, + "learning_rate": 1.0373148473851756e-07, + "loss": 0.1657, + "step": 9269 + }, + { + "epoch": 2.4667376263970198, + "grad_norm": 0.27065882086753845, + "learning_rate": 1.0371460798009166e-07, + "loss": 0.1856, + "step": 9270 + }, + { + "epoch": 2.4670037253858434, + "grad_norm": 0.26058974862098694, + "learning_rate": 1.0369773111571756e-07, + "loss": 0.1699, + "step": 9271 + }, + { + "epoch": 2.467269824374667, + "grad_norm": 0.26828834414482117, + "learning_rate": 1.036808541458767e-07, + "loss": 0.1762, + "step": 9272 + }, + { + "epoch": 2.4675359233634913, + "grad_norm": 0.2882813513278961, + "learning_rate": 1.0366397707105038e-07, + "loss": 0.1941, + "step": 9273 + }, + { + "epoch": 2.467802022352315, + "grad_norm": 0.32214367389678955, + "learning_rate": 1.0364709989172e-07, + "loss": 0.168, + "step": 9274 + }, + { + "epoch": 2.468068121341139, + "grad_norm": 0.30894196033477783, + "learning_rate": 1.0363022260836689e-07, + "loss": 0.1942, + "step": 9275 + }, + { + "epoch": 2.4683342203299627, + "grad_norm": 0.25856220722198486, + "learning_rate": 1.036133452214725e-07, + "loss": 0.1647, + "step": 9276 + }, + { + "epoch": 2.4686003193187864, + "grad_norm": 0.2820211946964264, + "learning_rate": 1.0359646773151814e-07, + "loss": 0.1622, + "step": 9277 + }, + { + "epoch": 2.4688664183076106, + "grad_norm": 0.2777004837989807, + "learning_rate": 1.0357959013898519e-07, + "loss": 0.1869, + "step": 9278 + }, + { + "epoch": 2.4691325172964342, + "grad_norm": 0.4337560832500458, + "learning_rate": 1.0356271244435509e-07, + "loss": 0.1787, + "step": 9279 + }, + { + "epoch": 2.4693986162852584, + "grad_norm": 0.24693241715431213, + "learning_rate": 1.0354583464810915e-07, + "loss": 0.1718, + "step": 9280 + }, + { + "epoch": 2.469664715274082, + "grad_norm": 0.25040891766548157, + "learning_rate": 1.0352895675072883e-07, + "loss": 0.1583, + "step": 9281 + }, + { + "epoch": 2.4699308142629057, + "grad_norm": 0.3456902801990509, + "learning_rate": 1.0351207875269547e-07, + "loss": 0.1733, + "step": 9282 + }, + { + "epoch": 2.47019691325173, + "grad_norm": 0.26327085494995117, + "learning_rate": 1.0349520065449051e-07, + "loss": 0.1785, + "step": 9283 + }, + { + "epoch": 2.4704630122405535, + "grad_norm": 0.2974182963371277, + "learning_rate": 1.0347832245659532e-07, + "loss": 0.1874, + "step": 9284 + }, + { + "epoch": 2.470729111229377, + "grad_norm": 0.30382218956947327, + "learning_rate": 1.0346144415949131e-07, + "loss": 0.191, + "step": 9285 + }, + { + "epoch": 2.4709952102182013, + "grad_norm": 0.3306291997432709, + "learning_rate": 1.0344456576365985e-07, + "loss": 0.1858, + "step": 9286 + }, + { + "epoch": 2.471261309207025, + "grad_norm": 0.324855774641037, + "learning_rate": 1.0342768726958242e-07, + "loss": 0.1854, + "step": 9287 + }, + { + "epoch": 2.4715274081958487, + "grad_norm": 0.2692417502403259, + "learning_rate": 1.0341080867774037e-07, + "loss": 0.1753, + "step": 9288 + }, + { + "epoch": 2.471793507184673, + "grad_norm": 0.27305248379707336, + "learning_rate": 1.0339392998861509e-07, + "loss": 0.1774, + "step": 9289 + }, + { + "epoch": 2.4720596061734965, + "grad_norm": 0.25481218099594116, + "learning_rate": 1.0337705120268808e-07, + "loss": 0.1751, + "step": 9290 + }, + { + "epoch": 2.47232570516232, + "grad_norm": 1.4289354085922241, + "learning_rate": 1.0336017232044067e-07, + "loss": 0.1888, + "step": 9291 + }, + { + "epoch": 2.4725918041511443, + "grad_norm": 0.2711571156978607, + "learning_rate": 1.0334329334235434e-07, + "loss": 0.1698, + "step": 9292 + }, + { + "epoch": 2.472857903139968, + "grad_norm": 0.3030376136302948, + "learning_rate": 1.0332641426891046e-07, + "loss": 0.2016, + "step": 9293 + }, + { + "epoch": 2.4731240021287917, + "grad_norm": 0.2706642746925354, + "learning_rate": 1.0330953510059053e-07, + "loss": 0.1864, + "step": 9294 + }, + { + "epoch": 2.473390101117616, + "grad_norm": 0.2875572144985199, + "learning_rate": 1.0329265583787592e-07, + "loss": 0.1832, + "step": 9295 + }, + { + "epoch": 2.4736562001064395, + "grad_norm": 0.3796294033527374, + "learning_rate": 1.0327577648124803e-07, + "loss": 0.1875, + "step": 9296 + }, + { + "epoch": 2.4739222990952636, + "grad_norm": 0.3311891555786133, + "learning_rate": 1.032588970311884e-07, + "loss": 0.1752, + "step": 9297 + }, + { + "epoch": 2.4741883980840873, + "grad_norm": 0.26984742283821106, + "learning_rate": 1.0324201748817837e-07, + "loss": 0.1731, + "step": 9298 + }, + { + "epoch": 2.474454497072911, + "grad_norm": 0.3767528235912323, + "learning_rate": 1.0322513785269942e-07, + "loss": 0.1915, + "step": 9299 + }, + { + "epoch": 2.474720596061735, + "grad_norm": 0.36461547017097473, + "learning_rate": 1.0320825812523295e-07, + "loss": 0.1782, + "step": 9300 + }, + { + "epoch": 2.474986695050559, + "grad_norm": 0.4834677577018738, + "learning_rate": 1.0319137830626048e-07, + "loss": 0.2043, + "step": 9301 + }, + { + "epoch": 2.4752527940393825, + "grad_norm": 0.27153876423835754, + "learning_rate": 1.0317449839626341e-07, + "loss": 0.1941, + "step": 9302 + }, + { + "epoch": 2.4755188930282066, + "grad_norm": 0.2806762754917145, + "learning_rate": 1.0315761839572321e-07, + "loss": 0.1742, + "step": 9303 + }, + { + "epoch": 2.4757849920170303, + "grad_norm": 0.37570688128471375, + "learning_rate": 1.0314073830512129e-07, + "loss": 0.1832, + "step": 9304 + }, + { + "epoch": 2.4760510910058544, + "grad_norm": 0.3300805389881134, + "learning_rate": 1.0312385812493915e-07, + "loss": 0.1985, + "step": 9305 + }, + { + "epoch": 2.476317189994678, + "grad_norm": 0.3584786355495453, + "learning_rate": 1.0310697785565822e-07, + "loss": 0.1857, + "step": 9306 + }, + { + "epoch": 2.4765832889835018, + "grad_norm": 0.31816062331199646, + "learning_rate": 1.0309009749775997e-07, + "loss": 0.1724, + "step": 9307 + }, + { + "epoch": 2.476849387972326, + "grad_norm": 0.28452759981155396, + "learning_rate": 1.0307321705172585e-07, + "loss": 0.2028, + "step": 9308 + }, + { + "epoch": 2.4771154869611496, + "grad_norm": 0.27786099910736084, + "learning_rate": 1.0305633651803735e-07, + "loss": 0.1835, + "step": 9309 + }, + { + "epoch": 2.4773815859499733, + "grad_norm": 0.26919254660606384, + "learning_rate": 1.0303945589717593e-07, + "loss": 0.201, + "step": 9310 + }, + { + "epoch": 2.4776476849387974, + "grad_norm": 0.3610513508319855, + "learning_rate": 1.03022575189623e-07, + "loss": 0.2057, + "step": 9311 + }, + { + "epoch": 2.477913783927621, + "grad_norm": 0.2630827724933624, + "learning_rate": 1.0300569439586014e-07, + "loss": 0.1696, + "step": 9312 + }, + { + "epoch": 2.4781798829164448, + "grad_norm": 0.3903343379497528, + "learning_rate": 1.0298881351636876e-07, + "loss": 0.1876, + "step": 9313 + }, + { + "epoch": 2.478445981905269, + "grad_norm": 0.2585439383983612, + "learning_rate": 1.0297193255163031e-07, + "loss": 0.1657, + "step": 9314 + }, + { + "epoch": 2.4787120808940926, + "grad_norm": 0.25223180651664734, + "learning_rate": 1.0295505150212633e-07, + "loss": 0.1723, + "step": 9315 + }, + { + "epoch": 2.4789781798829162, + "grad_norm": 0.34046486020088196, + "learning_rate": 1.029381703683383e-07, + "loss": 0.1857, + "step": 9316 + }, + { + "epoch": 2.4792442788717404, + "grad_norm": 0.4852840006351471, + "learning_rate": 1.0292128915074762e-07, + "loss": 0.1898, + "step": 9317 + }, + { + "epoch": 2.479510377860564, + "grad_norm": 0.2788182199001312, + "learning_rate": 1.0290440784983587e-07, + "loss": 0.174, + "step": 9318 + }, + { + "epoch": 2.4797764768493877, + "grad_norm": 0.3081790804862976, + "learning_rate": 1.028875264660845e-07, + "loss": 0.2035, + "step": 9319 + }, + { + "epoch": 2.480042575838212, + "grad_norm": 0.2941235303878784, + "learning_rate": 1.0287064499997502e-07, + "loss": 0.207, + "step": 9320 + }, + { + "epoch": 2.4803086748270355, + "grad_norm": 0.28276410698890686, + "learning_rate": 1.0285376345198891e-07, + "loss": 0.1798, + "step": 9321 + }, + { + "epoch": 2.4805747738158597, + "grad_norm": 0.26255303621292114, + "learning_rate": 1.0283688182260766e-07, + "loss": 0.1696, + "step": 9322 + }, + { + "epoch": 2.4808408728046834, + "grad_norm": 0.32391974329948425, + "learning_rate": 1.0282000011231277e-07, + "loss": 0.1836, + "step": 9323 + }, + { + "epoch": 2.481106971793507, + "grad_norm": 0.2735765278339386, + "learning_rate": 1.0280311832158577e-07, + "loss": 0.1811, + "step": 9324 + }, + { + "epoch": 2.481373070782331, + "grad_norm": 0.28009557723999023, + "learning_rate": 1.0278623645090809e-07, + "loss": 0.1631, + "step": 9325 + }, + { + "epoch": 2.481639169771155, + "grad_norm": 0.29189515113830566, + "learning_rate": 1.0276935450076134e-07, + "loss": 0.1784, + "step": 9326 + }, + { + "epoch": 2.4819052687599785, + "grad_norm": 0.3350098729133606, + "learning_rate": 1.0275247247162695e-07, + "loss": 0.1968, + "step": 9327 + }, + { + "epoch": 2.4821713677488026, + "grad_norm": 0.25231385231018066, + "learning_rate": 1.0273559036398646e-07, + "loss": 0.169, + "step": 9328 + }, + { + "epoch": 2.4824374667376263, + "grad_norm": 0.35908132791519165, + "learning_rate": 1.0271870817832135e-07, + "loss": 0.1729, + "step": 9329 + }, + { + "epoch": 2.4827035657264505, + "grad_norm": 0.29658937454223633, + "learning_rate": 1.0270182591511318e-07, + "loss": 0.1744, + "step": 9330 + }, + { + "epoch": 2.482969664715274, + "grad_norm": 0.35279542207717896, + "learning_rate": 1.0268494357484345e-07, + "loss": 0.1937, + "step": 9331 + }, + { + "epoch": 2.483235763704098, + "grad_norm": 0.30892497301101685, + "learning_rate": 1.0266806115799364e-07, + "loss": 0.1729, + "step": 9332 + }, + { + "epoch": 2.483501862692922, + "grad_norm": 0.2550913393497467, + "learning_rate": 1.0265117866504533e-07, + "loss": 0.1578, + "step": 9333 + }, + { + "epoch": 2.4837679616817456, + "grad_norm": 0.2842426002025604, + "learning_rate": 1.0263429609648002e-07, + "loss": 0.187, + "step": 9334 + }, + { + "epoch": 2.4840340606705693, + "grad_norm": 0.2720085680484772, + "learning_rate": 1.0261741345277921e-07, + "loss": 0.1756, + "step": 9335 + }, + { + "epoch": 2.4843001596593934, + "grad_norm": 0.3797548711299896, + "learning_rate": 1.0260053073442448e-07, + "loss": 0.1834, + "step": 9336 + }, + { + "epoch": 2.484566258648217, + "grad_norm": 0.2911050021648407, + "learning_rate": 1.0258364794189729e-07, + "loss": 0.1752, + "step": 9337 + }, + { + "epoch": 2.484832357637041, + "grad_norm": 0.36338064074516296, + "learning_rate": 1.0256676507567925e-07, + "loss": 0.1764, + "step": 9338 + }, + { + "epoch": 2.485098456625865, + "grad_norm": 0.34289366006851196, + "learning_rate": 1.0254988213625184e-07, + "loss": 0.1737, + "step": 9339 + }, + { + "epoch": 2.4853645556146886, + "grad_norm": 0.4596160352230072, + "learning_rate": 1.0253299912409659e-07, + "loss": 0.1796, + "step": 9340 + }, + { + "epoch": 2.4856306546035123, + "grad_norm": 0.26905557513237, + "learning_rate": 1.0251611603969507e-07, + "loss": 0.1683, + "step": 9341 + }, + { + "epoch": 2.4858967535923364, + "grad_norm": 0.25621557235717773, + "learning_rate": 1.0249923288352882e-07, + "loss": 0.1699, + "step": 9342 + }, + { + "epoch": 2.48616285258116, + "grad_norm": 0.4178546965122223, + "learning_rate": 1.0248234965607937e-07, + "loss": 0.2018, + "step": 9343 + }, + { + "epoch": 2.4864289515699842, + "grad_norm": 0.27300211787223816, + "learning_rate": 1.0246546635782824e-07, + "loss": 0.1946, + "step": 9344 + }, + { + "epoch": 2.486695050558808, + "grad_norm": 0.2739085853099823, + "learning_rate": 1.0244858298925703e-07, + "loss": 0.1858, + "step": 9345 + }, + { + "epoch": 2.4869611495476316, + "grad_norm": 0.2821018099784851, + "learning_rate": 1.0243169955084725e-07, + "loss": 0.1644, + "step": 9346 + }, + { + "epoch": 2.4872272485364557, + "grad_norm": 0.5601105093955994, + "learning_rate": 1.0241481604308044e-07, + "loss": 0.1706, + "step": 9347 + }, + { + "epoch": 2.4874933475252794, + "grad_norm": 0.48198944330215454, + "learning_rate": 1.0239793246643818e-07, + "loss": 0.1833, + "step": 9348 + }, + { + "epoch": 2.487759446514103, + "grad_norm": 0.3585558533668518, + "learning_rate": 1.0238104882140202e-07, + "loss": 0.196, + "step": 9349 + }, + { + "epoch": 2.488025545502927, + "grad_norm": 0.3091385066509247, + "learning_rate": 1.0236416510845351e-07, + "loss": 0.1896, + "step": 9350 + }, + { + "epoch": 2.488291644491751, + "grad_norm": 0.2934665381908417, + "learning_rate": 1.0234728132807417e-07, + "loss": 0.1977, + "step": 9351 + }, + { + "epoch": 2.488557743480575, + "grad_norm": 0.3533155620098114, + "learning_rate": 1.0233039748074563e-07, + "loss": 0.1789, + "step": 9352 + }, + { + "epoch": 2.4888238424693987, + "grad_norm": 0.3412761986255646, + "learning_rate": 1.0231351356694941e-07, + "loss": 0.1895, + "step": 9353 + }, + { + "epoch": 2.4890899414582224, + "grad_norm": 0.26808640360832214, + "learning_rate": 1.0229662958716709e-07, + "loss": 0.1688, + "step": 9354 + }, + { + "epoch": 2.4893560404470465, + "grad_norm": 0.3249465823173523, + "learning_rate": 1.0227974554188021e-07, + "loss": 0.1964, + "step": 9355 + }, + { + "epoch": 2.48962213943587, + "grad_norm": 0.2572871446609497, + "learning_rate": 1.0226286143157037e-07, + "loss": 0.17, + "step": 9356 + }, + { + "epoch": 2.489888238424694, + "grad_norm": 0.2751869857311249, + "learning_rate": 1.0224597725671913e-07, + "loss": 0.1918, + "step": 9357 + }, + { + "epoch": 2.490154337413518, + "grad_norm": 0.2730103135108948, + "learning_rate": 1.0222909301780802e-07, + "loss": 0.1804, + "step": 9358 + }, + { + "epoch": 2.4904204364023417, + "grad_norm": 0.29515504837036133, + "learning_rate": 1.0221220871531868e-07, + "loss": 0.1724, + "step": 9359 + }, + { + "epoch": 2.4906865353911654, + "grad_norm": 0.28740963339805603, + "learning_rate": 1.0219532434973266e-07, + "loss": 0.1779, + "step": 9360 + }, + { + "epoch": 2.4909526343799895, + "grad_norm": 0.2718030512332916, + "learning_rate": 1.0217843992153153e-07, + "loss": 0.172, + "step": 9361 + }, + { + "epoch": 2.491218733368813, + "grad_norm": 0.2616580128669739, + "learning_rate": 1.0216155543119682e-07, + "loss": 0.1791, + "step": 9362 + }, + { + "epoch": 2.491484832357637, + "grad_norm": 0.7221017479896545, + "learning_rate": 1.021446708792102e-07, + "loss": 0.1789, + "step": 9363 + }, + { + "epoch": 2.491750931346461, + "grad_norm": 0.4294281005859375, + "learning_rate": 1.0212778626605319e-07, + "loss": 0.1717, + "step": 9364 + }, + { + "epoch": 2.4920170303352847, + "grad_norm": 0.26456698775291443, + "learning_rate": 1.0211090159220739e-07, + "loss": 0.1778, + "step": 9365 + }, + { + "epoch": 2.4922831293241083, + "grad_norm": 0.3303757905960083, + "learning_rate": 1.020940168581544e-07, + "loss": 0.172, + "step": 9366 + }, + { + "epoch": 2.4925492283129325, + "grad_norm": 0.40249326825141907, + "learning_rate": 1.0207713206437578e-07, + "loss": 0.1839, + "step": 9367 + }, + { + "epoch": 2.492815327301756, + "grad_norm": 0.36520612239837646, + "learning_rate": 1.0206024721135315e-07, + "loss": 0.1825, + "step": 9368 + }, + { + "epoch": 2.4930814262905803, + "grad_norm": 0.2623172402381897, + "learning_rate": 1.0204336229956806e-07, + "loss": 0.1888, + "step": 9369 + }, + { + "epoch": 2.493347525279404, + "grad_norm": 0.5626546144485474, + "learning_rate": 1.0202647732950214e-07, + "loss": 0.1883, + "step": 9370 + }, + { + "epoch": 2.4936136242682276, + "grad_norm": 0.2496350258588791, + "learning_rate": 1.0200959230163697e-07, + "loss": 0.1581, + "step": 9371 + }, + { + "epoch": 2.4938797232570518, + "grad_norm": 0.26074275374412537, + "learning_rate": 1.0199270721645412e-07, + "loss": 0.1848, + "step": 9372 + }, + { + "epoch": 2.4941458222458754, + "grad_norm": 0.4709879159927368, + "learning_rate": 1.0197582207443522e-07, + "loss": 0.1925, + "step": 9373 + }, + { + "epoch": 2.494411921234699, + "grad_norm": 0.3817465603351593, + "learning_rate": 1.0195893687606188e-07, + "loss": 0.1845, + "step": 9374 + }, + { + "epoch": 2.4946780202235233, + "grad_norm": 0.47722581028938293, + "learning_rate": 1.0194205162181569e-07, + "loss": 0.1892, + "step": 9375 + }, + { + "epoch": 2.494944119212347, + "grad_norm": 0.26187846064567566, + "learning_rate": 1.019251663121782e-07, + "loss": 0.1633, + "step": 9376 + }, + { + "epoch": 2.495210218201171, + "grad_norm": 0.26733526587486267, + "learning_rate": 1.0190828094763109e-07, + "loss": 0.1757, + "step": 9377 + }, + { + "epoch": 2.4954763171899947, + "grad_norm": 0.27688100934028625, + "learning_rate": 1.018913955286559e-07, + "loss": 0.1753, + "step": 9378 + }, + { + "epoch": 2.4957424161788184, + "grad_norm": 0.2718322277069092, + "learning_rate": 1.0187451005573429e-07, + "loss": 0.1727, + "step": 9379 + }, + { + "epoch": 2.4960085151676425, + "grad_norm": 0.29264160990715027, + "learning_rate": 1.0185762452934782e-07, + "loss": 0.199, + "step": 9380 + }, + { + "epoch": 2.4962746141564662, + "grad_norm": 0.28786414861679077, + "learning_rate": 1.0184073894997815e-07, + "loss": 0.1976, + "step": 9381 + }, + { + "epoch": 2.49654071314529, + "grad_norm": 0.2953801155090332, + "learning_rate": 1.0182385331810686e-07, + "loss": 0.1787, + "step": 9382 + }, + { + "epoch": 2.496806812134114, + "grad_norm": 0.4297066032886505, + "learning_rate": 1.0180696763421554e-07, + "loss": 0.1868, + "step": 9383 + }, + { + "epoch": 2.4970729111229377, + "grad_norm": 0.2825303375720978, + "learning_rate": 1.0179008189878586e-07, + "loss": 0.1854, + "step": 9384 + }, + { + "epoch": 2.4973390101117614, + "grad_norm": 0.2745210826396942, + "learning_rate": 1.0177319611229942e-07, + "loss": 0.1617, + "step": 9385 + }, + { + "epoch": 2.4976051091005855, + "grad_norm": 0.2555813789367676, + "learning_rate": 1.0175631027523779e-07, + "loss": 0.1573, + "step": 9386 + }, + { + "epoch": 2.497871208089409, + "grad_norm": 0.26671165227890015, + "learning_rate": 1.0173942438808264e-07, + "loss": 0.1883, + "step": 9387 + }, + { + "epoch": 2.498137307078233, + "grad_norm": 0.2527552545070648, + "learning_rate": 1.0172253845131555e-07, + "loss": 0.1764, + "step": 9388 + }, + { + "epoch": 2.498403406067057, + "grad_norm": 0.3400966227054596, + "learning_rate": 1.0170565246541821e-07, + "loss": 0.173, + "step": 9389 + }, + { + "epoch": 2.4986695050558807, + "grad_norm": 0.27814173698425293, + "learning_rate": 1.0168876643087214e-07, + "loss": 0.1745, + "step": 9390 + }, + { + "epoch": 2.4989356040447044, + "grad_norm": 0.31419551372528076, + "learning_rate": 1.0167188034815901e-07, + "loss": 0.1805, + "step": 9391 + }, + { + "epoch": 2.4992017030335285, + "grad_norm": 0.26656702160835266, + "learning_rate": 1.0165499421776051e-07, + "loss": 0.1826, + "step": 9392 + }, + { + "epoch": 2.499467802022352, + "grad_norm": 0.36809781193733215, + "learning_rate": 1.016381080401582e-07, + "loss": 0.1895, + "step": 9393 + }, + { + "epoch": 2.4997339010111763, + "grad_norm": 0.35699835419654846, + "learning_rate": 1.0162122181583371e-07, + "loss": 0.1637, + "step": 9394 + }, + { + "epoch": 2.5, + "grad_norm": 0.3596678078174591, + "learning_rate": 1.0160433554526866e-07, + "loss": 0.2071, + "step": 9395 + }, + { + "epoch": 2.5002660989888237, + "grad_norm": 0.2621347904205322, + "learning_rate": 1.0158744922894473e-07, + "loss": 0.1674, + "step": 9396 + }, + { + "epoch": 2.500532197977648, + "grad_norm": 0.3937617242336273, + "learning_rate": 1.0157056286734352e-07, + "loss": 0.1971, + "step": 9397 + }, + { + "epoch": 2.5007982969664715, + "grad_norm": 0.5007838606834412, + "learning_rate": 1.0155367646094663e-07, + "loss": 0.1819, + "step": 9398 + }, + { + "epoch": 2.5010643959552956, + "grad_norm": 0.28133460879325867, + "learning_rate": 1.0153679001023578e-07, + "loss": 0.1776, + "step": 9399 + }, + { + "epoch": 2.5013304949441193, + "grad_norm": 0.304849237203598, + "learning_rate": 1.0151990351569254e-07, + "loss": 0.1675, + "step": 9400 + }, + { + "epoch": 2.501596593932943, + "grad_norm": 0.2645319998264313, + "learning_rate": 1.0150301697779857e-07, + "loss": 0.1644, + "step": 9401 + }, + { + "epoch": 2.501862692921767, + "grad_norm": 0.276554137468338, + "learning_rate": 1.0148613039703547e-07, + "loss": 0.1888, + "step": 9402 + }, + { + "epoch": 2.502128791910591, + "grad_norm": 0.27379709482192993, + "learning_rate": 1.0146924377388493e-07, + "loss": 0.1861, + "step": 9403 + }, + { + "epoch": 2.5023948908994145, + "grad_norm": 0.34909650683403015, + "learning_rate": 1.0145235710882858e-07, + "loss": 0.1914, + "step": 9404 + }, + { + "epoch": 2.5026609898882386, + "grad_norm": 0.3024834394454956, + "learning_rate": 1.0143547040234803e-07, + "loss": 0.2003, + "step": 9405 + }, + { + "epoch": 2.5029270888770623, + "grad_norm": 0.36381080746650696, + "learning_rate": 1.0141858365492497e-07, + "loss": 0.1786, + "step": 9406 + }, + { + "epoch": 2.503193187865886, + "grad_norm": 0.3240937292575836, + "learning_rate": 1.0140169686704101e-07, + "loss": 0.203, + "step": 9407 + }, + { + "epoch": 2.50345928685471, + "grad_norm": 0.2623179852962494, + "learning_rate": 1.013848100391778e-07, + "loss": 0.181, + "step": 9408 + }, + { + "epoch": 2.5037253858435338, + "grad_norm": 0.25759485363960266, + "learning_rate": 1.0136792317181699e-07, + "loss": 0.1574, + "step": 9409 + }, + { + "epoch": 2.5039914848323575, + "grad_norm": 0.27755168080329895, + "learning_rate": 1.0135103626544026e-07, + "loss": 0.168, + "step": 9410 + }, + { + "epoch": 2.5042575838211816, + "grad_norm": 0.2864038646221161, + "learning_rate": 1.0133414932052922e-07, + "loss": 0.1838, + "step": 9411 + }, + { + "epoch": 2.5045236828100053, + "grad_norm": 0.27256375551223755, + "learning_rate": 1.0131726233756553e-07, + "loss": 0.1878, + "step": 9412 + }, + { + "epoch": 2.504789781798829, + "grad_norm": 0.3024934232234955, + "learning_rate": 1.0130037531703081e-07, + "loss": 0.1785, + "step": 9413 + }, + { + "epoch": 2.505055880787653, + "grad_norm": 0.2781462073326111, + "learning_rate": 1.012834882594068e-07, + "loss": 0.175, + "step": 9414 + }, + { + "epoch": 2.5053219797764767, + "grad_norm": 0.33074694871902466, + "learning_rate": 1.0126660116517508e-07, + "loss": 0.1831, + "step": 9415 + }, + { + "epoch": 2.5055880787653004, + "grad_norm": 0.2930815517902374, + "learning_rate": 1.0124971403481728e-07, + "loss": 0.1777, + "step": 9416 + }, + { + "epoch": 2.5058541777541246, + "grad_norm": 0.33668163418769836, + "learning_rate": 1.0123282686881514e-07, + "loss": 0.1843, + "step": 9417 + }, + { + "epoch": 2.5061202767429482, + "grad_norm": 0.2635713815689087, + "learning_rate": 1.0121593966765026e-07, + "loss": 0.1815, + "step": 9418 + }, + { + "epoch": 2.5063863757317724, + "grad_norm": 0.2893019914627075, + "learning_rate": 1.0119905243180432e-07, + "loss": 0.1895, + "step": 9419 + }, + { + "epoch": 2.506652474720596, + "grad_norm": 0.3263629972934723, + "learning_rate": 1.0118216516175893e-07, + "loss": 0.1594, + "step": 9420 + }, + { + "epoch": 2.50691857370942, + "grad_norm": 0.32655391097068787, + "learning_rate": 1.0116527785799582e-07, + "loss": 0.1999, + "step": 9421 + }, + { + "epoch": 2.507184672698244, + "grad_norm": 0.28042858839035034, + "learning_rate": 1.0114839052099662e-07, + "loss": 0.2051, + "step": 9422 + }, + { + "epoch": 2.5074507716870675, + "grad_norm": 0.3698614537715912, + "learning_rate": 1.0113150315124297e-07, + "loss": 0.1811, + "step": 9423 + }, + { + "epoch": 2.5077168706758917, + "grad_norm": 0.39789533615112305, + "learning_rate": 1.0111461574921656e-07, + "loss": 0.1838, + "step": 9424 + }, + { + "epoch": 2.5079829696647153, + "grad_norm": 0.3378596901893616, + "learning_rate": 1.0109772831539905e-07, + "loss": 0.1755, + "step": 9425 + }, + { + "epoch": 2.508249068653539, + "grad_norm": 0.309006929397583, + "learning_rate": 1.0108084085027209e-07, + "loss": 0.1772, + "step": 9426 + }, + { + "epoch": 2.508515167642363, + "grad_norm": 0.3761221170425415, + "learning_rate": 1.0106395335431732e-07, + "loss": 0.1842, + "step": 9427 + }, + { + "epoch": 2.508781266631187, + "grad_norm": 0.2934996783733368, + "learning_rate": 1.0104706582801648e-07, + "loss": 0.1774, + "step": 9428 + }, + { + "epoch": 2.5090473656200105, + "grad_norm": 0.2644020915031433, + "learning_rate": 1.0103017827185118e-07, + "loss": 0.1668, + "step": 9429 + }, + { + "epoch": 2.5093134646088346, + "grad_norm": 0.26549822092056274, + "learning_rate": 1.0101329068630314e-07, + "loss": 0.1846, + "step": 9430 + }, + { + "epoch": 2.5095795635976583, + "grad_norm": 0.3482777178287506, + "learning_rate": 1.0099640307185393e-07, + "loss": 0.1779, + "step": 9431 + }, + { + "epoch": 2.509845662586482, + "grad_norm": 0.2761394679546356, + "learning_rate": 1.0097951542898531e-07, + "loss": 0.1757, + "step": 9432 + }, + { + "epoch": 2.510111761575306, + "grad_norm": 0.3381286859512329, + "learning_rate": 1.0096262775817894e-07, + "loss": 0.2021, + "step": 9433 + }, + { + "epoch": 2.51037786056413, + "grad_norm": 0.2596293091773987, + "learning_rate": 1.0094574005991643e-07, + "loss": 0.175, + "step": 9434 + }, + { + "epoch": 2.5106439595529535, + "grad_norm": 0.27266496419906616, + "learning_rate": 1.0092885233467952e-07, + "loss": 0.1825, + "step": 9435 + }, + { + "epoch": 2.5109100585417776, + "grad_norm": 0.27479055523872375, + "learning_rate": 1.0091196458294983e-07, + "loss": 0.175, + "step": 9436 + }, + { + "epoch": 2.5111761575306013, + "grad_norm": 0.31958964467048645, + "learning_rate": 1.008950768052091e-07, + "loss": 0.1932, + "step": 9437 + }, + { + "epoch": 2.511442256519425, + "grad_norm": 0.2699701189994812, + "learning_rate": 1.0087818900193892e-07, + "loss": 0.1891, + "step": 9438 + }, + { + "epoch": 2.511708355508249, + "grad_norm": 0.3619357645511627, + "learning_rate": 1.0086130117362101e-07, + "loss": 0.2006, + "step": 9439 + }, + { + "epoch": 2.511974454497073, + "grad_norm": 0.2874504029750824, + "learning_rate": 1.0084441332073705e-07, + "loss": 0.1854, + "step": 9440 + }, + { + "epoch": 2.5122405534858965, + "grad_norm": 0.2922062873840332, + "learning_rate": 1.0082752544376869e-07, + "loss": 0.1801, + "step": 9441 + }, + { + "epoch": 2.5125066524747206, + "grad_norm": 0.41597673296928406, + "learning_rate": 1.0081063754319766e-07, + "loss": 0.1669, + "step": 9442 + }, + { + "epoch": 2.5127727514635443, + "grad_norm": 0.2943788766860962, + "learning_rate": 1.0079374961950558e-07, + "loss": 0.192, + "step": 9443 + }, + { + "epoch": 2.5130388504523684, + "grad_norm": 0.280111163854599, + "learning_rate": 1.0077686167317414e-07, + "loss": 0.171, + "step": 9444 + }, + { + "epoch": 2.513304949441192, + "grad_norm": 0.27262642979621887, + "learning_rate": 1.00759973704685e-07, + "loss": 0.1693, + "step": 9445 + }, + { + "epoch": 2.513571048430016, + "grad_norm": 0.3183812201023102, + "learning_rate": 1.007430857145199e-07, + "loss": 0.1824, + "step": 9446 + }, + { + "epoch": 2.51383714741884, + "grad_norm": 0.5089240670204163, + "learning_rate": 1.0072619770316048e-07, + "loss": 0.1997, + "step": 9447 + }, + { + "epoch": 2.5141032464076636, + "grad_norm": 0.2816318869590759, + "learning_rate": 1.0070930967108844e-07, + "loss": 0.1774, + "step": 9448 + }, + { + "epoch": 2.5143693453964877, + "grad_norm": 0.2714790999889374, + "learning_rate": 1.0069242161878542e-07, + "loss": 0.1769, + "step": 9449 + }, + { + "epoch": 2.5146354443853114, + "grad_norm": 0.27681872248649597, + "learning_rate": 1.0067553354673315e-07, + "loss": 0.1866, + "step": 9450 + }, + { + "epoch": 2.514901543374135, + "grad_norm": 0.3030920624732971, + "learning_rate": 1.006586454554133e-07, + "loss": 0.1734, + "step": 9451 + }, + { + "epoch": 2.515167642362959, + "grad_norm": 0.3297004699707031, + "learning_rate": 1.0064175734530751e-07, + "loss": 0.1683, + "step": 9452 + }, + { + "epoch": 2.515433741351783, + "grad_norm": 0.29252344369888306, + "learning_rate": 1.0062486921689751e-07, + "loss": 0.1895, + "step": 9453 + }, + { + "epoch": 2.5156998403406066, + "grad_norm": 0.3448861539363861, + "learning_rate": 1.0060798107066497e-07, + "loss": 0.1933, + "step": 9454 + }, + { + "epoch": 2.5159659393294307, + "grad_norm": 0.46903082728385925, + "learning_rate": 1.005910929070916e-07, + "loss": 0.1785, + "step": 9455 + }, + { + "epoch": 2.5162320383182544, + "grad_norm": 0.2642805278301239, + "learning_rate": 1.00574204726659e-07, + "loss": 0.1731, + "step": 9456 + }, + { + "epoch": 2.516498137307078, + "grad_norm": 0.30152615904808044, + "learning_rate": 1.0055731652984898e-07, + "loss": 0.1819, + "step": 9457 + }, + { + "epoch": 2.516764236295902, + "grad_norm": 0.39055299758911133, + "learning_rate": 1.0054042831714313e-07, + "loss": 0.2044, + "step": 9458 + }, + { + "epoch": 2.517030335284726, + "grad_norm": 0.28765708208084106, + "learning_rate": 1.0052354008902316e-07, + "loss": 0.1863, + "step": 9459 + }, + { + "epoch": 2.5172964342735495, + "grad_norm": 0.2752886116504669, + "learning_rate": 1.0050665184597076e-07, + "loss": 0.1818, + "step": 9460 + }, + { + "epoch": 2.5175625332623737, + "grad_norm": 0.2709824740886688, + "learning_rate": 1.0048976358846764e-07, + "loss": 0.1733, + "step": 9461 + }, + { + "epoch": 2.5178286322511974, + "grad_norm": 0.334780752658844, + "learning_rate": 1.0047287531699546e-07, + "loss": 0.1912, + "step": 9462 + }, + { + "epoch": 2.518094731240021, + "grad_norm": 0.271957129240036, + "learning_rate": 1.004559870320359e-07, + "loss": 0.1753, + "step": 9463 + }, + { + "epoch": 2.518360830228845, + "grad_norm": 0.2737664580345154, + "learning_rate": 1.0043909873407066e-07, + "loss": 0.1747, + "step": 9464 + }, + { + "epoch": 2.518626929217669, + "grad_norm": 0.37890031933784485, + "learning_rate": 1.0042221042358145e-07, + "loss": 0.1718, + "step": 9465 + }, + { + "epoch": 2.518893028206493, + "grad_norm": 0.27979812026023865, + "learning_rate": 1.0040532210104995e-07, + "loss": 0.1852, + "step": 9466 + }, + { + "epoch": 2.5191591271953166, + "grad_norm": 0.41627946496009827, + "learning_rate": 1.0038843376695781e-07, + "loss": 0.1782, + "step": 9467 + }, + { + "epoch": 2.5194252261841408, + "grad_norm": 0.28743353486061096, + "learning_rate": 1.0037154542178676e-07, + "loss": 0.1909, + "step": 9468 + }, + { + "epoch": 2.5196913251729645, + "grad_norm": 0.276615709066391, + "learning_rate": 1.0035465706601852e-07, + "loss": 0.186, + "step": 9469 + }, + { + "epoch": 2.519957424161788, + "grad_norm": 0.2424396276473999, + "learning_rate": 1.0033776870013471e-07, + "loss": 0.1615, + "step": 9470 + }, + { + "epoch": 2.5202235231506123, + "grad_norm": 0.36039265990257263, + "learning_rate": 1.0032088032461703e-07, + "loss": 0.1954, + "step": 9471 + }, + { + "epoch": 2.520489622139436, + "grad_norm": 0.27022379636764526, + "learning_rate": 1.0030399193994721e-07, + "loss": 0.1778, + "step": 9472 + }, + { + "epoch": 2.5207557211282596, + "grad_norm": 0.3775623142719269, + "learning_rate": 1.0028710354660692e-07, + "loss": 0.1829, + "step": 9473 + }, + { + "epoch": 2.5210218201170838, + "grad_norm": 0.45802927017211914, + "learning_rate": 1.0027021514507783e-07, + "loss": 0.1891, + "step": 9474 + }, + { + "epoch": 2.5212879191059074, + "grad_norm": 0.27586764097213745, + "learning_rate": 1.002533267358417e-07, + "loss": 0.1765, + "step": 9475 + }, + { + "epoch": 2.521554018094731, + "grad_norm": 0.35065656900405884, + "learning_rate": 1.0023643831938015e-07, + "loss": 0.1734, + "step": 9476 + }, + { + "epoch": 2.5218201170835552, + "grad_norm": 0.2876172661781311, + "learning_rate": 1.0021954989617491e-07, + "loss": 0.1943, + "step": 9477 + }, + { + "epoch": 2.522086216072379, + "grad_norm": 0.309322327375412, + "learning_rate": 1.0020266146670764e-07, + "loss": 0.1599, + "step": 9478 + }, + { + "epoch": 2.5223523150612026, + "grad_norm": 0.293997585773468, + "learning_rate": 1.0018577303146008e-07, + "loss": 0.1816, + "step": 9479 + }, + { + "epoch": 2.5226184140500267, + "grad_norm": 0.3410063087940216, + "learning_rate": 1.0016888459091388e-07, + "loss": 0.1771, + "step": 9480 + }, + { + "epoch": 2.5228845130388504, + "grad_norm": 0.2656644880771637, + "learning_rate": 1.0015199614555075e-07, + "loss": 0.1578, + "step": 9481 + }, + { + "epoch": 2.523150612027674, + "grad_norm": 0.2915549874305725, + "learning_rate": 1.0013510769585236e-07, + "loss": 0.1987, + "step": 9482 + }, + { + "epoch": 2.5234167110164982, + "grad_norm": 0.2692394554615021, + "learning_rate": 1.0011821924230046e-07, + "loss": 0.1646, + "step": 9483 + }, + { + "epoch": 2.523682810005322, + "grad_norm": 0.2661122977733612, + "learning_rate": 1.001013307853767e-07, + "loss": 0.1752, + "step": 9484 + }, + { + "epoch": 2.5239489089941456, + "grad_norm": 0.28508642315864563, + "learning_rate": 1.0008444232556275e-07, + "loss": 0.1916, + "step": 9485 + }, + { + "epoch": 2.5242150079829697, + "grad_norm": 0.3907392621040344, + "learning_rate": 1.0006755386334036e-07, + "loss": 0.1998, + "step": 9486 + }, + { + "epoch": 2.5244811069717934, + "grad_norm": 0.27236008644104004, + "learning_rate": 1.000506653991912e-07, + "loss": 0.1729, + "step": 9487 + }, + { + "epoch": 2.524747205960617, + "grad_norm": 0.25948426127433777, + "learning_rate": 1.0003377693359698e-07, + "loss": 0.1796, + "step": 9488 + }, + { + "epoch": 2.525013304949441, + "grad_norm": 0.3987836539745331, + "learning_rate": 1.0001688846703933e-07, + "loss": 0.1844, + "step": 9489 + }, + { + "epoch": 2.525279403938265, + "grad_norm": 0.27093663811683655, + "learning_rate": 1e-07, + "loss": 0.165, + "step": 9490 + }, + { + "epoch": 2.525545502927089, + "grad_norm": 0.2558588683605194, + "learning_rate": 9.998311153296066e-08, + "loss": 0.167, + "step": 9491 + }, + { + "epoch": 2.5258116019159127, + "grad_norm": 0.27784502506256104, + "learning_rate": 9.996622306640303e-08, + "loss": 0.1796, + "step": 9492 + }, + { + "epoch": 2.526077700904737, + "grad_norm": 0.25713494420051575, + "learning_rate": 9.994933460080877e-08, + "loss": 0.1776, + "step": 9493 + }, + { + "epoch": 2.5263437998935605, + "grad_norm": 0.33622774481773376, + "learning_rate": 9.993244613665963e-08, + "loss": 0.1706, + "step": 9494 + }, + { + "epoch": 2.526609898882384, + "grad_norm": 0.2914058566093445, + "learning_rate": 9.991555767443723e-08, + "loss": 0.1789, + "step": 9495 + }, + { + "epoch": 2.5268759978712083, + "grad_norm": 0.29935789108276367, + "learning_rate": 9.989866921462332e-08, + "loss": 0.1817, + "step": 9496 + }, + { + "epoch": 2.527142096860032, + "grad_norm": 0.38617414236068726, + "learning_rate": 9.988178075769952e-08, + "loss": 0.1961, + "step": 9497 + }, + { + "epoch": 2.5274081958488557, + "grad_norm": 0.4359264373779297, + "learning_rate": 9.986489230414765e-08, + "loss": 0.2045, + "step": 9498 + }, + { + "epoch": 2.52767429483768, + "grad_norm": 0.37070462107658386, + "learning_rate": 9.984800385444926e-08, + "loss": 0.1861, + "step": 9499 + }, + { + "epoch": 2.5279403938265035, + "grad_norm": 0.3068004548549652, + "learning_rate": 9.983111540908613e-08, + "loss": 0.1654, + "step": 9500 + }, + { + "epoch": 2.528206492815327, + "grad_norm": 0.2679344117641449, + "learning_rate": 9.981422696853991e-08, + "loss": 0.1811, + "step": 9501 + }, + { + "epoch": 2.5284725918041513, + "grad_norm": 0.27577462792396545, + "learning_rate": 9.979733853329238e-08, + "loss": 0.1746, + "step": 9502 + }, + { + "epoch": 2.528738690792975, + "grad_norm": 0.48755311965942383, + "learning_rate": 9.978045010382509e-08, + "loss": 0.2213, + "step": 9503 + }, + { + "epoch": 2.5290047897817987, + "grad_norm": 0.2748046815395355, + "learning_rate": 9.976356168061987e-08, + "loss": 0.1807, + "step": 9504 + }, + { + "epoch": 2.529270888770623, + "grad_norm": 0.38188692927360535, + "learning_rate": 9.974667326415831e-08, + "loss": 0.186, + "step": 9505 + }, + { + "epoch": 2.5295369877594465, + "grad_norm": 0.26728665828704834, + "learning_rate": 9.972978485492216e-08, + "loss": 0.1823, + "step": 9506 + }, + { + "epoch": 2.52980308674827, + "grad_norm": 0.31346336007118225, + "learning_rate": 9.971289645339308e-08, + "loss": 0.1709, + "step": 9507 + }, + { + "epoch": 2.5300691857370943, + "grad_norm": 0.37465962767601013, + "learning_rate": 9.969600806005282e-08, + "loss": 0.1843, + "step": 9508 + }, + { + "epoch": 2.530335284725918, + "grad_norm": 0.257222443819046, + "learning_rate": 9.967911967538299e-08, + "loss": 0.1715, + "step": 9509 + }, + { + "epoch": 2.5306013837147416, + "grad_norm": 0.2720790207386017, + "learning_rate": 9.966223129986531e-08, + "loss": 0.1933, + "step": 9510 + }, + { + "epoch": 2.5308674827035658, + "grad_norm": 0.269132524728775, + "learning_rate": 9.964534293398149e-08, + "loss": 0.1884, + "step": 9511 + }, + { + "epoch": 2.5311335816923894, + "grad_norm": 0.27595555782318115, + "learning_rate": 9.96284545782132e-08, + "loss": 0.1781, + "step": 9512 + }, + { + "epoch": 2.5313996806812136, + "grad_norm": 0.2960522472858429, + "learning_rate": 9.961156623304218e-08, + "loss": 0.1815, + "step": 9513 + }, + { + "epoch": 2.5316657796700373, + "grad_norm": 0.6024343371391296, + "learning_rate": 9.959467789895005e-08, + "loss": 0.2124, + "step": 9514 + }, + { + "epoch": 2.531931878658861, + "grad_norm": 0.3494708836078644, + "learning_rate": 9.957778957641854e-08, + "loss": 0.2046, + "step": 9515 + }, + { + "epoch": 2.532197977647685, + "grad_norm": 0.5546698570251465, + "learning_rate": 9.956090126592931e-08, + "loss": 0.2122, + "step": 9516 + }, + { + "epoch": 2.5324640766365087, + "grad_norm": 0.28195780515670776, + "learning_rate": 9.954401296796409e-08, + "loss": 0.183, + "step": 9517 + }, + { + "epoch": 2.532730175625333, + "grad_norm": 0.4640962779521942, + "learning_rate": 9.952712468300454e-08, + "loss": 0.2034, + "step": 9518 + }, + { + "epoch": 2.5329962746141566, + "grad_norm": 0.3176080882549286, + "learning_rate": 9.951023641153239e-08, + "loss": 0.1873, + "step": 9519 + }, + { + "epoch": 2.5332623736029802, + "grad_norm": 0.25962144136428833, + "learning_rate": 9.949334815402923e-08, + "loss": 0.162, + "step": 9520 + }, + { + "epoch": 2.5335284725918044, + "grad_norm": 0.27766814827919006, + "learning_rate": 9.947645991097685e-08, + "loss": 0.1766, + "step": 9521 + }, + { + "epoch": 2.533794571580628, + "grad_norm": 0.33843523263931274, + "learning_rate": 9.945957168285688e-08, + "loss": 0.1776, + "step": 9522 + }, + { + "epoch": 2.5340606705694517, + "grad_norm": 0.28568026423454285, + "learning_rate": 9.944268347015105e-08, + "loss": 0.1939, + "step": 9523 + }, + { + "epoch": 2.534326769558276, + "grad_norm": 0.34453415870666504, + "learning_rate": 9.942579527334099e-08, + "loss": 0.2114, + "step": 9524 + }, + { + "epoch": 2.5345928685470995, + "grad_norm": 0.27384987473487854, + "learning_rate": 9.940890709290843e-08, + "loss": 0.1872, + "step": 9525 + }, + { + "epoch": 2.534858967535923, + "grad_norm": 0.3281422555446625, + "learning_rate": 9.939201892933502e-08, + "loss": 0.1821, + "step": 9526 + }, + { + "epoch": 2.5351250665247473, + "grad_norm": 0.3518308699131012, + "learning_rate": 9.937513078310247e-08, + "loss": 0.1957, + "step": 9527 + }, + { + "epoch": 2.535391165513571, + "grad_norm": 0.3120421767234802, + "learning_rate": 9.93582426546925e-08, + "loss": 0.1663, + "step": 9528 + }, + { + "epoch": 2.5356572645023947, + "grad_norm": 0.34653183817863464, + "learning_rate": 9.93413545445867e-08, + "loss": 0.1825, + "step": 9529 + }, + { + "epoch": 2.535923363491219, + "grad_norm": 0.30686140060424805, + "learning_rate": 9.932446645326686e-08, + "loss": 0.1767, + "step": 9530 + }, + { + "epoch": 2.5361894624800425, + "grad_norm": 0.356881707906723, + "learning_rate": 9.930757838121456e-08, + "loss": 0.1922, + "step": 9531 + }, + { + "epoch": 2.536455561468866, + "grad_norm": 0.3450409173965454, + "learning_rate": 9.929069032891156e-08, + "loss": 0.1812, + "step": 9532 + }, + { + "epoch": 2.5367216604576903, + "grad_norm": 0.3096384108066559, + "learning_rate": 9.92738022968395e-08, + "loss": 0.1858, + "step": 9533 + }, + { + "epoch": 2.536987759446514, + "grad_norm": 0.2937762439250946, + "learning_rate": 9.925691428548011e-08, + "loss": 0.2002, + "step": 9534 + }, + { + "epoch": 2.5372538584353377, + "grad_norm": 0.3497789800167084, + "learning_rate": 9.924002629531499e-08, + "loss": 0.1839, + "step": 9535 + }, + { + "epoch": 2.537519957424162, + "grad_norm": 0.27404722571372986, + "learning_rate": 9.922313832682589e-08, + "loss": 0.1859, + "step": 9536 + }, + { + "epoch": 2.5377860564129855, + "grad_norm": 0.38676080107688904, + "learning_rate": 9.920625038049443e-08, + "loss": 0.2313, + "step": 9537 + }, + { + "epoch": 2.5380521554018096, + "grad_norm": 0.390815407037735, + "learning_rate": 9.918936245680237e-08, + "loss": 0.1977, + "step": 9538 + }, + { + "epoch": 2.5383182543906333, + "grad_norm": 0.38408055901527405, + "learning_rate": 9.917247455623131e-08, + "loss": 0.1594, + "step": 9539 + }, + { + "epoch": 2.5385843533794574, + "grad_norm": 0.2704416513442993, + "learning_rate": 9.915558667926297e-08, + "loss": 0.1768, + "step": 9540 + }, + { + "epoch": 2.538850452368281, + "grad_norm": 0.3072574734687805, + "learning_rate": 9.913869882637899e-08, + "loss": 0.178, + "step": 9541 + }, + { + "epoch": 2.539116551357105, + "grad_norm": 0.34905341267585754, + "learning_rate": 9.91218109980611e-08, + "loss": 0.1882, + "step": 9542 + }, + { + "epoch": 2.539382650345929, + "grad_norm": 0.2499687373638153, + "learning_rate": 9.910492319479091e-08, + "loss": 0.1749, + "step": 9543 + }, + { + "epoch": 2.5396487493347526, + "grad_norm": 0.3268558979034424, + "learning_rate": 9.908803541705017e-08, + "loss": 0.1836, + "step": 9544 + }, + { + "epoch": 2.5399148483235763, + "grad_norm": 0.32253149151802063, + "learning_rate": 9.907114766532048e-08, + "loss": 0.1881, + "step": 9545 + }, + { + "epoch": 2.5401809473124004, + "grad_norm": 0.27704060077667236, + "learning_rate": 9.905425994008355e-08, + "loss": 0.1762, + "step": 9546 + }, + { + "epoch": 2.540447046301224, + "grad_norm": 0.3333449959754944, + "learning_rate": 9.903737224182107e-08, + "loss": 0.1955, + "step": 9547 + }, + { + "epoch": 2.5407131452900478, + "grad_norm": 0.34749773144721985, + "learning_rate": 9.902048457101467e-08, + "loss": 0.1835, + "step": 9548 + }, + { + "epoch": 2.540979244278872, + "grad_norm": 0.2926553785800934, + "learning_rate": 9.900359692814607e-08, + "loss": 0.1942, + "step": 9549 + }, + { + "epoch": 2.5412453432676956, + "grad_norm": 0.3386526107788086, + "learning_rate": 9.898670931369688e-08, + "loss": 0.1896, + "step": 9550 + }, + { + "epoch": 2.5415114422565193, + "grad_norm": 0.27444058656692505, + "learning_rate": 9.896982172814882e-08, + "loss": 0.1739, + "step": 9551 + }, + { + "epoch": 2.5417775412453434, + "grad_norm": 0.29062333703041077, + "learning_rate": 9.89529341719835e-08, + "loss": 0.1844, + "step": 9552 + }, + { + "epoch": 2.542043640234167, + "grad_norm": 0.26009708642959595, + "learning_rate": 9.893604664568268e-08, + "loss": 0.1792, + "step": 9553 + }, + { + "epoch": 2.5423097392229907, + "grad_norm": 0.33467158675193787, + "learning_rate": 9.89191591497279e-08, + "loss": 0.1794, + "step": 9554 + }, + { + "epoch": 2.542575838211815, + "grad_norm": 0.32778528332710266, + "learning_rate": 9.890227168460098e-08, + "loss": 0.1888, + "step": 9555 + }, + { + "epoch": 2.5428419372006386, + "grad_norm": 0.309946209192276, + "learning_rate": 9.888538425078345e-08, + "loss": 0.1903, + "step": 9556 + }, + { + "epoch": 2.5431080361894622, + "grad_norm": 0.3057613670825958, + "learning_rate": 9.886849684875704e-08, + "loss": 0.1857, + "step": 9557 + }, + { + "epoch": 2.5433741351782864, + "grad_norm": 0.2721988260746002, + "learning_rate": 9.885160947900338e-08, + "loss": 0.1932, + "step": 9558 + }, + { + "epoch": 2.54364023416711, + "grad_norm": 0.27205219864845276, + "learning_rate": 9.883472214200421e-08, + "loss": 0.1699, + "step": 9559 + }, + { + "epoch": 2.5439063331559337, + "grad_norm": 0.24950532615184784, + "learning_rate": 9.881783483824107e-08, + "loss": 0.1644, + "step": 9560 + }, + { + "epoch": 2.544172432144758, + "grad_norm": 0.37418267130851746, + "learning_rate": 9.880094756819572e-08, + "loss": 0.1855, + "step": 9561 + }, + { + "epoch": 2.5444385311335815, + "grad_norm": 0.2670641243457794, + "learning_rate": 9.878406033234975e-08, + "loss": 0.1651, + "step": 9562 + }, + { + "epoch": 2.5447046301224057, + "grad_norm": 0.30991020798683167, + "learning_rate": 9.87671731311849e-08, + "loss": 0.1843, + "step": 9563 + }, + { + "epoch": 2.5449707291112293, + "grad_norm": 0.35818609595298767, + "learning_rate": 9.875028596518272e-08, + "loss": 0.178, + "step": 9564 + }, + { + "epoch": 2.5452368281000535, + "grad_norm": 0.36016643047332764, + "learning_rate": 9.873339883482493e-08, + "loss": 0.1987, + "step": 9565 + }, + { + "epoch": 2.545502927088877, + "grad_norm": 0.26636430621147156, + "learning_rate": 9.871651174059321e-08, + "loss": 0.178, + "step": 9566 + }, + { + "epoch": 2.545769026077701, + "grad_norm": 0.38979944586753845, + "learning_rate": 9.869962468296915e-08, + "loss": 0.1892, + "step": 9567 + }, + { + "epoch": 2.546035125066525, + "grad_norm": 0.32912907004356384, + "learning_rate": 9.868273766243446e-08, + "loss": 0.1676, + "step": 9568 + }, + { + "epoch": 2.5463012240553486, + "grad_norm": 0.3542948365211487, + "learning_rate": 9.866585067947076e-08, + "loss": 0.1821, + "step": 9569 + }, + { + "epoch": 2.5465673230441723, + "grad_norm": 0.32739681005477905, + "learning_rate": 9.864896373455974e-08, + "loss": 0.1797, + "step": 9570 + }, + { + "epoch": 2.5468334220329965, + "grad_norm": 1.0314937829971313, + "learning_rate": 9.863207682818299e-08, + "loss": 0.1852, + "step": 9571 + }, + { + "epoch": 2.54709952102182, + "grad_norm": 0.2686983346939087, + "learning_rate": 9.86151899608222e-08, + "loss": 0.1704, + "step": 9572 + }, + { + "epoch": 2.547365620010644, + "grad_norm": 0.2795335650444031, + "learning_rate": 9.859830313295898e-08, + "loss": 0.1709, + "step": 9573 + }, + { + "epoch": 2.547631718999468, + "grad_norm": 0.2701765298843384, + "learning_rate": 9.858141634507506e-08, + "loss": 0.1856, + "step": 9574 + }, + { + "epoch": 2.5478978179882916, + "grad_norm": 0.3245029151439667, + "learning_rate": 9.856452959765196e-08, + "loss": 0.187, + "step": 9575 + }, + { + "epoch": 2.5481639169771153, + "grad_norm": 0.28797677159309387, + "learning_rate": 9.854764289117144e-08, + "loss": 0.1717, + "step": 9576 + }, + { + "epoch": 2.5484300159659394, + "grad_norm": 0.39030584692955017, + "learning_rate": 9.853075622611507e-08, + "loss": 0.2007, + "step": 9577 + }, + { + "epoch": 2.548696114954763, + "grad_norm": 0.2619514465332031, + "learning_rate": 9.851386960296457e-08, + "loss": 0.182, + "step": 9578 + }, + { + "epoch": 2.548962213943587, + "grad_norm": 0.280224472284317, + "learning_rate": 9.849698302220144e-08, + "loss": 0.1839, + "step": 9579 + }, + { + "epoch": 2.549228312932411, + "grad_norm": 0.27022045850753784, + "learning_rate": 9.848009648430748e-08, + "loss": 0.1862, + "step": 9580 + }, + { + "epoch": 2.5494944119212346, + "grad_norm": 0.37192755937576294, + "learning_rate": 9.846320998976422e-08, + "loss": 0.1961, + "step": 9581 + }, + { + "epoch": 2.5497605109100583, + "grad_norm": 0.35297420620918274, + "learning_rate": 9.844632353905333e-08, + "loss": 0.1878, + "step": 9582 + }, + { + "epoch": 2.5500266098988824, + "grad_norm": 0.28732725977897644, + "learning_rate": 9.842943713265647e-08, + "loss": 0.1992, + "step": 9583 + }, + { + "epoch": 2.550292708887706, + "grad_norm": 0.42785710096359253, + "learning_rate": 9.841255077105525e-08, + "loss": 0.2171, + "step": 9584 + }, + { + "epoch": 2.55055880787653, + "grad_norm": 0.2830508351325989, + "learning_rate": 9.839566445473135e-08, + "loss": 0.1722, + "step": 9585 + }, + { + "epoch": 2.550824906865354, + "grad_norm": 0.25080832839012146, + "learning_rate": 9.837877818416628e-08, + "loss": 0.1597, + "step": 9586 + }, + { + "epoch": 2.551091005854178, + "grad_norm": 0.2872253358364105, + "learning_rate": 9.836189195984182e-08, + "loss": 0.1743, + "step": 9587 + }, + { + "epoch": 2.5513571048430017, + "grad_norm": 0.28647908568382263, + "learning_rate": 9.834500578223948e-08, + "loss": 0.1859, + "step": 9588 + }, + { + "epoch": 2.5516232038318254, + "grad_norm": 0.25062328577041626, + "learning_rate": 9.832811965184098e-08, + "loss": 0.1692, + "step": 9589 + }, + { + "epoch": 2.5518893028206495, + "grad_norm": 0.2787856459617615, + "learning_rate": 9.831123356912787e-08, + "loss": 0.199, + "step": 9590 + }, + { + "epoch": 2.552155401809473, + "grad_norm": 0.2727806866168976, + "learning_rate": 9.829434753458184e-08, + "loss": 0.1924, + "step": 9591 + }, + { + "epoch": 2.552421500798297, + "grad_norm": 0.3361055552959442, + "learning_rate": 9.827746154868444e-08, + "loss": 0.1766, + "step": 9592 + }, + { + "epoch": 2.552687599787121, + "grad_norm": 0.2955795228481293, + "learning_rate": 9.826057561191737e-08, + "loss": 0.1894, + "step": 9593 + }, + { + "epoch": 2.5529536987759447, + "grad_norm": 0.2608002722263336, + "learning_rate": 9.82436897247622e-08, + "loss": 0.1749, + "step": 9594 + }, + { + "epoch": 2.5532197977647684, + "grad_norm": 0.30159705877304077, + "learning_rate": 9.822680388770061e-08, + "loss": 0.1999, + "step": 9595 + }, + { + "epoch": 2.5534858967535925, + "grad_norm": 0.484222948551178, + "learning_rate": 9.820991810121414e-08, + "loss": 0.206, + "step": 9596 + }, + { + "epoch": 2.553751995742416, + "grad_norm": 0.29209086298942566, + "learning_rate": 9.819303236578445e-08, + "loss": 0.2013, + "step": 9597 + }, + { + "epoch": 2.55401809473124, + "grad_norm": 0.37209340929985046, + "learning_rate": 9.817614668189315e-08, + "loss": 0.1981, + "step": 9598 + }, + { + "epoch": 2.554284193720064, + "grad_norm": 0.26489192247390747, + "learning_rate": 9.815926105002187e-08, + "loss": 0.1663, + "step": 9599 + }, + { + "epoch": 2.5545502927088877, + "grad_norm": 0.35285472869873047, + "learning_rate": 9.814237547065217e-08, + "loss": 0.1738, + "step": 9600 + }, + { + "epoch": 2.5548163916977114, + "grad_norm": 0.27752694487571716, + "learning_rate": 9.81254899442657e-08, + "loss": 0.1786, + "step": 9601 + }, + { + "epoch": 2.5550824906865355, + "grad_norm": 0.3854089677333832, + "learning_rate": 9.81086044713441e-08, + "loss": 0.2103, + "step": 9602 + }, + { + "epoch": 2.555348589675359, + "grad_norm": 0.3230680823326111, + "learning_rate": 9.80917190523689e-08, + "loss": 0.1925, + "step": 9603 + }, + { + "epoch": 2.555614688664183, + "grad_norm": 0.41135093569755554, + "learning_rate": 9.807483368782181e-08, + "loss": 0.1947, + "step": 9604 + }, + { + "epoch": 2.555880787653007, + "grad_norm": 0.45843350887298584, + "learning_rate": 9.80579483781843e-08, + "loss": 0.1914, + "step": 9605 + }, + { + "epoch": 2.5561468866418307, + "grad_norm": 0.2796018719673157, + "learning_rate": 9.804106312393811e-08, + "loss": 0.1743, + "step": 9606 + }, + { + "epoch": 2.5564129856306543, + "grad_norm": 0.3199927508831024, + "learning_rate": 9.802417792556476e-08, + "loss": 0.1677, + "step": 9607 + }, + { + "epoch": 2.5566790846194785, + "grad_norm": 0.28168973326683044, + "learning_rate": 9.800729278354587e-08, + "loss": 0.1905, + "step": 9608 + }, + { + "epoch": 2.556945183608302, + "grad_norm": 0.28853484988212585, + "learning_rate": 9.799040769836303e-08, + "loss": 0.1904, + "step": 9609 + }, + { + "epoch": 2.5572112825971263, + "grad_norm": 0.2794542908668518, + "learning_rate": 9.797352267049787e-08, + "loss": 0.1658, + "step": 9610 + }, + { + "epoch": 2.55747738158595, + "grad_norm": 0.4210289418697357, + "learning_rate": 9.795663770043194e-08, + "loss": 0.1771, + "step": 9611 + }, + { + "epoch": 2.557743480574774, + "grad_norm": 0.27274206280708313, + "learning_rate": 9.793975278864686e-08, + "loss": 0.169, + "step": 9612 + }, + { + "epoch": 2.5580095795635978, + "grad_norm": 0.4229726493358612, + "learning_rate": 9.792286793562422e-08, + "loss": 0.178, + "step": 9613 + }, + { + "epoch": 2.5582756785524214, + "grad_norm": 0.4171582758426666, + "learning_rate": 9.790598314184563e-08, + "loss": 0.2112, + "step": 9614 + }, + { + "epoch": 2.5585417775412456, + "grad_norm": 0.2553234398365021, + "learning_rate": 9.788909840779263e-08, + "loss": 0.1824, + "step": 9615 + }, + { + "epoch": 2.5588078765300692, + "grad_norm": 0.29155755043029785, + "learning_rate": 9.787221373394683e-08, + "loss": 0.1862, + "step": 9616 + }, + { + "epoch": 2.559073975518893, + "grad_norm": 0.4095528721809387, + "learning_rate": 9.785532912078981e-08, + "loss": 0.1739, + "step": 9617 + }, + { + "epoch": 2.559340074507717, + "grad_norm": 0.33003342151641846, + "learning_rate": 9.783844456880315e-08, + "loss": 0.1832, + "step": 9618 + }, + { + "epoch": 2.5596061734965407, + "grad_norm": 0.3293774127960205, + "learning_rate": 9.782156007846849e-08, + "loss": 0.1905, + "step": 9619 + }, + { + "epoch": 2.5598722724853644, + "grad_norm": 0.2629849314689636, + "learning_rate": 9.780467565026732e-08, + "loss": 0.1742, + "step": 9620 + }, + { + "epoch": 2.5601383714741885, + "grad_norm": 0.36851075291633606, + "learning_rate": 9.778779128468133e-08, + "loss": 0.1864, + "step": 9621 + }, + { + "epoch": 2.5604044704630122, + "grad_norm": 0.2640649378299713, + "learning_rate": 9.777090698219196e-08, + "loss": 0.1522, + "step": 9622 + }, + { + "epoch": 2.560670569451836, + "grad_norm": 0.29450827836990356, + "learning_rate": 9.775402274328088e-08, + "loss": 0.1846, + "step": 9623 + }, + { + "epoch": 2.56093666844066, + "grad_norm": 0.25782468914985657, + "learning_rate": 9.773713856842962e-08, + "loss": 0.1713, + "step": 9624 + }, + { + "epoch": 2.5612027674294837, + "grad_norm": 0.29816681146621704, + "learning_rate": 9.77202544581198e-08, + "loss": 0.1775, + "step": 9625 + }, + { + "epoch": 2.5614688664183074, + "grad_norm": 0.2633921504020691, + "learning_rate": 9.77033704128329e-08, + "loss": 0.1782, + "step": 9626 + }, + { + "epoch": 2.5617349654071315, + "grad_norm": 0.2919568717479706, + "learning_rate": 9.768648643305059e-08, + "loss": 0.1885, + "step": 9627 + }, + { + "epoch": 2.562001064395955, + "grad_norm": 0.3241487443447113, + "learning_rate": 9.766960251925436e-08, + "loss": 0.1954, + "step": 9628 + }, + { + "epoch": 2.562267163384779, + "grad_norm": 0.27768364548683167, + "learning_rate": 9.765271867192584e-08, + "loss": 0.1673, + "step": 9629 + }, + { + "epoch": 2.562533262373603, + "grad_norm": 0.27428698539733887, + "learning_rate": 9.76358348915465e-08, + "loss": 0.1729, + "step": 9630 + }, + { + "epoch": 2.5627993613624267, + "grad_norm": 0.29853811860084534, + "learning_rate": 9.761895117859801e-08, + "loss": 0.188, + "step": 9631 + }, + { + "epoch": 2.563065460351251, + "grad_norm": 0.3549630045890808, + "learning_rate": 9.760206753356182e-08, + "loss": 0.2, + "step": 9632 + }, + { + "epoch": 2.5633315593400745, + "grad_norm": 0.39441943168640137, + "learning_rate": 9.758518395691958e-08, + "loss": 0.1957, + "step": 9633 + }, + { + "epoch": 2.563597658328898, + "grad_norm": 0.3633538782596588, + "learning_rate": 9.756830044915275e-08, + "loss": 0.1772, + "step": 9634 + }, + { + "epoch": 2.5638637573177223, + "grad_norm": 0.3145713210105896, + "learning_rate": 9.7551417010743e-08, + "loss": 0.1866, + "step": 9635 + }, + { + "epoch": 2.564129856306546, + "grad_norm": 0.39881566166877747, + "learning_rate": 9.753453364217176e-08, + "loss": 0.185, + "step": 9636 + }, + { + "epoch": 2.56439595529537, + "grad_norm": 0.289692223072052, + "learning_rate": 9.751765034392062e-08, + "loss": 0.1933, + "step": 9637 + }, + { + "epoch": 2.564662054284194, + "grad_norm": 0.37874647974967957, + "learning_rate": 9.750076711647117e-08, + "loss": 0.1787, + "step": 9638 + }, + { + "epoch": 2.5649281532730175, + "grad_norm": 0.29155591130256653, + "learning_rate": 9.74838839603049e-08, + "loss": 0.1888, + "step": 9639 + }, + { + "epoch": 2.5651942522618416, + "grad_norm": 0.2831714451313019, + "learning_rate": 9.746700087590341e-08, + "loss": 0.1844, + "step": 9640 + }, + { + "epoch": 2.5654603512506653, + "grad_norm": 0.2822754979133606, + "learning_rate": 9.745011786374817e-08, + "loss": 0.1813, + "step": 9641 + }, + { + "epoch": 2.565726450239489, + "grad_norm": 0.3735508918762207, + "learning_rate": 9.743323492432076e-08, + "loss": 0.1844, + "step": 9642 + }, + { + "epoch": 2.565992549228313, + "grad_norm": 0.259752482175827, + "learning_rate": 9.741635205810268e-08, + "loss": 0.1743, + "step": 9643 + }, + { + "epoch": 2.566258648217137, + "grad_norm": 0.2648809254169464, + "learning_rate": 9.739946926557553e-08, + "loss": 0.1638, + "step": 9644 + }, + { + "epoch": 2.5665247472059605, + "grad_norm": 0.34075096249580383, + "learning_rate": 9.738258654722076e-08, + "loss": 0.1747, + "step": 9645 + }, + { + "epoch": 2.5667908461947846, + "grad_norm": 0.2595131993293762, + "learning_rate": 9.736570390352001e-08, + "loss": 0.1742, + "step": 9646 + }, + { + "epoch": 2.5670569451836083, + "grad_norm": 0.329607754945755, + "learning_rate": 9.734882133495467e-08, + "loss": 0.1811, + "step": 9647 + }, + { + "epoch": 2.567323044172432, + "grad_norm": 0.31119558215141296, + "learning_rate": 9.733193884200635e-08, + "loss": 0.1812, + "step": 9648 + }, + { + "epoch": 2.567589143161256, + "grad_norm": 0.26913779973983765, + "learning_rate": 9.731505642515657e-08, + "loss": 0.1887, + "step": 9649 + }, + { + "epoch": 2.5678552421500798, + "grad_norm": 0.3328515887260437, + "learning_rate": 9.729817408488685e-08, + "loss": 0.1768, + "step": 9650 + }, + { + "epoch": 2.5681213411389034, + "grad_norm": 0.24166499078273773, + "learning_rate": 9.728129182167865e-08, + "loss": 0.1688, + "step": 9651 + }, + { + "epoch": 2.5683874401277276, + "grad_norm": 0.39175090193748474, + "learning_rate": 9.726440963601357e-08, + "loss": 0.1991, + "step": 9652 + }, + { + "epoch": 2.5686535391165513, + "grad_norm": 0.2806987166404724, + "learning_rate": 9.724752752837305e-08, + "loss": 0.1654, + "step": 9653 + }, + { + "epoch": 2.568919638105375, + "grad_norm": 0.28938236832618713, + "learning_rate": 9.723064549923868e-08, + "loss": 0.1726, + "step": 9654 + }, + { + "epoch": 2.569185737094199, + "grad_norm": 0.27151593565940857, + "learning_rate": 9.72137635490919e-08, + "loss": 0.1726, + "step": 9655 + }, + { + "epoch": 2.5694518360830227, + "grad_norm": 0.3547748327255249, + "learning_rate": 9.719688167841422e-08, + "loss": 0.1848, + "step": 9656 + }, + { + "epoch": 2.569717935071847, + "grad_norm": 0.2879732847213745, + "learning_rate": 9.717999988768723e-08, + "loss": 0.1731, + "step": 9657 + }, + { + "epoch": 2.5699840340606706, + "grad_norm": 0.34447458386421204, + "learning_rate": 9.716311817739234e-08, + "loss": 0.196, + "step": 9658 + }, + { + "epoch": 2.5702501330494947, + "grad_norm": 0.4123215079307556, + "learning_rate": 9.714623654801108e-08, + "loss": 0.1897, + "step": 9659 + }, + { + "epoch": 2.5705162320383184, + "grad_norm": 0.32835856080055237, + "learning_rate": 9.712935500002496e-08, + "loss": 0.1905, + "step": 9660 + }, + { + "epoch": 2.570782331027142, + "grad_norm": 0.3419993221759796, + "learning_rate": 9.71124735339155e-08, + "loss": 0.1813, + "step": 9661 + }, + { + "epoch": 2.571048430015966, + "grad_norm": 0.23086319863796234, + "learning_rate": 9.709559215016412e-08, + "loss": 0.1522, + "step": 9662 + }, + { + "epoch": 2.57131452900479, + "grad_norm": 0.36193785071372986, + "learning_rate": 9.707871084925237e-08, + "loss": 0.1887, + "step": 9663 + }, + { + "epoch": 2.5715806279936135, + "grad_norm": 0.284031480550766, + "learning_rate": 9.706182963166172e-08, + "loss": 0.1818, + "step": 9664 + }, + { + "epoch": 2.5718467269824377, + "grad_norm": 0.27599453926086426, + "learning_rate": 9.704494849787369e-08, + "loss": 0.1774, + "step": 9665 + }, + { + "epoch": 2.5721128259712613, + "grad_norm": 0.2784956395626068, + "learning_rate": 9.702806744836968e-08, + "loss": 0.1968, + "step": 9666 + }, + { + "epoch": 2.572378924960085, + "grad_norm": 0.33327245712280273, + "learning_rate": 9.701118648363127e-08, + "loss": 0.1999, + "step": 9667 + }, + { + "epoch": 2.572645023948909, + "grad_norm": 0.28586307168006897, + "learning_rate": 9.699430560413985e-08, + "loss": 0.1819, + "step": 9668 + }, + { + "epoch": 2.572911122937733, + "grad_norm": 0.3275735676288605, + "learning_rate": 9.697742481037699e-08, + "loss": 0.174, + "step": 9669 + }, + { + "epoch": 2.5731772219265565, + "grad_norm": 0.3787324130535126, + "learning_rate": 9.696054410282409e-08, + "loss": 0.1877, + "step": 9670 + }, + { + "epoch": 2.5734433209153806, + "grad_norm": 0.29175645112991333, + "learning_rate": 9.694366348196267e-08, + "loss": 0.1938, + "step": 9671 + }, + { + "epoch": 2.5737094199042043, + "grad_norm": 0.2954092025756836, + "learning_rate": 9.692678294827415e-08, + "loss": 0.2013, + "step": 9672 + }, + { + "epoch": 2.573975518893028, + "grad_norm": 0.3244812488555908, + "learning_rate": 9.690990250224002e-08, + "loss": 0.1814, + "step": 9673 + }, + { + "epoch": 2.574241617881852, + "grad_norm": 0.27029985189437866, + "learning_rate": 9.689302214434178e-08, + "loss": 0.1688, + "step": 9674 + }, + { + "epoch": 2.574507716870676, + "grad_norm": 0.3019460141658783, + "learning_rate": 9.687614187506084e-08, + "loss": 0.1997, + "step": 9675 + }, + { + "epoch": 2.5747738158594995, + "grad_norm": 0.3278036117553711, + "learning_rate": 9.685926169487871e-08, + "loss": 0.2142, + "step": 9676 + }, + { + "epoch": 2.5750399148483236, + "grad_norm": 0.36169371008872986, + "learning_rate": 9.684238160427679e-08, + "loss": 0.1696, + "step": 9677 + }, + { + "epoch": 2.5753060138371473, + "grad_norm": 0.29988548159599304, + "learning_rate": 9.682550160373658e-08, + "loss": 0.1888, + "step": 9678 + }, + { + "epoch": 2.5755721128259714, + "grad_norm": 0.25758790969848633, + "learning_rate": 9.680862169373948e-08, + "loss": 0.1671, + "step": 9679 + }, + { + "epoch": 2.575838211814795, + "grad_norm": 0.3139188289642334, + "learning_rate": 9.679174187476704e-08, + "loss": 0.1681, + "step": 9680 + }, + { + "epoch": 2.576104310803619, + "grad_norm": 0.3733628988265991, + "learning_rate": 9.677486214730057e-08, + "loss": 0.1927, + "step": 9681 + }, + { + "epoch": 2.576370409792443, + "grad_norm": 0.3248082399368286, + "learning_rate": 9.675798251182164e-08, + "loss": 0.1754, + "step": 9682 + }, + { + "epoch": 2.5766365087812666, + "grad_norm": 0.355704665184021, + "learning_rate": 9.674110296881162e-08, + "loss": 0.1898, + "step": 9683 + }, + { + "epoch": 2.5769026077700907, + "grad_norm": 0.2793383300304413, + "learning_rate": 9.672422351875196e-08, + "loss": 0.1744, + "step": 9684 + }, + { + "epoch": 2.5771687067589144, + "grad_norm": 0.28288576006889343, + "learning_rate": 9.670734416212409e-08, + "loss": 0.1779, + "step": 9685 + }, + { + "epoch": 2.577434805747738, + "grad_norm": 0.2715231776237488, + "learning_rate": 9.66904648994095e-08, + "loss": 0.1814, + "step": 9686 + }, + { + "epoch": 2.577700904736562, + "grad_norm": 0.25751930475234985, + "learning_rate": 9.667358573108953e-08, + "loss": 0.166, + "step": 9687 + }, + { + "epoch": 2.577967003725386, + "grad_norm": 0.26252490282058716, + "learning_rate": 9.665670665764568e-08, + "loss": 0.1663, + "step": 9688 + }, + { + "epoch": 2.5782331027142096, + "grad_norm": 0.26306259632110596, + "learning_rate": 9.663982767955933e-08, + "loss": 0.1671, + "step": 9689 + }, + { + "epoch": 2.5784992017030337, + "grad_norm": 0.29122480750083923, + "learning_rate": 9.662294879731196e-08, + "loss": 0.1812, + "step": 9690 + }, + { + "epoch": 2.5787653006918574, + "grad_norm": 0.3589666485786438, + "learning_rate": 9.66060700113849e-08, + "loss": 0.172, + "step": 9691 + }, + { + "epoch": 2.579031399680681, + "grad_norm": 0.3068336546421051, + "learning_rate": 9.658919132225965e-08, + "loss": 0.1896, + "step": 9692 + }, + { + "epoch": 2.579297498669505, + "grad_norm": 0.33073869347572327, + "learning_rate": 9.657231273041759e-08, + "loss": 0.1958, + "step": 9693 + }, + { + "epoch": 2.579563597658329, + "grad_norm": 0.25652310252189636, + "learning_rate": 9.655543423634012e-08, + "loss": 0.1676, + "step": 9694 + }, + { + "epoch": 2.5798296966471526, + "grad_norm": 0.34617897868156433, + "learning_rate": 9.653855584050868e-08, + "loss": 0.2031, + "step": 9695 + }, + { + "epoch": 2.5800957956359767, + "grad_norm": 0.2695949971675873, + "learning_rate": 9.652167754340466e-08, + "loss": 0.1609, + "step": 9696 + }, + { + "epoch": 2.5803618946248004, + "grad_norm": 0.27187681198120117, + "learning_rate": 9.650479934550949e-08, + "loss": 0.1762, + "step": 9697 + }, + { + "epoch": 2.580627993613624, + "grad_norm": 0.36461761593818665, + "learning_rate": 9.648792124730452e-08, + "loss": 0.1817, + "step": 9698 + }, + { + "epoch": 2.580894092602448, + "grad_norm": 0.31754767894744873, + "learning_rate": 9.647104324927118e-08, + "loss": 0.1722, + "step": 9699 + }, + { + "epoch": 2.581160191591272, + "grad_norm": 0.26058533787727356, + "learning_rate": 9.645416535189084e-08, + "loss": 0.1715, + "step": 9700 + }, + { + "epoch": 2.5814262905800955, + "grad_norm": 0.2663837969303131, + "learning_rate": 9.643728755564496e-08, + "loss": 0.1907, + "step": 9701 + }, + { + "epoch": 2.5816923895689197, + "grad_norm": 0.3190403878688812, + "learning_rate": 9.642040986101481e-08, + "loss": 0.1877, + "step": 9702 + }, + { + "epoch": 2.5819584885577433, + "grad_norm": 0.3216318190097809, + "learning_rate": 9.640353226848189e-08, + "loss": 0.1856, + "step": 9703 + }, + { + "epoch": 2.5822245875465675, + "grad_norm": 0.24540485441684723, + "learning_rate": 9.638665477852752e-08, + "loss": 0.1623, + "step": 9704 + }, + { + "epoch": 2.582490686535391, + "grad_norm": 0.26769259572029114, + "learning_rate": 9.636977739163313e-08, + "loss": 0.173, + "step": 9705 + }, + { + "epoch": 2.5827567855242153, + "grad_norm": 0.2762245833873749, + "learning_rate": 9.635290010828e-08, + "loss": 0.183, + "step": 9706 + }, + { + "epoch": 2.583022884513039, + "grad_norm": 0.29595258831977844, + "learning_rate": 9.633602292894965e-08, + "loss": 0.1781, + "step": 9707 + }, + { + "epoch": 2.5832889835018626, + "grad_norm": 0.26491761207580566, + "learning_rate": 9.63191458541233e-08, + "loss": 0.1595, + "step": 9708 + }, + { + "epoch": 2.5835550824906868, + "grad_norm": 0.24808193743228912, + "learning_rate": 9.630226888428243e-08, + "loss": 0.1709, + "step": 9709 + }, + { + "epoch": 2.5838211814795105, + "grad_norm": 0.29357728362083435, + "learning_rate": 9.628539201990836e-08, + "loss": 0.1958, + "step": 9710 + }, + { + "epoch": 2.584087280468334, + "grad_norm": 0.26883551478385925, + "learning_rate": 9.626851526148243e-08, + "loss": 0.1679, + "step": 9711 + }, + { + "epoch": 2.5843533794571583, + "grad_norm": 0.446186900138855, + "learning_rate": 9.625163860948608e-08, + "loss": 0.2014, + "step": 9712 + }, + { + "epoch": 2.584619478445982, + "grad_norm": 0.3750261962413788, + "learning_rate": 9.623476206440057e-08, + "loss": 0.1795, + "step": 9713 + }, + { + "epoch": 2.5848855774348056, + "grad_norm": 0.4197099506855011, + "learning_rate": 9.621788562670733e-08, + "loss": 0.1817, + "step": 9714 + }, + { + "epoch": 2.5851516764236298, + "grad_norm": 0.3945803642272949, + "learning_rate": 9.620100929688764e-08, + "loss": 0.1899, + "step": 9715 + }, + { + "epoch": 2.5854177754124534, + "grad_norm": 0.2984009385108948, + "learning_rate": 9.618413307542294e-08, + "loss": 0.1786, + "step": 9716 + }, + { + "epoch": 2.585683874401277, + "grad_norm": 0.3078489899635315, + "learning_rate": 9.616725696279446e-08, + "loss": 0.1781, + "step": 9717 + }, + { + "epoch": 2.5859499733901012, + "grad_norm": 0.39274862408638, + "learning_rate": 9.615038095948364e-08, + "loss": 0.193, + "step": 9718 + }, + { + "epoch": 2.586216072378925, + "grad_norm": 0.2613302767276764, + "learning_rate": 9.613350506597174e-08, + "loss": 0.1887, + "step": 9719 + }, + { + "epoch": 2.5864821713677486, + "grad_norm": 0.2889817953109741, + "learning_rate": 9.611662928274017e-08, + "loss": 0.1902, + "step": 9720 + }, + { + "epoch": 2.5867482703565727, + "grad_norm": 0.277614951133728, + "learning_rate": 9.609975361027019e-08, + "loss": 0.1747, + "step": 9721 + }, + { + "epoch": 2.5870143693453964, + "grad_norm": 0.3987312614917755, + "learning_rate": 9.608287804904322e-08, + "loss": 0.1826, + "step": 9722 + }, + { + "epoch": 2.58728046833422, + "grad_norm": 0.428840696811676, + "learning_rate": 9.606600259954047e-08, + "loss": 0.1807, + "step": 9723 + }, + { + "epoch": 2.5875465673230442, + "grad_norm": 0.31751537322998047, + "learning_rate": 9.604912726224335e-08, + "loss": 0.1815, + "step": 9724 + }, + { + "epoch": 2.587812666311868, + "grad_norm": 0.3880613148212433, + "learning_rate": 9.603225203763311e-08, + "loss": 0.1913, + "step": 9725 + }, + { + "epoch": 2.5880787653006916, + "grad_norm": 0.37112852931022644, + "learning_rate": 9.601537692619118e-08, + "loss": 0.1793, + "step": 9726 + }, + { + "epoch": 2.5883448642895157, + "grad_norm": 0.28016921877861023, + "learning_rate": 9.599850192839873e-08, + "loss": 0.1691, + "step": 9727 + }, + { + "epoch": 2.5886109632783394, + "grad_norm": 0.4673555791378021, + "learning_rate": 9.598162704473715e-08, + "loss": 0.1851, + "step": 9728 + }, + { + "epoch": 2.5888770622671635, + "grad_norm": 0.29296061396598816, + "learning_rate": 9.596475227568774e-08, + "loss": 0.2013, + "step": 9729 + }, + { + "epoch": 2.589143161255987, + "grad_norm": 0.25497472286224365, + "learning_rate": 9.594787762173179e-08, + "loss": 0.1805, + "step": 9730 + }, + { + "epoch": 2.5894092602448113, + "grad_norm": 0.25372955203056335, + "learning_rate": 9.593100308335063e-08, + "loss": 0.1693, + "step": 9731 + }, + { + "epoch": 2.589675359233635, + "grad_norm": 0.3385317325592041, + "learning_rate": 9.591412866102549e-08, + "loss": 0.1946, + "step": 9732 + }, + { + "epoch": 2.5899414582224587, + "grad_norm": 0.29302287101745605, + "learning_rate": 9.589725435523774e-08, + "loss": 0.1888, + "step": 9733 + }, + { + "epoch": 2.590207557211283, + "grad_norm": 0.3573448956012726, + "learning_rate": 9.588038016646861e-08, + "loss": 0.1827, + "step": 9734 + }, + { + "epoch": 2.5904736562001065, + "grad_norm": 0.34225496649742126, + "learning_rate": 9.586350609519942e-08, + "loss": 0.1786, + "step": 9735 + }, + { + "epoch": 2.59073975518893, + "grad_norm": 0.2648475766181946, + "learning_rate": 9.584663214191142e-08, + "loss": 0.179, + "step": 9736 + }, + { + "epoch": 2.5910058541777543, + "grad_norm": 0.3239668905735016, + "learning_rate": 9.582975830708596e-08, + "loss": 0.1789, + "step": 9737 + }, + { + "epoch": 2.591271953166578, + "grad_norm": 0.26145434379577637, + "learning_rate": 9.581288459120422e-08, + "loss": 0.1688, + "step": 9738 + }, + { + "epoch": 2.5915380521554017, + "grad_norm": 0.2642635405063629, + "learning_rate": 9.579601099474754e-08, + "loss": 0.1875, + "step": 9739 + }, + { + "epoch": 2.591804151144226, + "grad_norm": 0.27958813309669495, + "learning_rate": 9.577913751819716e-08, + "loss": 0.1496, + "step": 9740 + }, + { + "epoch": 2.5920702501330495, + "grad_norm": 0.2737657427787781, + "learning_rate": 9.576226416203438e-08, + "loss": 0.1536, + "step": 9741 + }, + { + "epoch": 2.592336349121873, + "grad_norm": 0.27434301376342773, + "learning_rate": 9.574539092674042e-08, + "loss": 0.1873, + "step": 9742 + }, + { + "epoch": 2.5926024481106973, + "grad_norm": 0.2798338234424591, + "learning_rate": 9.572851781279656e-08, + "loss": 0.1838, + "step": 9743 + }, + { + "epoch": 2.592868547099521, + "grad_norm": 0.244810089468956, + "learning_rate": 9.571164482068404e-08, + "loss": 0.1565, + "step": 9744 + }, + { + "epoch": 2.5931346460883447, + "grad_norm": 0.27308541536331177, + "learning_rate": 9.569477195088414e-08, + "loss": 0.1714, + "step": 9745 + }, + { + "epoch": 2.593400745077169, + "grad_norm": 0.3051944971084595, + "learning_rate": 9.567789920387806e-08, + "loss": 0.1959, + "step": 9746 + }, + { + "epoch": 2.5936668440659925, + "grad_norm": 0.2657693922519684, + "learning_rate": 9.566102658014708e-08, + "loss": 0.1876, + "step": 9747 + }, + { + "epoch": 2.593932943054816, + "grad_norm": 0.27798718214035034, + "learning_rate": 9.56441540801725e-08, + "loss": 0.1826, + "step": 9748 + }, + { + "epoch": 2.5941990420436403, + "grad_norm": 0.26067858934402466, + "learning_rate": 9.562728170443545e-08, + "loss": 0.172, + "step": 9749 + }, + { + "epoch": 2.594465141032464, + "grad_norm": 0.2627626359462738, + "learning_rate": 9.561040945341722e-08, + "loss": 0.1744, + "step": 9750 + }, + { + "epoch": 2.594731240021288, + "grad_norm": 0.2584655284881592, + "learning_rate": 9.559353732759902e-08, + "loss": 0.1693, + "step": 9751 + }, + { + "epoch": 2.5949973390101118, + "grad_norm": 0.46732932329177856, + "learning_rate": 9.557666532746213e-08, + "loss": 0.196, + "step": 9752 + }, + { + "epoch": 2.5952634379989354, + "grad_norm": 0.30928102135658264, + "learning_rate": 9.55597934534877e-08, + "loss": 0.1666, + "step": 9753 + }, + { + "epoch": 2.5955295369877596, + "grad_norm": 0.2631963789463043, + "learning_rate": 9.554292170615698e-08, + "loss": 0.1685, + "step": 9754 + }, + { + "epoch": 2.5957956359765832, + "grad_norm": 0.27246928215026855, + "learning_rate": 9.552605008595119e-08, + "loss": 0.1721, + "step": 9755 + }, + { + "epoch": 2.5960617349654074, + "grad_norm": 0.3475940525531769, + "learning_rate": 9.550917859335158e-08, + "loss": 0.1858, + "step": 9756 + }, + { + "epoch": 2.596327833954231, + "grad_norm": 0.35911789536476135, + "learning_rate": 9.549230722883926e-08, + "loss": 0.1867, + "step": 9757 + }, + { + "epoch": 2.5965939329430547, + "grad_norm": 0.29554247856140137, + "learning_rate": 9.547543599289556e-08, + "loss": 0.1859, + "step": 9758 + }, + { + "epoch": 2.596860031931879, + "grad_norm": 0.26229751110076904, + "learning_rate": 9.545856488600159e-08, + "loss": 0.1691, + "step": 9759 + }, + { + "epoch": 2.5971261309207025, + "grad_norm": 0.4213313162326813, + "learning_rate": 9.544169390863858e-08, + "loss": 0.1655, + "step": 9760 + }, + { + "epoch": 2.5973922299095262, + "grad_norm": 0.38641661405563354, + "learning_rate": 9.54248230612877e-08, + "loss": 0.1952, + "step": 9761 + }, + { + "epoch": 2.5976583288983504, + "grad_norm": 0.2653907239437103, + "learning_rate": 9.540795234443023e-08, + "loss": 0.1677, + "step": 9762 + }, + { + "epoch": 2.597924427887174, + "grad_norm": 0.3863420784473419, + "learning_rate": 9.539108175854723e-08, + "loss": 0.2001, + "step": 9763 + }, + { + "epoch": 2.5981905268759977, + "grad_norm": 0.2783592641353607, + "learning_rate": 9.537421130411997e-08, + "loss": 0.18, + "step": 9764 + }, + { + "epoch": 2.598456625864822, + "grad_norm": 0.3180851638317108, + "learning_rate": 9.535734098162958e-08, + "loss": 0.1656, + "step": 9765 + }, + { + "epoch": 2.5987227248536455, + "grad_norm": 0.3144052028656006, + "learning_rate": 9.534047079155726e-08, + "loss": 0.1862, + "step": 9766 + }, + { + "epoch": 2.598988823842469, + "grad_norm": 0.2915765941143036, + "learning_rate": 9.532360073438423e-08, + "loss": 0.1938, + "step": 9767 + }, + { + "epoch": 2.5992549228312933, + "grad_norm": 0.3016008138656616, + "learning_rate": 9.530673081059155e-08, + "loss": 0.168, + "step": 9768 + }, + { + "epoch": 2.599521021820117, + "grad_norm": 0.2611393928527832, + "learning_rate": 9.528986102066048e-08, + "loss": 0.1591, + "step": 9769 + }, + { + "epoch": 2.5997871208089407, + "grad_norm": 0.36306434869766235, + "learning_rate": 9.527299136507212e-08, + "loss": 0.1921, + "step": 9770 + }, + { + "epoch": 2.600053219797765, + "grad_norm": 0.34045788645744324, + "learning_rate": 9.525612184430768e-08, + "loss": 0.1896, + "step": 9771 + }, + { + "epoch": 2.6003193187865885, + "grad_norm": 0.2735370695590973, + "learning_rate": 9.523925245884825e-08, + "loss": 0.194, + "step": 9772 + }, + { + "epoch": 2.600585417775412, + "grad_norm": 0.2806396782398224, + "learning_rate": 9.522238320917507e-08, + "loss": 0.1923, + "step": 9773 + }, + { + "epoch": 2.6008515167642363, + "grad_norm": 0.28955885767936707, + "learning_rate": 9.520551409576918e-08, + "loss": 0.1711, + "step": 9774 + }, + { + "epoch": 2.60111761575306, + "grad_norm": 0.492364764213562, + "learning_rate": 9.51886451191118e-08, + "loss": 0.1913, + "step": 9775 + }, + { + "epoch": 2.601383714741884, + "grad_norm": 0.26312556862831116, + "learning_rate": 9.517177627968401e-08, + "loss": 0.1817, + "step": 9776 + }, + { + "epoch": 2.601649813730708, + "grad_norm": 0.3291328549385071, + "learning_rate": 9.515490757796702e-08, + "loss": 0.2015, + "step": 9777 + }, + { + "epoch": 2.601915912719532, + "grad_norm": 0.2696113586425781, + "learning_rate": 9.513803901444186e-08, + "loss": 0.1902, + "step": 9778 + }, + { + "epoch": 2.6021820117083556, + "grad_norm": 0.27321857213974, + "learning_rate": 9.512117058958974e-08, + "loss": 0.1608, + "step": 9779 + }, + { + "epoch": 2.6024481106971793, + "grad_norm": 0.3035888075828552, + "learning_rate": 9.510430230389172e-08, + "loss": 0.1852, + "step": 9780 + }, + { + "epoch": 2.6027142096860034, + "grad_norm": 0.26440945267677307, + "learning_rate": 9.508743415782898e-08, + "loss": 0.1789, + "step": 9781 + }, + { + "epoch": 2.602980308674827, + "grad_norm": 0.3859265446662903, + "learning_rate": 9.507056615188257e-08, + "loss": 0.1925, + "step": 9782 + }, + { + "epoch": 2.603246407663651, + "grad_norm": 0.2687169313430786, + "learning_rate": 9.505369828653362e-08, + "loss": 0.1676, + "step": 9783 + }, + { + "epoch": 2.603512506652475, + "grad_norm": 0.3428683876991272, + "learning_rate": 9.503683056226329e-08, + "loss": 0.1818, + "step": 9784 + }, + { + "epoch": 2.6037786056412986, + "grad_norm": 0.2904994785785675, + "learning_rate": 9.501996297955261e-08, + "loss": 0.1785, + "step": 9785 + }, + { + "epoch": 2.6040447046301223, + "grad_norm": 0.28198114037513733, + "learning_rate": 9.500309553888271e-08, + "loss": 0.1858, + "step": 9786 + }, + { + "epoch": 2.6043108036189464, + "grad_norm": 0.4301988482475281, + "learning_rate": 9.498622824073467e-08, + "loss": 0.1921, + "step": 9787 + }, + { + "epoch": 2.60457690260777, + "grad_norm": 0.3210729658603668, + "learning_rate": 9.496936108558964e-08, + "loss": 0.1731, + "step": 9788 + }, + { + "epoch": 2.6048430015965938, + "grad_norm": 0.3224325478076935, + "learning_rate": 9.495249407392859e-08, + "loss": 0.1777, + "step": 9789 + }, + { + "epoch": 2.605109100585418, + "grad_norm": 0.36494573950767517, + "learning_rate": 9.49356272062327e-08, + "loss": 0.2137, + "step": 9790 + }, + { + "epoch": 2.6053751995742416, + "grad_norm": 0.37058955430984497, + "learning_rate": 9.4918760482983e-08, + "loss": 0.1877, + "step": 9791 + }, + { + "epoch": 2.6056412985630653, + "grad_norm": 0.2924136221408844, + "learning_rate": 9.490189390466062e-08, + "loss": 0.1908, + "step": 9792 + }, + { + "epoch": 2.6059073975518894, + "grad_norm": 0.2637692391872406, + "learning_rate": 9.488502747174653e-08, + "loss": 0.1755, + "step": 9793 + }, + { + "epoch": 2.606173496540713, + "grad_norm": 0.2766974866390228, + "learning_rate": 9.486816118472186e-08, + "loss": 0.1881, + "step": 9794 + }, + { + "epoch": 2.6064395955295367, + "grad_norm": 0.2868952453136444, + "learning_rate": 9.485129504406766e-08, + "loss": 0.1925, + "step": 9795 + }, + { + "epoch": 2.606705694518361, + "grad_norm": 0.27662912011146545, + "learning_rate": 9.483442905026501e-08, + "loss": 0.1724, + "step": 9796 + }, + { + "epoch": 2.6069717935071846, + "grad_norm": 0.4095354378223419, + "learning_rate": 9.481756320379491e-08, + "loss": 0.1683, + "step": 9797 + }, + { + "epoch": 2.6072378924960087, + "grad_norm": 0.2625327706336975, + "learning_rate": 9.480069750513849e-08, + "loss": 0.1717, + "step": 9798 + }, + { + "epoch": 2.6075039914848324, + "grad_norm": 0.3477133810520172, + "learning_rate": 9.47838319547767e-08, + "loss": 0.184, + "step": 9799 + }, + { + "epoch": 2.607770090473656, + "grad_norm": 0.3417242169380188, + "learning_rate": 9.476696655319063e-08, + "loss": 0.1964, + "step": 9800 + }, + { + "epoch": 2.60803618946248, + "grad_norm": 0.2921861708164215, + "learning_rate": 9.475010130086129e-08, + "loss": 0.1889, + "step": 9801 + }, + { + "epoch": 2.608302288451304, + "grad_norm": 0.2770630121231079, + "learning_rate": 9.473323619826974e-08, + "loss": 0.1664, + "step": 9802 + }, + { + "epoch": 2.608568387440128, + "grad_norm": 0.2712157666683197, + "learning_rate": 9.471637124589705e-08, + "loss": 0.1662, + "step": 9803 + }, + { + "epoch": 2.6088344864289517, + "grad_norm": 0.2832953631877899, + "learning_rate": 9.469950644422412e-08, + "loss": 0.164, + "step": 9804 + }, + { + "epoch": 2.6091005854177753, + "grad_norm": 0.2975037693977356, + "learning_rate": 9.468264179373207e-08, + "loss": 0.1848, + "step": 9805 + }, + { + "epoch": 2.6093666844065995, + "grad_norm": 0.27495628595352173, + "learning_rate": 9.466577729490186e-08, + "loss": 0.1815, + "step": 9806 + }, + { + "epoch": 2.609632783395423, + "grad_norm": 0.2702060341835022, + "learning_rate": 9.464891294821457e-08, + "loss": 0.1797, + "step": 9807 + }, + { + "epoch": 2.609898882384247, + "grad_norm": 0.3033934533596039, + "learning_rate": 9.463204875415107e-08, + "loss": 0.1997, + "step": 9808 + }, + { + "epoch": 2.610164981373071, + "grad_norm": 0.2980945408344269, + "learning_rate": 9.461518471319253e-08, + "loss": 0.1862, + "step": 9809 + }, + { + "epoch": 2.6104310803618946, + "grad_norm": 0.35873544216156006, + "learning_rate": 9.459832082581982e-08, + "loss": 0.2045, + "step": 9810 + }, + { + "epoch": 2.6106971793507183, + "grad_norm": 0.26078036427497864, + "learning_rate": 9.458145709251399e-08, + "loss": 0.1694, + "step": 9811 + }, + { + "epoch": 2.6109632783395424, + "grad_norm": 0.2723875045776367, + "learning_rate": 9.456459351375601e-08, + "loss": 0.1718, + "step": 9812 + }, + { + "epoch": 2.611229377328366, + "grad_norm": 0.4052169620990753, + "learning_rate": 9.45477300900269e-08, + "loss": 0.1978, + "step": 9813 + }, + { + "epoch": 2.61149547631719, + "grad_norm": 0.2522009015083313, + "learning_rate": 9.453086682180757e-08, + "loss": 0.1704, + "step": 9814 + }, + { + "epoch": 2.611761575306014, + "grad_norm": 0.2691408395767212, + "learning_rate": 9.451400370957905e-08, + "loss": 0.181, + "step": 9815 + }, + { + "epoch": 2.6120276742948376, + "grad_norm": 0.29490992426872253, + "learning_rate": 9.449714075382227e-08, + "loss": 0.1786, + "step": 9816 + }, + { + "epoch": 2.6122937732836613, + "grad_norm": 0.2864140272140503, + "learning_rate": 9.448027795501826e-08, + "loss": 0.1895, + "step": 9817 + }, + { + "epoch": 2.6125598722724854, + "grad_norm": 0.2724868357181549, + "learning_rate": 9.44634153136479e-08, + "loss": 0.1835, + "step": 9818 + }, + { + "epoch": 2.612825971261309, + "grad_norm": 0.4459450840950012, + "learning_rate": 9.444655283019219e-08, + "loss": 0.2053, + "step": 9819 + }, + { + "epoch": 2.613092070250133, + "grad_norm": 0.27251070737838745, + "learning_rate": 9.442969050513209e-08, + "loss": 0.1728, + "step": 9820 + }, + { + "epoch": 2.613358169238957, + "grad_norm": 0.27021706104278564, + "learning_rate": 9.441282833894852e-08, + "loss": 0.1901, + "step": 9821 + }, + { + "epoch": 2.6136242682277806, + "grad_norm": 0.2682243585586548, + "learning_rate": 9.439596633212246e-08, + "loss": 0.182, + "step": 9822 + }, + { + "epoch": 2.6138903672166047, + "grad_norm": 0.38251057267189026, + "learning_rate": 9.437910448513479e-08, + "loss": 0.2036, + "step": 9823 + }, + { + "epoch": 2.6141564662054284, + "grad_norm": 0.32503101229667664, + "learning_rate": 9.436224279846655e-08, + "loss": 0.1867, + "step": 9824 + }, + { + "epoch": 2.6144225651942525, + "grad_norm": 0.45270946621894836, + "learning_rate": 9.434538127259854e-08, + "loss": 0.1889, + "step": 9825 + }, + { + "epoch": 2.614688664183076, + "grad_norm": 0.35032159090042114, + "learning_rate": 9.432851990801179e-08, + "loss": 0.1931, + "step": 9826 + }, + { + "epoch": 2.6149547631719, + "grad_norm": 0.2960491478443146, + "learning_rate": 9.431165870518715e-08, + "loss": 0.1899, + "step": 9827 + }, + { + "epoch": 2.615220862160724, + "grad_norm": 0.26278194785118103, + "learning_rate": 9.42947976646056e-08, + "loss": 0.1813, + "step": 9828 + }, + { + "epoch": 2.6154869611495477, + "grad_norm": 0.367779016494751, + "learning_rate": 9.427793678674797e-08, + "loss": 0.1816, + "step": 9829 + }, + { + "epoch": 2.6157530601383714, + "grad_norm": 0.44788119196891785, + "learning_rate": 9.426107607209523e-08, + "loss": 0.1902, + "step": 9830 + }, + { + "epoch": 2.6160191591271955, + "grad_norm": 0.2986317574977875, + "learning_rate": 9.424421552112825e-08, + "loss": 0.1716, + "step": 9831 + }, + { + "epoch": 2.616285258116019, + "grad_norm": 0.28565776348114014, + "learning_rate": 9.422735513432797e-08, + "loss": 0.1832, + "step": 9832 + }, + { + "epoch": 2.616551357104843, + "grad_norm": 0.24611829221248627, + "learning_rate": 9.42104949121752e-08, + "loss": 0.1576, + "step": 9833 + }, + { + "epoch": 2.616817456093667, + "grad_norm": 0.3372734487056732, + "learning_rate": 9.419363485515094e-08, + "loss": 0.1905, + "step": 9834 + }, + { + "epoch": 2.6170835550824907, + "grad_norm": 0.26267826557159424, + "learning_rate": 9.417677496373597e-08, + "loss": 0.1631, + "step": 9835 + }, + { + "epoch": 2.6173496540713144, + "grad_norm": 0.2497083842754364, + "learning_rate": 9.415991523841123e-08, + "loss": 0.1617, + "step": 9836 + }, + { + "epoch": 2.6176157530601385, + "grad_norm": 0.2632092833518982, + "learning_rate": 9.414305567965756e-08, + "loss": 0.1758, + "step": 9837 + }, + { + "epoch": 2.617881852048962, + "grad_norm": 0.3316352963447571, + "learning_rate": 9.412619628795583e-08, + "loss": 0.1885, + "step": 9838 + }, + { + "epoch": 2.618147951037786, + "grad_norm": 0.2618423104286194, + "learning_rate": 9.410933706378697e-08, + "loss": 0.1623, + "step": 9839 + }, + { + "epoch": 2.61841405002661, + "grad_norm": 0.3456522226333618, + "learning_rate": 9.409247800763175e-08, + "loss": 0.1847, + "step": 9840 + }, + { + "epoch": 2.6186801490154337, + "grad_norm": 0.29940348863601685, + "learning_rate": 9.407561911997108e-08, + "loss": 0.1954, + "step": 9841 + }, + { + "epoch": 2.6189462480042573, + "grad_norm": 0.333452045917511, + "learning_rate": 9.405876040128576e-08, + "loss": 0.1944, + "step": 9842 + }, + { + "epoch": 2.6192123469930815, + "grad_norm": 0.26410195231437683, + "learning_rate": 9.404190185205673e-08, + "loss": 0.1823, + "step": 9843 + }, + { + "epoch": 2.619478445981905, + "grad_norm": 0.28947946429252625, + "learning_rate": 9.402504347276471e-08, + "loss": 0.1814, + "step": 9844 + }, + { + "epoch": 2.619744544970729, + "grad_norm": 0.2860977351665497, + "learning_rate": 9.400818526389061e-08, + "loss": 0.1731, + "step": 9845 + }, + { + "epoch": 2.620010643959553, + "grad_norm": 0.330018550157547, + "learning_rate": 9.399132722591523e-08, + "loss": 0.1619, + "step": 9846 + }, + { + "epoch": 2.6202767429483766, + "grad_norm": 0.31481820344924927, + "learning_rate": 9.397446935931942e-08, + "loss": 0.1839, + "step": 9847 + }, + { + "epoch": 2.6205428419372008, + "grad_norm": 0.25569164752960205, + "learning_rate": 9.395761166458397e-08, + "loss": 0.1631, + "step": 9848 + }, + { + "epoch": 2.6208089409260245, + "grad_norm": 0.3644865155220032, + "learning_rate": 9.394075414218975e-08, + "loss": 0.1789, + "step": 9849 + }, + { + "epoch": 2.6210750399148486, + "grad_norm": 0.2832759618759155, + "learning_rate": 9.392389679261752e-08, + "loss": 0.1802, + "step": 9850 + }, + { + "epoch": 2.6213411389036723, + "grad_norm": 0.2713339626789093, + "learning_rate": 9.39070396163481e-08, + "loss": 0.1917, + "step": 9851 + }, + { + "epoch": 2.621607237892496, + "grad_norm": 0.28522253036499023, + "learning_rate": 9.389018261386227e-08, + "loss": 0.1767, + "step": 9852 + }, + { + "epoch": 2.62187333688132, + "grad_norm": 0.39069288969039917, + "learning_rate": 9.387332578564088e-08, + "loss": 0.1925, + "step": 9853 + }, + { + "epoch": 2.6221394358701438, + "grad_norm": 0.36317896842956543, + "learning_rate": 9.385646913216468e-08, + "loss": 0.1876, + "step": 9854 + }, + { + "epoch": 2.6224055348589674, + "grad_norm": 0.2610635757446289, + "learning_rate": 9.383961265391446e-08, + "loss": 0.179, + "step": 9855 + }, + { + "epoch": 2.6226716338477916, + "grad_norm": 0.2656244933605194, + "learning_rate": 9.382275635137099e-08, + "loss": 0.1774, + "step": 9856 + }, + { + "epoch": 2.6229377328366152, + "grad_norm": 0.31510719656944275, + "learning_rate": 9.380590022501505e-08, + "loss": 0.1936, + "step": 9857 + }, + { + "epoch": 2.623203831825439, + "grad_norm": 0.4055366814136505, + "learning_rate": 9.378904427532748e-08, + "loss": 0.1877, + "step": 9858 + }, + { + "epoch": 2.623469930814263, + "grad_norm": 0.3132481276988983, + "learning_rate": 9.377218850278893e-08, + "loss": 0.181, + "step": 9859 + }, + { + "epoch": 2.6237360298030867, + "grad_norm": 0.34978780150413513, + "learning_rate": 9.375533290788027e-08, + "loss": 0.1932, + "step": 9860 + }, + { + "epoch": 2.6240021287919104, + "grad_norm": 0.2627173960208893, + "learning_rate": 9.373847749108217e-08, + "loss": 0.1723, + "step": 9861 + }, + { + "epoch": 2.6242682277807345, + "grad_norm": 0.2974148392677307, + "learning_rate": 9.372162225287543e-08, + "loss": 0.2051, + "step": 9862 + }, + { + "epoch": 2.6245343267695582, + "grad_norm": 0.43068256974220276, + "learning_rate": 9.370476719374077e-08, + "loss": 0.2049, + "step": 9863 + }, + { + "epoch": 2.624800425758382, + "grad_norm": 0.26751410961151123, + "learning_rate": 9.368791231415898e-08, + "loss": 0.1756, + "step": 9864 + }, + { + "epoch": 2.625066524747206, + "grad_norm": 0.5218235850334167, + "learning_rate": 9.367105761461072e-08, + "loss": 0.1906, + "step": 9865 + }, + { + "epoch": 2.6253326237360297, + "grad_norm": 0.2868984639644623, + "learning_rate": 9.365420309557677e-08, + "loss": 0.1777, + "step": 9866 + }, + { + "epoch": 2.6255987227248534, + "grad_norm": 0.2839038074016571, + "learning_rate": 9.363734875753781e-08, + "loss": 0.1895, + "step": 9867 + }, + { + "epoch": 2.6258648217136775, + "grad_norm": 0.2586352825164795, + "learning_rate": 9.362049460097466e-08, + "loss": 0.1601, + "step": 9868 + }, + { + "epoch": 2.626130920702501, + "grad_norm": 0.29588472843170166, + "learning_rate": 9.360364062636792e-08, + "loss": 0.1816, + "step": 9869 + }, + { + "epoch": 2.6263970196913253, + "grad_norm": 0.2714025676250458, + "learning_rate": 9.358678683419837e-08, + "loss": 0.1857, + "step": 9870 + }, + { + "epoch": 2.626663118680149, + "grad_norm": 0.28234758973121643, + "learning_rate": 9.356993322494666e-08, + "loss": 0.181, + "step": 9871 + }, + { + "epoch": 2.6269292176689727, + "grad_norm": 0.29456573724746704, + "learning_rate": 9.355307979909357e-08, + "loss": 0.1662, + "step": 9872 + }, + { + "epoch": 2.627195316657797, + "grad_norm": 0.29930299520492554, + "learning_rate": 9.353622655711968e-08, + "loss": 0.1737, + "step": 9873 + }, + { + "epoch": 2.6274614156466205, + "grad_norm": 0.26474788784980774, + "learning_rate": 9.351937349950577e-08, + "loss": 0.1674, + "step": 9874 + }, + { + "epoch": 2.6277275146354446, + "grad_norm": 0.2532167136669159, + "learning_rate": 9.350252062673252e-08, + "loss": 0.1669, + "step": 9875 + }, + { + "epoch": 2.6279936136242683, + "grad_norm": 0.32035747170448303, + "learning_rate": 9.348566793928056e-08, + "loss": 0.1778, + "step": 9876 + }, + { + "epoch": 2.628259712613092, + "grad_norm": 0.2795013189315796, + "learning_rate": 9.346881543763062e-08, + "loss": 0.1895, + "step": 9877 + }, + { + "epoch": 2.628525811601916, + "grad_norm": 0.29472553730010986, + "learning_rate": 9.345196312226328e-08, + "loss": 0.1781, + "step": 9878 + }, + { + "epoch": 2.62879191059074, + "grad_norm": 0.2500092387199402, + "learning_rate": 9.343511099365933e-08, + "loss": 0.1754, + "step": 9879 + }, + { + "epoch": 2.6290580095795635, + "grad_norm": 0.2924249470233917, + "learning_rate": 9.341825905229929e-08, + "loss": 0.1897, + "step": 9880 + }, + { + "epoch": 2.6293241085683876, + "grad_norm": 0.374016135931015, + "learning_rate": 9.340140729866392e-08, + "loss": 0.1856, + "step": 9881 + }, + { + "epoch": 2.6295902075572113, + "grad_norm": 0.30612489581108093, + "learning_rate": 9.338455573323379e-08, + "loss": 0.1948, + "step": 9882 + }, + { + "epoch": 2.629856306546035, + "grad_norm": 0.29019325971603394, + "learning_rate": 9.336770435648964e-08, + "loss": 0.1869, + "step": 9883 + }, + { + "epoch": 2.630122405534859, + "grad_norm": 0.26235824823379517, + "learning_rate": 9.335085316891193e-08, + "loss": 0.171, + "step": 9884 + }, + { + "epoch": 2.630388504523683, + "grad_norm": 0.2880159318447113, + "learning_rate": 9.33340021709815e-08, + "loss": 0.1959, + "step": 9885 + }, + { + "epoch": 2.6306546035125065, + "grad_norm": 0.3531763553619385, + "learning_rate": 9.331715136317882e-08, + "loss": 0.1742, + "step": 9886 + }, + { + "epoch": 2.6309207025013306, + "grad_norm": 0.26425784826278687, + "learning_rate": 9.330030074598459e-08, + "loss": 0.1825, + "step": 9887 + }, + { + "epoch": 2.6311868014901543, + "grad_norm": 0.2966040074825287, + "learning_rate": 9.328345031987937e-08, + "loss": 0.1774, + "step": 9888 + }, + { + "epoch": 2.631452900478978, + "grad_norm": 0.45102715492248535, + "learning_rate": 9.326660008534383e-08, + "loss": 0.1896, + "step": 9889 + }, + { + "epoch": 2.631718999467802, + "grad_norm": 0.2804625928401947, + "learning_rate": 9.32497500428585e-08, + "loss": 0.1813, + "step": 9890 + }, + { + "epoch": 2.6319850984566258, + "grad_norm": 0.2536540627479553, + "learning_rate": 9.323290019290404e-08, + "loss": 0.1849, + "step": 9891 + }, + { + "epoch": 2.6322511974454494, + "grad_norm": 0.26480063796043396, + "learning_rate": 9.3216050535961e-08, + "loss": 0.1702, + "step": 9892 + }, + { + "epoch": 2.6325172964342736, + "grad_norm": 0.2651907205581665, + "learning_rate": 9.319920107250998e-08, + "loss": 0.1822, + "step": 9893 + }, + { + "epoch": 2.6327833954230973, + "grad_norm": 0.268526166677475, + "learning_rate": 9.318235180303161e-08, + "loss": 0.1792, + "step": 9894 + }, + { + "epoch": 2.6330494944119214, + "grad_norm": 0.4010355472564697, + "learning_rate": 9.316550272800639e-08, + "loss": 0.1763, + "step": 9895 + }, + { + "epoch": 2.633315593400745, + "grad_norm": 0.36404505372047424, + "learning_rate": 9.314865384791493e-08, + "loss": 0.1703, + "step": 9896 + }, + { + "epoch": 2.633581692389569, + "grad_norm": 0.4523656368255615, + "learning_rate": 9.313180516323775e-08, + "loss": 0.1989, + "step": 9897 + }, + { + "epoch": 2.633847791378393, + "grad_norm": 0.25793540477752686, + "learning_rate": 9.311495667445549e-08, + "loss": 0.1684, + "step": 9898 + }, + { + "epoch": 2.6341138903672165, + "grad_norm": 0.27985596656799316, + "learning_rate": 9.309810838204861e-08, + "loss": 0.186, + "step": 9899 + }, + { + "epoch": 2.6343799893560407, + "grad_norm": 0.3523977994918823, + "learning_rate": 9.308126028649777e-08, + "loss": 0.1815, + "step": 9900 + }, + { + "epoch": 2.6346460883448644, + "grad_norm": 0.27222850918769836, + "learning_rate": 9.30644123882834e-08, + "loss": 0.1836, + "step": 9901 + }, + { + "epoch": 2.634912187333688, + "grad_norm": 0.3022732436656952, + "learning_rate": 9.304756468788609e-08, + "loss": 0.1832, + "step": 9902 + }, + { + "epoch": 2.635178286322512, + "grad_norm": 0.2730262279510498, + "learning_rate": 9.303071718578634e-08, + "loss": 0.1727, + "step": 9903 + }, + { + "epoch": 2.635444385311336, + "grad_norm": 0.3865545392036438, + "learning_rate": 9.301386988246476e-08, + "loss": 0.1911, + "step": 9904 + }, + { + "epoch": 2.6357104843001595, + "grad_norm": 0.28257009387016296, + "learning_rate": 9.299702277840175e-08, + "loss": 0.1831, + "step": 9905 + }, + { + "epoch": 2.6359765832889837, + "grad_norm": 0.3639448881149292, + "learning_rate": 9.298017587407792e-08, + "loss": 0.1731, + "step": 9906 + }, + { + "epoch": 2.6362426822778073, + "grad_norm": 0.30669984221458435, + "learning_rate": 9.29633291699737e-08, + "loss": 0.1882, + "step": 9907 + }, + { + "epoch": 2.636508781266631, + "grad_norm": 0.26607728004455566, + "learning_rate": 9.294648266656968e-08, + "loss": 0.1786, + "step": 9908 + }, + { + "epoch": 2.636774880255455, + "grad_norm": 0.311396986246109, + "learning_rate": 9.292963636434626e-08, + "loss": 0.1798, + "step": 9909 + }, + { + "epoch": 2.637040979244279, + "grad_norm": 0.263460248708725, + "learning_rate": 9.2912790263784e-08, + "loss": 0.1704, + "step": 9910 + }, + { + "epoch": 2.6373070782331025, + "grad_norm": 0.3403688967227936, + "learning_rate": 9.289594436536336e-08, + "loss": 0.1938, + "step": 9911 + }, + { + "epoch": 2.6375731772219266, + "grad_norm": 0.26518046855926514, + "learning_rate": 9.287909866956481e-08, + "loss": 0.1764, + "step": 9912 + }, + { + "epoch": 2.6378392762107503, + "grad_norm": 0.2877962589263916, + "learning_rate": 9.286225317686885e-08, + "loss": 0.1752, + "step": 9913 + }, + { + "epoch": 2.638105375199574, + "grad_norm": 0.28340575098991394, + "learning_rate": 9.284540788775592e-08, + "loss": 0.19, + "step": 9914 + }, + { + "epoch": 2.638371474188398, + "grad_norm": 0.31818103790283203, + "learning_rate": 9.282856280270655e-08, + "loss": 0.1851, + "step": 9915 + }, + { + "epoch": 2.638637573177222, + "grad_norm": 0.3263067305088043, + "learning_rate": 9.281171792220108e-08, + "loss": 0.1794, + "step": 9916 + }, + { + "epoch": 2.638903672166046, + "grad_norm": 0.44165387749671936, + "learning_rate": 9.279487324672005e-08, + "loss": 0.1959, + "step": 9917 + }, + { + "epoch": 2.6391697711548696, + "grad_norm": 0.28026899695396423, + "learning_rate": 9.277802877674386e-08, + "loss": 0.1699, + "step": 9918 + }, + { + "epoch": 2.6394358701436933, + "grad_norm": 0.27356478571891785, + "learning_rate": 9.2761184512753e-08, + "loss": 0.1796, + "step": 9919 + }, + { + "epoch": 2.6397019691325174, + "grad_norm": 0.2569533884525299, + "learning_rate": 9.274434045522783e-08, + "loss": 0.1526, + "step": 9920 + }, + { + "epoch": 2.639968068121341, + "grad_norm": 0.33614739775657654, + "learning_rate": 9.272749660464884e-08, + "loss": 0.1907, + "step": 9921 + }, + { + "epoch": 2.6402341671101652, + "grad_norm": 0.2787568271160126, + "learning_rate": 9.27106529614964e-08, + "loss": 0.1715, + "step": 9922 + }, + { + "epoch": 2.640500266098989, + "grad_norm": 0.28668755292892456, + "learning_rate": 9.269380952625096e-08, + "loss": 0.1978, + "step": 9923 + }, + { + "epoch": 2.6407663650878126, + "grad_norm": 0.3267413377761841, + "learning_rate": 9.267696629939291e-08, + "loss": 0.1759, + "step": 9924 + }, + { + "epoch": 2.6410324640766367, + "grad_norm": 0.2987154424190521, + "learning_rate": 9.266012328140271e-08, + "loss": 0.1791, + "step": 9925 + }, + { + "epoch": 2.6412985630654604, + "grad_norm": 0.34854933619499207, + "learning_rate": 9.264328047276065e-08, + "loss": 0.1824, + "step": 9926 + }, + { + "epoch": 2.641564662054284, + "grad_norm": 0.36680805683135986, + "learning_rate": 9.262643787394722e-08, + "loss": 0.1909, + "step": 9927 + }, + { + "epoch": 2.641830761043108, + "grad_norm": 0.32628700137138367, + "learning_rate": 9.260959548544273e-08, + "loss": 0.1813, + "step": 9928 + }, + { + "epoch": 2.642096860031932, + "grad_norm": 0.39409035444259644, + "learning_rate": 9.25927533077276e-08, + "loss": 0.1578, + "step": 9929 + }, + { + "epoch": 2.6423629590207556, + "grad_norm": 0.24414780735969543, + "learning_rate": 9.257591134128224e-08, + "loss": 0.1617, + "step": 9930 + }, + { + "epoch": 2.6426290580095797, + "grad_norm": 0.27121180295944214, + "learning_rate": 9.255906958658694e-08, + "loss": 0.1739, + "step": 9931 + }, + { + "epoch": 2.6428951569984034, + "grad_norm": 0.3409609794616699, + "learning_rate": 9.254222804412211e-08, + "loss": 0.2158, + "step": 9932 + }, + { + "epoch": 2.643161255987227, + "grad_norm": 0.29064589738845825, + "learning_rate": 9.252538671436807e-08, + "loss": 0.1747, + "step": 9933 + }, + { + "epoch": 2.643427354976051, + "grad_norm": 0.3503568768501282, + "learning_rate": 9.250854559780523e-08, + "loss": 0.1883, + "step": 9934 + }, + { + "epoch": 2.643693453964875, + "grad_norm": 0.3056628406047821, + "learning_rate": 9.249170469491383e-08, + "loss": 0.1903, + "step": 9935 + }, + { + "epoch": 2.6439595529536986, + "grad_norm": 0.49929365515708923, + "learning_rate": 9.247486400617434e-08, + "loss": 0.2025, + "step": 9936 + }, + { + "epoch": 2.6442256519425227, + "grad_norm": 0.262788861989975, + "learning_rate": 9.245802353206697e-08, + "loss": 0.1794, + "step": 9937 + }, + { + "epoch": 2.6444917509313464, + "grad_norm": 0.28673455119132996, + "learning_rate": 9.244118327307212e-08, + "loss": 0.1966, + "step": 9938 + }, + { + "epoch": 2.64475784992017, + "grad_norm": 0.264872282743454, + "learning_rate": 9.242434322967007e-08, + "loss": 0.1766, + "step": 9939 + }, + { + "epoch": 2.645023948908994, + "grad_norm": 0.32098573446273804, + "learning_rate": 9.240750340234117e-08, + "loss": 0.1908, + "step": 9940 + }, + { + "epoch": 2.645290047897818, + "grad_norm": 0.39961934089660645, + "learning_rate": 9.239066379156567e-08, + "loss": 0.1876, + "step": 9941 + }, + { + "epoch": 2.645556146886642, + "grad_norm": 0.2504035234451294, + "learning_rate": 9.237382439782391e-08, + "loss": 0.1629, + "step": 9942 + }, + { + "epoch": 2.6458222458754657, + "grad_norm": 0.2672024071216583, + "learning_rate": 9.235698522159619e-08, + "loss": 0.1676, + "step": 9943 + }, + { + "epoch": 2.64608834486429, + "grad_norm": 0.2568296492099762, + "learning_rate": 9.23401462633628e-08, + "loss": 0.1702, + "step": 9944 + }, + { + "epoch": 2.6463544438531135, + "grad_norm": 0.2787717580795288, + "learning_rate": 9.232330752360395e-08, + "loss": 0.1797, + "step": 9945 + }, + { + "epoch": 2.646620542841937, + "grad_norm": 0.2945546507835388, + "learning_rate": 9.23064690028e-08, + "loss": 0.1729, + "step": 9946 + }, + { + "epoch": 2.6468866418307613, + "grad_norm": 0.41789478063583374, + "learning_rate": 9.228963070143118e-08, + "loss": 0.1823, + "step": 9947 + }, + { + "epoch": 2.647152740819585, + "grad_norm": 0.2689272165298462, + "learning_rate": 9.227279261997774e-08, + "loss": 0.1672, + "step": 9948 + }, + { + "epoch": 2.6474188398084086, + "grad_norm": 0.267954021692276, + "learning_rate": 9.225595475892e-08, + "loss": 0.1789, + "step": 9949 + }, + { + "epoch": 2.6476849387972328, + "grad_norm": 0.3356037139892578, + "learning_rate": 9.223911711873813e-08, + "loss": 0.1809, + "step": 9950 + }, + { + "epoch": 2.6479510377860564, + "grad_norm": 0.3746070861816406, + "learning_rate": 9.222227969991247e-08, + "loss": 0.2049, + "step": 9951 + }, + { + "epoch": 2.64821713677488, + "grad_norm": 0.35864678025245667, + "learning_rate": 9.220544250292314e-08, + "loss": 0.1916, + "step": 9952 + }, + { + "epoch": 2.6484832357637043, + "grad_norm": 0.2871319353580475, + "learning_rate": 9.218860552825045e-08, + "loss": 0.1946, + "step": 9953 + }, + { + "epoch": 2.648749334752528, + "grad_norm": 0.3123330771923065, + "learning_rate": 9.217176877637458e-08, + "loss": 0.1784, + "step": 9954 + }, + { + "epoch": 2.6490154337413516, + "grad_norm": 0.32265153527259827, + "learning_rate": 9.215493224777582e-08, + "loss": 0.1838, + "step": 9955 + }, + { + "epoch": 2.6492815327301757, + "grad_norm": 0.2640184462070465, + "learning_rate": 9.21380959429343e-08, + "loss": 0.1682, + "step": 9956 + }, + { + "epoch": 2.6495476317189994, + "grad_norm": 0.2690771520137787, + "learning_rate": 9.212125986233026e-08, + "loss": 0.1921, + "step": 9957 + }, + { + "epoch": 2.649813730707823, + "grad_norm": 0.2766142785549164, + "learning_rate": 9.21044240064439e-08, + "loss": 0.1665, + "step": 9958 + }, + { + "epoch": 2.6500798296966472, + "grad_norm": 0.25619444251060486, + "learning_rate": 9.208758837575546e-08, + "loss": 0.1711, + "step": 9959 + }, + { + "epoch": 2.650345928685471, + "grad_norm": 0.4016231596469879, + "learning_rate": 9.207075297074499e-08, + "loss": 0.2188, + "step": 9960 + }, + { + "epoch": 2.6506120276742946, + "grad_norm": 0.3645983338356018, + "learning_rate": 9.205391779189284e-08, + "loss": 0.182, + "step": 9961 + }, + { + "epoch": 2.6508781266631187, + "grad_norm": 0.34192997217178345, + "learning_rate": 9.203708283967905e-08, + "loss": 0.2032, + "step": 9962 + }, + { + "epoch": 2.6511442256519424, + "grad_norm": 0.2653360366821289, + "learning_rate": 9.202024811458385e-08, + "loss": 0.1789, + "step": 9963 + }, + { + "epoch": 2.651410324640766, + "grad_norm": 0.39346277713775635, + "learning_rate": 9.200341361708739e-08, + "loss": 0.1851, + "step": 9964 + }, + { + "epoch": 2.65167642362959, + "grad_norm": 0.27569305896759033, + "learning_rate": 9.198657934766986e-08, + "loss": 0.1832, + "step": 9965 + }, + { + "epoch": 2.651942522618414, + "grad_norm": 0.3245052397251129, + "learning_rate": 9.196974530681132e-08, + "loss": 0.1707, + "step": 9966 + }, + { + "epoch": 2.652208621607238, + "grad_norm": 0.40541043877601624, + "learning_rate": 9.195291149499196e-08, + "loss": 0.1907, + "step": 9967 + }, + { + "epoch": 2.6524747205960617, + "grad_norm": 0.40088897943496704, + "learning_rate": 9.193607791269195e-08, + "loss": 0.1773, + "step": 9968 + }, + { + "epoch": 2.652740819584886, + "grad_norm": 0.27515652775764465, + "learning_rate": 9.191924456039135e-08, + "loss": 0.1804, + "step": 9969 + }, + { + "epoch": 2.6530069185737095, + "grad_norm": 0.2933943569660187, + "learning_rate": 9.190241143857037e-08, + "loss": 0.1768, + "step": 9970 + }, + { + "epoch": 2.653273017562533, + "grad_norm": 0.3275844156742096, + "learning_rate": 9.188557854770902e-08, + "loss": 0.1741, + "step": 9971 + }, + { + "epoch": 2.6535391165513573, + "grad_norm": 0.38079652190208435, + "learning_rate": 9.186874588828749e-08, + "loss": 0.1772, + "step": 9972 + }, + { + "epoch": 2.653805215540181, + "grad_norm": 0.37537306547164917, + "learning_rate": 9.185191346078583e-08, + "loss": 0.1861, + "step": 9973 + }, + { + "epoch": 2.6540713145290047, + "grad_norm": 0.26127272844314575, + "learning_rate": 9.183508126568417e-08, + "loss": 0.1702, + "step": 9974 + }, + { + "epoch": 2.654337413517829, + "grad_norm": 0.26497095823287964, + "learning_rate": 9.181824930346258e-08, + "loss": 0.1683, + "step": 9975 + }, + { + "epoch": 2.6546035125066525, + "grad_norm": 0.35691484808921814, + "learning_rate": 9.180141757460116e-08, + "loss": 0.1848, + "step": 9976 + }, + { + "epoch": 2.654869611495476, + "grad_norm": 0.4080338180065155, + "learning_rate": 9.178458607957994e-08, + "loss": 0.1808, + "step": 9977 + }, + { + "epoch": 2.6551357104843003, + "grad_norm": 0.2898789644241333, + "learning_rate": 9.176775481887904e-08, + "loss": 0.1609, + "step": 9978 + }, + { + "epoch": 2.655401809473124, + "grad_norm": 0.3777288794517517, + "learning_rate": 9.175092379297849e-08, + "loss": 0.2041, + "step": 9979 + }, + { + "epoch": 2.6556679084619477, + "grad_norm": 0.27949631214141846, + "learning_rate": 9.173409300235839e-08, + "loss": 0.1916, + "step": 9980 + }, + { + "epoch": 2.655934007450772, + "grad_norm": 0.2625676989555359, + "learning_rate": 9.171726244749871e-08, + "loss": 0.1721, + "step": 9981 + }, + { + "epoch": 2.6562001064395955, + "grad_norm": 0.33419033885002136, + "learning_rate": 9.170043212887957e-08, + "loss": 0.1637, + "step": 9982 + }, + { + "epoch": 2.656466205428419, + "grad_norm": 0.3892166018486023, + "learning_rate": 9.168360204698094e-08, + "loss": 0.1931, + "step": 9983 + }, + { + "epoch": 2.6567323044172433, + "grad_norm": 0.27435746788978577, + "learning_rate": 9.166677220228287e-08, + "loss": 0.1796, + "step": 9984 + }, + { + "epoch": 2.656998403406067, + "grad_norm": 0.38653799891471863, + "learning_rate": 9.164994259526544e-08, + "loss": 0.1944, + "step": 9985 + }, + { + "epoch": 2.6572645023948906, + "grad_norm": 0.31604230403900146, + "learning_rate": 9.163311322640857e-08, + "loss": 0.1632, + "step": 9986 + }, + { + "epoch": 2.6575306013837148, + "grad_norm": 0.2735404372215271, + "learning_rate": 9.161628409619236e-08, + "loss": 0.1858, + "step": 9987 + }, + { + "epoch": 2.6577967003725385, + "grad_norm": 0.3109484910964966, + "learning_rate": 9.159945520509671e-08, + "loss": 0.142, + "step": 9988 + }, + { + "epoch": 2.6580627993613626, + "grad_norm": 0.31654784083366394, + "learning_rate": 9.158262655360172e-08, + "loss": 0.1854, + "step": 9989 + }, + { + "epoch": 2.6583288983501863, + "grad_norm": 0.36993929743766785, + "learning_rate": 9.156579814218727e-08, + "loss": 0.1888, + "step": 9990 + }, + { + "epoch": 2.65859499733901, + "grad_norm": 0.3101636469364166, + "learning_rate": 9.154896997133347e-08, + "loss": 0.1889, + "step": 9991 + }, + { + "epoch": 2.658861096327834, + "grad_norm": 0.329862117767334, + "learning_rate": 9.153214204152017e-08, + "loss": 0.1673, + "step": 9992 + }, + { + "epoch": 2.6591271953166578, + "grad_norm": 0.408832311630249, + "learning_rate": 9.151531435322739e-08, + "loss": 0.1832, + "step": 9993 + }, + { + "epoch": 2.659393294305482, + "grad_norm": 0.27455970644950867, + "learning_rate": 9.149848690693508e-08, + "loss": 0.1833, + "step": 9994 + }, + { + "epoch": 2.6596593932943056, + "grad_norm": 0.2712862193584442, + "learning_rate": 9.148165970312323e-08, + "loss": 0.168, + "step": 9995 + }, + { + "epoch": 2.6599254922831292, + "grad_norm": 0.36131203174591064, + "learning_rate": 9.146483274227173e-08, + "loss": 0.1999, + "step": 9996 + }, + { + "epoch": 2.6601915912719534, + "grad_norm": 0.2764999568462372, + "learning_rate": 9.144800602486056e-08, + "loss": 0.1943, + "step": 9997 + }, + { + "epoch": 2.660457690260777, + "grad_norm": 0.28637808561325073, + "learning_rate": 9.14311795513696e-08, + "loss": 0.188, + "step": 9998 + }, + { + "epoch": 2.6607237892496007, + "grad_norm": 0.28202563524246216, + "learning_rate": 9.141435332227888e-08, + "loss": 0.1787, + "step": 9999 + }, + { + "epoch": 2.660989888238425, + "grad_norm": 0.28670307993888855, + "learning_rate": 9.139752733806817e-08, + "loss": 0.1747, + "step": 10000 + }, + { + "epoch": 2.6612559872272485, + "grad_norm": 0.267574280500412, + "learning_rate": 9.138070159921754e-08, + "loss": 0.1628, + "step": 10001 + }, + { + "epoch": 2.6615220862160722, + "grad_norm": 0.29030242562294006, + "learning_rate": 9.136387610620675e-08, + "loss": 0.1845, + "step": 10002 + }, + { + "epoch": 2.6617881852048964, + "grad_norm": 0.2709352672100067, + "learning_rate": 9.134705085951578e-08, + "loss": 0.186, + "step": 10003 + }, + { + "epoch": 2.66205428419372, + "grad_norm": 0.34458276629447937, + "learning_rate": 9.133022585962451e-08, + "loss": 0.1919, + "step": 10004 + }, + { + "epoch": 2.6623203831825437, + "grad_norm": 0.38303443789482117, + "learning_rate": 9.131340110701281e-08, + "loss": 0.185, + "step": 10005 + }, + { + "epoch": 2.662586482171368, + "grad_norm": 0.27487683296203613, + "learning_rate": 9.129657660216059e-08, + "loss": 0.1922, + "step": 10006 + }, + { + "epoch": 2.6628525811601915, + "grad_norm": 0.2595682740211487, + "learning_rate": 9.127975234554767e-08, + "loss": 0.1572, + "step": 10007 + }, + { + "epoch": 2.663118680149015, + "grad_norm": 0.3731018602848053, + "learning_rate": 9.126292833765393e-08, + "loss": 0.1875, + "step": 10008 + }, + { + "epoch": 2.6633847791378393, + "grad_norm": 0.3953750431537628, + "learning_rate": 9.124610457895922e-08, + "loss": 0.1879, + "step": 10009 + }, + { + "epoch": 2.663650878126663, + "grad_norm": 0.24793919920921326, + "learning_rate": 9.122928106994344e-08, + "loss": 0.1582, + "step": 10010 + }, + { + "epoch": 2.6639169771154867, + "grad_norm": 0.33563607931137085, + "learning_rate": 9.121245781108632e-08, + "loss": 0.1776, + "step": 10011 + }, + { + "epoch": 2.664183076104311, + "grad_norm": 0.3061542809009552, + "learning_rate": 9.119563480286783e-08, + "loss": 0.1957, + "step": 10012 + }, + { + "epoch": 2.6644491750931345, + "grad_norm": 0.3935464918613434, + "learning_rate": 9.117881204576768e-08, + "loss": 0.1882, + "step": 10013 + }, + { + "epoch": 2.6647152740819586, + "grad_norm": 0.34469011425971985, + "learning_rate": 9.116198954026576e-08, + "loss": 0.1909, + "step": 10014 + }, + { + "epoch": 2.6649813730707823, + "grad_norm": 0.27872711420059204, + "learning_rate": 9.114516728684183e-08, + "loss": 0.1765, + "step": 10015 + }, + { + "epoch": 2.6652474720596064, + "grad_norm": 0.2631060779094696, + "learning_rate": 9.112834528597577e-08, + "loss": 0.1752, + "step": 10016 + }, + { + "epoch": 2.66551357104843, + "grad_norm": 0.39264926314353943, + "learning_rate": 9.111152353814729e-08, + "loss": 0.1988, + "step": 10017 + }, + { + "epoch": 2.665779670037254, + "grad_norm": 0.4331711530685425, + "learning_rate": 9.109470204383623e-08, + "loss": 0.1936, + "step": 10018 + }, + { + "epoch": 2.666045769026078, + "grad_norm": 0.38755470514297485, + "learning_rate": 9.107788080352235e-08, + "loss": 0.2048, + "step": 10019 + }, + { + "epoch": 2.6663118680149016, + "grad_norm": 0.2579104006290436, + "learning_rate": 9.106105981768545e-08, + "loss": 0.1726, + "step": 10020 + }, + { + "epoch": 2.6665779670037253, + "grad_norm": 0.3030620515346527, + "learning_rate": 9.104423908680532e-08, + "loss": 0.1951, + "step": 10021 + }, + { + "epoch": 2.6668440659925494, + "grad_norm": 0.30293431878089905, + "learning_rate": 9.102741861136165e-08, + "loss": 0.184, + "step": 10022 + }, + { + "epoch": 2.667110164981373, + "grad_norm": 0.3313908576965332, + "learning_rate": 9.101059839183426e-08, + "loss": 0.1943, + "step": 10023 + }, + { + "epoch": 2.667376263970197, + "grad_norm": 0.32024282217025757, + "learning_rate": 9.099377842870285e-08, + "loss": 0.1756, + "step": 10024 + }, + { + "epoch": 2.667642362959021, + "grad_norm": 0.2513349652290344, + "learning_rate": 9.09769587224472e-08, + "loss": 0.1651, + "step": 10025 + }, + { + "epoch": 2.6679084619478446, + "grad_norm": 0.34395062923431396, + "learning_rate": 9.0960139273547e-08, + "loss": 0.1772, + "step": 10026 + }, + { + "epoch": 2.6681745609366683, + "grad_norm": 0.2734125852584839, + "learning_rate": 9.094332008248205e-08, + "loss": 0.1762, + "step": 10027 + }, + { + "epoch": 2.6684406599254924, + "grad_norm": 0.2657914161682129, + "learning_rate": 9.092650114973195e-08, + "loss": 0.1777, + "step": 10028 + }, + { + "epoch": 2.668706758914316, + "grad_norm": 0.27411705255508423, + "learning_rate": 9.090968247577652e-08, + "loss": 0.1801, + "step": 10029 + }, + { + "epoch": 2.6689728579031398, + "grad_norm": 0.28035086393356323, + "learning_rate": 9.089286406109538e-08, + "loss": 0.1922, + "step": 10030 + }, + { + "epoch": 2.669238956891964, + "grad_norm": 0.284420907497406, + "learning_rate": 9.08760459061683e-08, + "loss": 0.1629, + "step": 10031 + }, + { + "epoch": 2.6695050558807876, + "grad_norm": 0.2715936303138733, + "learning_rate": 9.085922801147491e-08, + "loss": 0.1843, + "step": 10032 + }, + { + "epoch": 2.6697711548696113, + "grad_norm": 0.25250008702278137, + "learning_rate": 9.084241037749493e-08, + "loss": 0.174, + "step": 10033 + }, + { + "epoch": 2.6700372538584354, + "grad_norm": 0.3138415813446045, + "learning_rate": 9.0825593004708e-08, + "loss": 0.1794, + "step": 10034 + }, + { + "epoch": 2.670303352847259, + "grad_norm": 0.28953808546066284, + "learning_rate": 9.080877589359383e-08, + "loss": 0.1562, + "step": 10035 + }, + { + "epoch": 2.670569451836083, + "grad_norm": 0.3085833191871643, + "learning_rate": 9.0791959044632e-08, + "loss": 0.1679, + "step": 10036 + }, + { + "epoch": 2.670835550824907, + "grad_norm": 0.2558818757534027, + "learning_rate": 9.077514245830224e-08, + "loss": 0.1495, + "step": 10037 + }, + { + "epoch": 2.6711016498137305, + "grad_norm": 0.3288246989250183, + "learning_rate": 9.075832613508414e-08, + "loss": 0.1996, + "step": 10038 + }, + { + "epoch": 2.6713677488025547, + "grad_norm": 0.3200533390045166, + "learning_rate": 9.074151007545736e-08, + "loss": 0.1766, + "step": 10039 + }, + { + "epoch": 2.6716338477913784, + "grad_norm": 0.42297205328941345, + "learning_rate": 9.072469427990155e-08, + "loss": 0.1872, + "step": 10040 + }, + { + "epoch": 2.6718999467802025, + "grad_norm": 0.3786930441856384, + "learning_rate": 9.070787874889628e-08, + "loss": 0.1903, + "step": 10041 + }, + { + "epoch": 2.672166045769026, + "grad_norm": 0.2912586033344269, + "learning_rate": 9.069106348292125e-08, + "loss": 0.1734, + "step": 10042 + }, + { + "epoch": 2.67243214475785, + "grad_norm": 0.3342036008834839, + "learning_rate": 9.067424848245595e-08, + "loss": 0.1811, + "step": 10043 + }, + { + "epoch": 2.672698243746674, + "grad_norm": 0.3246244788169861, + "learning_rate": 9.065743374798005e-08, + "loss": 0.1997, + "step": 10044 + }, + { + "epoch": 2.6729643427354977, + "grad_norm": 0.2731422185897827, + "learning_rate": 9.064061927997311e-08, + "loss": 0.1785, + "step": 10045 + }, + { + "epoch": 2.6732304417243213, + "grad_norm": 0.260247141122818, + "learning_rate": 9.062380507891478e-08, + "loss": 0.1796, + "step": 10046 + }, + { + "epoch": 2.6734965407131455, + "grad_norm": 0.3045346736907959, + "learning_rate": 9.060699114528451e-08, + "loss": 0.1721, + "step": 10047 + }, + { + "epoch": 2.673762639701969, + "grad_norm": 0.34339508414268494, + "learning_rate": 9.0590177479562e-08, + "loss": 0.1959, + "step": 10048 + }, + { + "epoch": 2.674028738690793, + "grad_norm": 0.2739868462085724, + "learning_rate": 9.05733640822267e-08, + "loss": 0.1815, + "step": 10049 + }, + { + "epoch": 2.674294837679617, + "grad_norm": 0.3320675194263458, + "learning_rate": 9.055655095375824e-08, + "loss": 0.1917, + "step": 10050 + }, + { + "epoch": 2.6745609366684406, + "grad_norm": 0.2681055963039398, + "learning_rate": 9.053973809463614e-08, + "loss": 0.1852, + "step": 10051 + }, + { + "epoch": 2.6748270356572643, + "grad_norm": 0.2742852568626404, + "learning_rate": 9.052292550533994e-08, + "loss": 0.1831, + "step": 10052 + }, + { + "epoch": 2.6750931346460884, + "grad_norm": 0.2837640047073364, + "learning_rate": 9.050611318634913e-08, + "loss": 0.176, + "step": 10053 + }, + { + "epoch": 2.675359233634912, + "grad_norm": 0.25861096382141113, + "learning_rate": 9.048930113814331e-08, + "loss": 0.1593, + "step": 10054 + }, + { + "epoch": 2.675625332623736, + "grad_norm": 0.27250322699546814, + "learning_rate": 9.047248936120189e-08, + "loss": 0.1854, + "step": 10055 + }, + { + "epoch": 2.67589143161256, + "grad_norm": 0.28410518169403076, + "learning_rate": 9.04556778560045e-08, + "loss": 0.1795, + "step": 10056 + }, + { + "epoch": 2.6761575306013836, + "grad_norm": 0.3335801959037781, + "learning_rate": 9.043886662303053e-08, + "loss": 0.1963, + "step": 10057 + }, + { + "epoch": 2.6764236295902073, + "grad_norm": 0.3114930987358093, + "learning_rate": 9.04220556627595e-08, + "loss": 0.1898, + "step": 10058 + }, + { + "epoch": 2.6766897285790314, + "grad_norm": 0.26958000659942627, + "learning_rate": 9.040524497567093e-08, + "loss": 0.174, + "step": 10059 + }, + { + "epoch": 2.676955827567855, + "grad_norm": 0.25257596373558044, + "learning_rate": 9.038843456224426e-08, + "loss": 0.1686, + "step": 10060 + }, + { + "epoch": 2.6772219265566792, + "grad_norm": 0.44411981105804443, + "learning_rate": 9.0371624422959e-08, + "loss": 0.2014, + "step": 10061 + }, + { + "epoch": 2.677488025545503, + "grad_norm": 0.2591886520385742, + "learning_rate": 9.035481455829452e-08, + "loss": 0.161, + "step": 10062 + }, + { + "epoch": 2.677754124534327, + "grad_norm": 0.2547597587108612, + "learning_rate": 9.033800496873039e-08, + "loss": 0.1706, + "step": 10063 + }, + { + "epoch": 2.6780202235231507, + "grad_norm": 0.28233733773231506, + "learning_rate": 9.032119565474596e-08, + "loss": 0.1895, + "step": 10064 + }, + { + "epoch": 2.6782863225119744, + "grad_norm": 0.4520227909088135, + "learning_rate": 9.03043866168207e-08, + "loss": 0.2012, + "step": 10065 + }, + { + "epoch": 2.6785524215007985, + "grad_norm": 0.2692636549472809, + "learning_rate": 9.028757785543404e-08, + "loss": 0.1721, + "step": 10066 + }, + { + "epoch": 2.678818520489622, + "grad_norm": 0.39036762714385986, + "learning_rate": 9.027076937106541e-08, + "loss": 0.2014, + "step": 10067 + }, + { + "epoch": 2.679084619478446, + "grad_norm": 0.4733116924762726, + "learning_rate": 9.025396116419419e-08, + "loss": 0.1909, + "step": 10068 + }, + { + "epoch": 2.67935071846727, + "grad_norm": 0.35945749282836914, + "learning_rate": 9.023715323529981e-08, + "loss": 0.1872, + "step": 10069 + }, + { + "epoch": 2.6796168174560937, + "grad_norm": 0.2562508285045624, + "learning_rate": 9.022034558486164e-08, + "loss": 0.1556, + "step": 10070 + }, + { + "epoch": 2.6798829164449174, + "grad_norm": 0.28426995873451233, + "learning_rate": 9.020353821335913e-08, + "loss": 0.1572, + "step": 10071 + }, + { + "epoch": 2.6801490154337415, + "grad_norm": 0.3121631443500519, + "learning_rate": 9.018673112127158e-08, + "loss": 0.1767, + "step": 10072 + }, + { + "epoch": 2.680415114422565, + "grad_norm": 3.903776168823242, + "learning_rate": 9.016992430907841e-08, + "loss": 0.1917, + "step": 10073 + }, + { + "epoch": 2.680681213411389, + "grad_norm": 0.37996363639831543, + "learning_rate": 9.015311777725895e-08, + "loss": 0.1724, + "step": 10074 + }, + { + "epoch": 2.680947312400213, + "grad_norm": 0.839939534664154, + "learning_rate": 9.01363115262926e-08, + "loss": 0.1869, + "step": 10075 + }, + { + "epoch": 2.6812134113890367, + "grad_norm": 0.36720582842826843, + "learning_rate": 9.01195055566587e-08, + "loss": 0.1674, + "step": 10076 + }, + { + "epoch": 2.6814795103778604, + "grad_norm": 0.28125807642936707, + "learning_rate": 9.010269986883656e-08, + "loss": 0.1885, + "step": 10077 + }, + { + "epoch": 2.6817456093666845, + "grad_norm": 0.2659710645675659, + "learning_rate": 9.008589446330558e-08, + "loss": 0.1641, + "step": 10078 + }, + { + "epoch": 2.682011708355508, + "grad_norm": 0.36179319024086, + "learning_rate": 9.006908934054499e-08, + "loss": 0.1754, + "step": 10079 + }, + { + "epoch": 2.682277807344332, + "grad_norm": 0.3280620872974396, + "learning_rate": 9.005228450103417e-08, + "loss": 0.1913, + "step": 10080 + }, + { + "epoch": 2.682543906333156, + "grad_norm": 0.49512189626693726, + "learning_rate": 9.003547994525239e-08, + "loss": 0.1947, + "step": 10081 + }, + { + "epoch": 2.6828100053219797, + "grad_norm": 0.3238423466682434, + "learning_rate": 9.001867567367902e-08, + "loss": 0.1743, + "step": 10082 + }, + { + "epoch": 2.6830761043108033, + "grad_norm": 0.46297404170036316, + "learning_rate": 9.000187168679325e-08, + "loss": 0.1717, + "step": 10083 + }, + { + "epoch": 2.6833422032996275, + "grad_norm": 0.26602375507354736, + "learning_rate": 8.998506798507444e-08, + "loss": 0.1747, + "step": 10084 + }, + { + "epoch": 2.683608302288451, + "grad_norm": 0.38010963797569275, + "learning_rate": 8.996826456900183e-08, + "loss": 0.2027, + "step": 10085 + }, + { + "epoch": 2.6838744012772753, + "grad_norm": 0.2854499816894531, + "learning_rate": 8.995146143905474e-08, + "loss": 0.1798, + "step": 10086 + }, + { + "epoch": 2.684140500266099, + "grad_norm": 0.32503682374954224, + "learning_rate": 8.993465859571231e-08, + "loss": 0.1966, + "step": 10087 + }, + { + "epoch": 2.684406599254923, + "grad_norm": 0.3568556308746338, + "learning_rate": 8.991785603945395e-08, + "loss": 0.1811, + "step": 10088 + }, + { + "epoch": 2.6846726982437468, + "grad_norm": 0.2960200309753418, + "learning_rate": 8.990105377075878e-08, + "loss": 0.2062, + "step": 10089 + }, + { + "epoch": 2.6849387972325705, + "grad_norm": 0.2944389581680298, + "learning_rate": 8.98842517901061e-08, + "loss": 0.175, + "step": 10090 + }, + { + "epoch": 2.6852048962213946, + "grad_norm": 0.3684190511703491, + "learning_rate": 8.986745009797508e-08, + "loss": 0.181, + "step": 10091 + }, + { + "epoch": 2.6854709952102183, + "grad_norm": 0.2968311011791229, + "learning_rate": 8.985064869484504e-08, + "loss": 0.1794, + "step": 10092 + }, + { + "epoch": 2.685737094199042, + "grad_norm": 0.37255623936653137, + "learning_rate": 8.983384758119507e-08, + "loss": 0.168, + "step": 10093 + }, + { + "epoch": 2.686003193187866, + "grad_norm": 0.4769171178340912, + "learning_rate": 8.981704675750441e-08, + "loss": 0.2207, + "step": 10094 + }, + { + "epoch": 2.6862692921766897, + "grad_norm": 0.2988687753677368, + "learning_rate": 8.98002462242523e-08, + "loss": 0.1744, + "step": 10095 + }, + { + "epoch": 2.6865353911655134, + "grad_norm": 0.2969455122947693, + "learning_rate": 8.978344598191788e-08, + "loss": 0.1751, + "step": 10096 + }, + { + "epoch": 2.6868014901543376, + "grad_norm": 0.25111281871795654, + "learning_rate": 8.976664603098039e-08, + "loss": 0.1527, + "step": 10097 + }, + { + "epoch": 2.6870675891431612, + "grad_norm": 0.2567497789859772, + "learning_rate": 8.97498463719189e-08, + "loss": 0.1639, + "step": 10098 + }, + { + "epoch": 2.687333688131985, + "grad_norm": 0.2680957615375519, + "learning_rate": 8.973304700521264e-08, + "loss": 0.1568, + "step": 10099 + }, + { + "epoch": 2.687599787120809, + "grad_norm": 0.2660462558269501, + "learning_rate": 8.971624793134072e-08, + "loss": 0.1786, + "step": 10100 + }, + { + "epoch": 2.6878658861096327, + "grad_norm": 0.34010758996009827, + "learning_rate": 8.969944915078234e-08, + "loss": 0.1846, + "step": 10101 + }, + { + "epoch": 2.6881319850984564, + "grad_norm": 0.29962822794914246, + "learning_rate": 8.968265066401657e-08, + "loss": 0.1658, + "step": 10102 + }, + { + "epoch": 2.6883980840872805, + "grad_norm": 0.3072107136249542, + "learning_rate": 8.96658524715226e-08, + "loss": 0.1803, + "step": 10103 + }, + { + "epoch": 2.688664183076104, + "grad_norm": 0.35288041830062866, + "learning_rate": 8.96490545737795e-08, + "loss": 0.1778, + "step": 10104 + }, + { + "epoch": 2.688930282064928, + "grad_norm": 0.2697868347167969, + "learning_rate": 8.96322569712664e-08, + "loss": 0.164, + "step": 10105 + }, + { + "epoch": 2.689196381053752, + "grad_norm": 0.3362870216369629, + "learning_rate": 8.961545966446239e-08, + "loss": 0.1791, + "step": 10106 + }, + { + "epoch": 2.6894624800425757, + "grad_norm": 0.30321216583251953, + "learning_rate": 8.959866265384661e-08, + "loss": 0.1604, + "step": 10107 + }, + { + "epoch": 2.6897285790314, + "grad_norm": 0.28540584444999695, + "learning_rate": 8.958186593989807e-08, + "loss": 0.1744, + "step": 10108 + }, + { + "epoch": 2.6899946780202235, + "grad_norm": 0.2982577681541443, + "learning_rate": 8.956506952309591e-08, + "loss": 0.1972, + "step": 10109 + }, + { + "epoch": 2.690260777009047, + "grad_norm": 0.3096013367176056, + "learning_rate": 8.954827340391912e-08, + "loss": 0.1805, + "step": 10110 + }, + { + "epoch": 2.6905268759978713, + "grad_norm": 0.3607174754142761, + "learning_rate": 8.95314775828469e-08, + "loss": 0.2043, + "step": 10111 + }, + { + "epoch": 2.690792974986695, + "grad_norm": 0.2816198468208313, + "learning_rate": 8.951468206035814e-08, + "loss": 0.1822, + "step": 10112 + }, + { + "epoch": 2.691059073975519, + "grad_norm": 0.4228906035423279, + "learning_rate": 8.949788683693193e-08, + "loss": 0.1859, + "step": 10113 + }, + { + "epoch": 2.691325172964343, + "grad_norm": 0.28982987999916077, + "learning_rate": 8.94810919130474e-08, + "loss": 0.1974, + "step": 10114 + }, + { + "epoch": 2.6915912719531665, + "grad_norm": 0.2579204738140106, + "learning_rate": 8.946429728918347e-08, + "loss": 0.1699, + "step": 10115 + }, + { + "epoch": 2.6918573709419906, + "grad_norm": 0.3576054275035858, + "learning_rate": 8.94475029658192e-08, + "loss": 0.2119, + "step": 10116 + }, + { + "epoch": 2.6921234699308143, + "grad_norm": 0.30816450715065, + "learning_rate": 8.943070894343357e-08, + "loss": 0.1688, + "step": 10117 + }, + { + "epoch": 2.692389568919638, + "grad_norm": 0.320906400680542, + "learning_rate": 8.941391522250565e-08, + "loss": 0.1769, + "step": 10118 + }, + { + "epoch": 2.692655667908462, + "grad_norm": 0.2971029579639435, + "learning_rate": 8.939712180351433e-08, + "loss": 0.1768, + "step": 10119 + }, + { + "epoch": 2.692921766897286, + "grad_norm": 0.26542961597442627, + "learning_rate": 8.938032868693866e-08, + "loss": 0.1687, + "step": 10120 + }, + { + "epoch": 2.6931878658861095, + "grad_norm": 0.34445449709892273, + "learning_rate": 8.936353587325759e-08, + "loss": 0.1904, + "step": 10121 + }, + { + "epoch": 2.6934539648749336, + "grad_norm": 0.3770556151866913, + "learning_rate": 8.934674336295012e-08, + "loss": 0.1958, + "step": 10122 + }, + { + "epoch": 2.6937200638637573, + "grad_norm": 0.363455593585968, + "learning_rate": 8.932995115649515e-08, + "loss": 0.187, + "step": 10123 + }, + { + "epoch": 2.693986162852581, + "grad_norm": 0.2981138229370117, + "learning_rate": 8.931315925437167e-08, + "loss": 0.1842, + "step": 10124 + }, + { + "epoch": 2.694252261841405, + "grad_norm": 0.4156396687030792, + "learning_rate": 8.929636765705857e-08, + "loss": 0.1714, + "step": 10125 + }, + { + "epoch": 2.6945183608302288, + "grad_norm": 0.4072192311286926, + "learning_rate": 8.927957636503487e-08, + "loss": 0.1848, + "step": 10126 + }, + { + "epoch": 2.6947844598190525, + "grad_norm": 0.4341007173061371, + "learning_rate": 8.926278537877935e-08, + "loss": 0.1922, + "step": 10127 + }, + { + "epoch": 2.6950505588078766, + "grad_norm": 0.261820912361145, + "learning_rate": 8.924599469877111e-08, + "loss": 0.1727, + "step": 10128 + }, + { + "epoch": 2.6953166577967003, + "grad_norm": 0.2592100501060486, + "learning_rate": 8.92292043254889e-08, + "loss": 0.1759, + "step": 10129 + }, + { + "epoch": 2.695582756785524, + "grad_norm": 0.2643696367740631, + "learning_rate": 8.921241425941166e-08, + "loss": 0.1816, + "step": 10130 + }, + { + "epoch": 2.695848855774348, + "grad_norm": 0.5677347183227539, + "learning_rate": 8.919562450101832e-08, + "loss": 0.177, + "step": 10131 + }, + { + "epoch": 2.6961149547631718, + "grad_norm": 0.27937066555023193, + "learning_rate": 8.917883505078769e-08, + "loss": 0.1869, + "step": 10132 + }, + { + "epoch": 2.696381053751996, + "grad_norm": 0.2531352937221527, + "learning_rate": 8.916204590919873e-08, + "loss": 0.1462, + "step": 10133 + }, + { + "epoch": 2.6966471527408196, + "grad_norm": 0.2875722050666809, + "learning_rate": 8.91452570767302e-08, + "loss": 0.1716, + "step": 10134 + }, + { + "epoch": 2.6969132517296437, + "grad_norm": 0.34469935297966003, + "learning_rate": 8.9128468553861e-08, + "loss": 0.1941, + "step": 10135 + }, + { + "epoch": 2.6971793507184674, + "grad_norm": 0.30269619822502136, + "learning_rate": 8.911168034106997e-08, + "loss": 0.1829, + "step": 10136 + }, + { + "epoch": 2.697445449707291, + "grad_norm": 0.34573879837989807, + "learning_rate": 8.909489243883597e-08, + "loss": 0.1741, + "step": 10137 + }, + { + "epoch": 2.697711548696115, + "grad_norm": 0.2952106297016144, + "learning_rate": 8.907810484763775e-08, + "loss": 0.1781, + "step": 10138 + }, + { + "epoch": 2.697977647684939, + "grad_norm": 0.388670951128006, + "learning_rate": 8.906131756795422e-08, + "loss": 0.1916, + "step": 10139 + }, + { + "epoch": 2.6982437466737625, + "grad_norm": 0.2737944424152374, + "learning_rate": 8.904453060026412e-08, + "loss": 0.18, + "step": 10140 + }, + { + "epoch": 2.6985098456625867, + "grad_norm": 0.2939338982105255, + "learning_rate": 8.902774394504627e-08, + "loss": 0.206, + "step": 10141 + }, + { + "epoch": 2.6987759446514104, + "grad_norm": 0.47684624791145325, + "learning_rate": 8.901095760277945e-08, + "loss": 0.1935, + "step": 10142 + }, + { + "epoch": 2.699042043640234, + "grad_norm": 0.28818753361701965, + "learning_rate": 8.899417157394249e-08, + "loss": 0.1882, + "step": 10143 + }, + { + "epoch": 2.699308142629058, + "grad_norm": 0.41158419847488403, + "learning_rate": 8.897738585901408e-08, + "loss": 0.2065, + "step": 10144 + }, + { + "epoch": 2.699574241617882, + "grad_norm": 0.26725924015045166, + "learning_rate": 8.896060045847304e-08, + "loss": 0.1773, + "step": 10145 + }, + { + "epoch": 2.6998403406067055, + "grad_norm": 0.36079150438308716, + "learning_rate": 8.89438153727981e-08, + "loss": 0.1808, + "step": 10146 + }, + { + "epoch": 2.7001064395955297, + "grad_norm": 0.34199240803718567, + "learning_rate": 8.892703060246803e-08, + "loss": 0.189, + "step": 10147 + }, + { + "epoch": 2.7003725385843533, + "grad_norm": 0.29975488781929016, + "learning_rate": 8.891024614796151e-08, + "loss": 0.1885, + "step": 10148 + }, + { + "epoch": 2.700638637573177, + "grad_norm": 0.2643377184867859, + "learning_rate": 8.889346200975734e-08, + "loss": 0.1837, + "step": 10149 + }, + { + "epoch": 2.700904736562001, + "grad_norm": 0.3505846858024597, + "learning_rate": 8.88766781883342e-08, + "loss": 0.1836, + "step": 10150 + }, + { + "epoch": 2.701170835550825, + "grad_norm": 0.2702533006668091, + "learning_rate": 8.885989468417077e-08, + "loss": 0.1627, + "step": 10151 + }, + { + "epoch": 2.7014369345396485, + "grad_norm": 0.29287290573120117, + "learning_rate": 8.884311149774581e-08, + "loss": 0.1721, + "step": 10152 + }, + { + "epoch": 2.7017030335284726, + "grad_norm": 0.28821998834609985, + "learning_rate": 8.882632862953797e-08, + "loss": 0.183, + "step": 10153 + }, + { + "epoch": 2.7019691325172963, + "grad_norm": 0.26041144132614136, + "learning_rate": 8.880954608002599e-08, + "loss": 0.1777, + "step": 10154 + }, + { + "epoch": 2.7022352315061204, + "grad_norm": 0.3602108359336853, + "learning_rate": 8.879276384968843e-08, + "loss": 0.1832, + "step": 10155 + }, + { + "epoch": 2.702501330494944, + "grad_norm": 0.4285091757774353, + "learning_rate": 8.877598193900406e-08, + "loss": 0.2214, + "step": 10156 + }, + { + "epoch": 2.702767429483768, + "grad_norm": 0.2912614345550537, + "learning_rate": 8.875920034845147e-08, + "loss": 0.176, + "step": 10157 + }, + { + "epoch": 2.703033528472592, + "grad_norm": 0.5382276773452759, + "learning_rate": 8.874241907850937e-08, + "loss": 0.1903, + "step": 10158 + }, + { + "epoch": 2.7032996274614156, + "grad_norm": 0.27825629711151123, + "learning_rate": 8.87256381296563e-08, + "loss": 0.1859, + "step": 10159 + }, + { + "epoch": 2.7035657264502397, + "grad_norm": 0.305756539106369, + "learning_rate": 8.870885750237098e-08, + "loss": 0.1887, + "step": 10160 + }, + { + "epoch": 2.7038318254390634, + "grad_norm": 0.3692728579044342, + "learning_rate": 8.869207719713196e-08, + "loss": 0.1917, + "step": 10161 + }, + { + "epoch": 2.704097924427887, + "grad_norm": 0.27538925409317017, + "learning_rate": 8.867529721441792e-08, + "loss": 0.1673, + "step": 10162 + }, + { + "epoch": 2.7043640234167112, + "grad_norm": 0.2593870162963867, + "learning_rate": 8.865851755470738e-08, + "loss": 0.171, + "step": 10163 + }, + { + "epoch": 2.704630122405535, + "grad_norm": 0.37012845277786255, + "learning_rate": 8.864173821847898e-08, + "loss": 0.1996, + "step": 10164 + }, + { + "epoch": 2.7048962213943586, + "grad_norm": 0.3410525321960449, + "learning_rate": 8.862495920621127e-08, + "loss": 0.1728, + "step": 10165 + }, + { + "epoch": 2.7051623203831827, + "grad_norm": 0.39764082431793213, + "learning_rate": 8.860818051838285e-08, + "loss": 0.1693, + "step": 10166 + }, + { + "epoch": 2.7054284193720064, + "grad_norm": 0.3480909764766693, + "learning_rate": 8.859140215547227e-08, + "loss": 0.1689, + "step": 10167 + }, + { + "epoch": 2.70569451836083, + "grad_norm": 0.2732059359550476, + "learning_rate": 8.857462411795807e-08, + "loss": 0.1747, + "step": 10168 + }, + { + "epoch": 2.705960617349654, + "grad_norm": 0.2704159617424011, + "learning_rate": 8.855784640631883e-08, + "loss": 0.1789, + "step": 10169 + }, + { + "epoch": 2.706226716338478, + "grad_norm": 0.2805172801017761, + "learning_rate": 8.854106902103304e-08, + "loss": 0.1648, + "step": 10170 + }, + { + "epoch": 2.7064928153273016, + "grad_norm": 0.2681899070739746, + "learning_rate": 8.852429196257928e-08, + "loss": 0.1677, + "step": 10171 + }, + { + "epoch": 2.7067589143161257, + "grad_norm": 0.3990040421485901, + "learning_rate": 8.8507515231436e-08, + "loss": 0.1906, + "step": 10172 + }, + { + "epoch": 2.7070250133049494, + "grad_norm": 0.2486782819032669, + "learning_rate": 8.849073882808177e-08, + "loss": 0.1569, + "step": 10173 + }, + { + "epoch": 2.707291112293773, + "grad_norm": 0.2682887017726898, + "learning_rate": 8.847396275299504e-08, + "loss": 0.1704, + "step": 10174 + }, + { + "epoch": 2.707557211282597, + "grad_norm": 0.8545610904693604, + "learning_rate": 8.845718700665432e-08, + "loss": 0.1906, + "step": 10175 + }, + { + "epoch": 2.707823310271421, + "grad_norm": 0.38903936743736267, + "learning_rate": 8.844041158953805e-08, + "loss": 0.1908, + "step": 10176 + }, + { + "epoch": 2.7080894092602446, + "grad_norm": 0.27929016947746277, + "learning_rate": 8.842363650212477e-08, + "loss": 0.1794, + "step": 10177 + }, + { + "epoch": 2.7083555082490687, + "grad_norm": 0.34160929918289185, + "learning_rate": 8.840686174489287e-08, + "loss": 0.1677, + "step": 10178 + }, + { + "epoch": 2.7086216072378924, + "grad_norm": 0.427101194858551, + "learning_rate": 8.83900873183209e-08, + "loss": 0.163, + "step": 10179 + }, + { + "epoch": 2.7088877062267165, + "grad_norm": 0.4094027280807495, + "learning_rate": 8.837331322288717e-08, + "loss": 0.1996, + "step": 10180 + }, + { + "epoch": 2.70915380521554, + "grad_norm": 0.27134865522384644, + "learning_rate": 8.835653945907019e-08, + "loss": 0.1708, + "step": 10181 + }, + { + "epoch": 2.7094199042043643, + "grad_norm": 0.3426108956336975, + "learning_rate": 8.833976602734835e-08, + "loss": 0.1937, + "step": 10182 + }, + { + "epoch": 2.709686003193188, + "grad_norm": 0.3335613012313843, + "learning_rate": 8.832299292820012e-08, + "loss": 0.192, + "step": 10183 + }, + { + "epoch": 2.7099521021820117, + "grad_norm": 0.2962746322154999, + "learning_rate": 8.83062201621038e-08, + "loss": 0.1819, + "step": 10184 + }, + { + "epoch": 2.710218201170836, + "grad_norm": 0.2588707506656647, + "learning_rate": 8.828944772953787e-08, + "loss": 0.176, + "step": 10185 + }, + { + "epoch": 2.7104843001596595, + "grad_norm": 0.25426268577575684, + "learning_rate": 8.82726756309807e-08, + "loss": 0.1551, + "step": 10186 + }, + { + "epoch": 2.710750399148483, + "grad_norm": 0.2659386694431305, + "learning_rate": 8.825590386691063e-08, + "loss": 0.1848, + "step": 10187 + }, + { + "epoch": 2.7110164981373073, + "grad_norm": 0.2758108377456665, + "learning_rate": 8.82391324378061e-08, + "loss": 0.1758, + "step": 10188 + }, + { + "epoch": 2.711282597126131, + "grad_norm": 0.3660848140716553, + "learning_rate": 8.822236134414532e-08, + "loss": 0.2001, + "step": 10189 + }, + { + "epoch": 2.7115486961149546, + "grad_norm": 0.2764400243759155, + "learning_rate": 8.820559058640682e-08, + "loss": 0.173, + "step": 10190 + }, + { + "epoch": 2.7118147951037788, + "grad_norm": 0.33397433161735535, + "learning_rate": 8.81888201650688e-08, + "loss": 0.19, + "step": 10191 + }, + { + "epoch": 2.7120808940926024, + "grad_norm": 0.2806268334388733, + "learning_rate": 8.817205008060962e-08, + "loss": 0.1736, + "step": 10192 + }, + { + "epoch": 2.712346993081426, + "grad_norm": 0.31081220507621765, + "learning_rate": 8.815528033350762e-08, + "loss": 0.1777, + "step": 10193 + }, + { + "epoch": 2.7126130920702503, + "grad_norm": 0.2900986075401306, + "learning_rate": 8.813851092424113e-08, + "loss": 0.1851, + "step": 10194 + }, + { + "epoch": 2.712879191059074, + "grad_norm": 0.27028366923332214, + "learning_rate": 8.812174185328835e-08, + "loss": 0.164, + "step": 10195 + }, + { + "epoch": 2.7131452900478976, + "grad_norm": 0.28560957312583923, + "learning_rate": 8.810497312112769e-08, + "loss": 0.1865, + "step": 10196 + }, + { + "epoch": 2.7134113890367217, + "grad_norm": 0.3551480174064636, + "learning_rate": 8.808820472823732e-08, + "loss": 0.1851, + "step": 10197 + }, + { + "epoch": 2.7136774880255454, + "grad_norm": 0.4092720150947571, + "learning_rate": 8.80714366750956e-08, + "loss": 0.2046, + "step": 10198 + }, + { + "epoch": 2.713943587014369, + "grad_norm": 0.2789689600467682, + "learning_rate": 8.80546689621807e-08, + "loss": 0.1927, + "step": 10199 + }, + { + "epoch": 2.7142096860031932, + "grad_norm": 0.25765693187713623, + "learning_rate": 8.803790158997094e-08, + "loss": 0.1811, + "step": 10200 + }, + { + "epoch": 2.714475784992017, + "grad_norm": 0.2806060016155243, + "learning_rate": 8.802113455894452e-08, + "loss": 0.1903, + "step": 10201 + }, + { + "epoch": 2.7147418839808406, + "grad_norm": 0.26114538311958313, + "learning_rate": 8.800436786957973e-08, + "loss": 0.1747, + "step": 10202 + }, + { + "epoch": 2.7150079829696647, + "grad_norm": 0.4424227476119995, + "learning_rate": 8.798760152235467e-08, + "loss": 0.2105, + "step": 10203 + }, + { + "epoch": 2.7152740819584884, + "grad_norm": 0.2534455955028534, + "learning_rate": 8.797083551774764e-08, + "loss": 0.1785, + "step": 10204 + }, + { + "epoch": 2.7155401809473125, + "grad_norm": 0.3268113434314728, + "learning_rate": 8.79540698562369e-08, + "loss": 0.1791, + "step": 10205 + }, + { + "epoch": 2.715806279936136, + "grad_norm": 0.2617591321468353, + "learning_rate": 8.79373045383005e-08, + "loss": 0.1828, + "step": 10206 + }, + { + "epoch": 2.7160723789249603, + "grad_norm": 0.3572991192340851, + "learning_rate": 8.79205395644167e-08, + "loss": 0.1823, + "step": 10207 + }, + { + "epoch": 2.716338477913784, + "grad_norm": 0.341977059841156, + "learning_rate": 8.790377493506364e-08, + "loss": 0.2021, + "step": 10208 + }, + { + "epoch": 2.7166045769026077, + "grad_norm": 0.32825109362602234, + "learning_rate": 8.788701065071956e-08, + "loss": 0.1765, + "step": 10209 + }, + { + "epoch": 2.716870675891432, + "grad_norm": 0.37775763869285583, + "learning_rate": 8.78702467118625e-08, + "loss": 0.2008, + "step": 10210 + }, + { + "epoch": 2.7171367748802555, + "grad_norm": 0.3085496127605438, + "learning_rate": 8.785348311897068e-08, + "loss": 0.182, + "step": 10211 + }, + { + "epoch": 2.717402873869079, + "grad_norm": 0.28029903769493103, + "learning_rate": 8.783671987252217e-08, + "loss": 0.1904, + "step": 10212 + }, + { + "epoch": 2.7176689728579033, + "grad_norm": 0.35570812225341797, + "learning_rate": 8.781995697299516e-08, + "loss": 0.2011, + "step": 10213 + }, + { + "epoch": 2.717935071846727, + "grad_norm": 0.3496656119823456, + "learning_rate": 8.78031944208677e-08, + "loss": 0.1911, + "step": 10214 + }, + { + "epoch": 2.7182011708355507, + "grad_norm": 0.351254940032959, + "learning_rate": 8.778643221661795e-08, + "loss": 0.1989, + "step": 10215 + }, + { + "epoch": 2.718467269824375, + "grad_norm": 0.39990293979644775, + "learning_rate": 8.776967036072394e-08, + "loss": 0.1857, + "step": 10216 + }, + { + "epoch": 2.7187333688131985, + "grad_norm": 0.29852256178855896, + "learning_rate": 8.775290885366381e-08, + "loss": 0.1732, + "step": 10217 + }, + { + "epoch": 2.718999467802022, + "grad_norm": 0.26731589436531067, + "learning_rate": 8.773614769591559e-08, + "loss": 0.1849, + "step": 10218 + }, + { + "epoch": 2.7192655667908463, + "grad_norm": 0.2912091910839081, + "learning_rate": 8.771938688795739e-08, + "loss": 0.1862, + "step": 10219 + }, + { + "epoch": 2.71953166577967, + "grad_norm": 0.27967652678489685, + "learning_rate": 8.77026264302672e-08, + "loss": 0.1915, + "step": 10220 + }, + { + "epoch": 2.7197977647684937, + "grad_norm": 0.4573948383331299, + "learning_rate": 8.768586632332309e-08, + "loss": 0.1801, + "step": 10221 + }, + { + "epoch": 2.720063863757318, + "grad_norm": 0.274983674287796, + "learning_rate": 8.766910656760311e-08, + "loss": 0.1902, + "step": 10222 + }, + { + "epoch": 2.7203299627461415, + "grad_norm": 0.2837230861186981, + "learning_rate": 8.765234716358525e-08, + "loss": 0.18, + "step": 10223 + }, + { + "epoch": 2.720596061734965, + "grad_norm": 0.2899896204471588, + "learning_rate": 8.763558811174758e-08, + "loss": 0.1633, + "step": 10224 + }, + { + "epoch": 2.7208621607237893, + "grad_norm": 0.25870653986930847, + "learning_rate": 8.761882941256803e-08, + "loss": 0.1709, + "step": 10225 + }, + { + "epoch": 2.721128259712613, + "grad_norm": 0.24654991924762726, + "learning_rate": 8.760207106652464e-08, + "loss": 0.1582, + "step": 10226 + }, + { + "epoch": 2.721394358701437, + "grad_norm": 0.25490471720695496, + "learning_rate": 8.758531307409536e-08, + "loss": 0.1708, + "step": 10227 + }, + { + "epoch": 2.7216604576902608, + "grad_norm": 0.3935433030128479, + "learning_rate": 8.75685554357582e-08, + "loss": 0.1883, + "step": 10228 + }, + { + "epoch": 2.721926556679085, + "grad_norm": 0.29085373878479004, + "learning_rate": 8.755179815199107e-08, + "loss": 0.1783, + "step": 10229 + }, + { + "epoch": 2.7221926556679086, + "grad_norm": 0.2944786548614502, + "learning_rate": 8.753504122327201e-08, + "loss": 0.1906, + "step": 10230 + }, + { + "epoch": 2.7224587546567323, + "grad_norm": 0.2565939128398895, + "learning_rate": 8.751828465007885e-08, + "loss": 0.1803, + "step": 10231 + }, + { + "epoch": 2.7227248536455564, + "grad_norm": 0.33882707357406616, + "learning_rate": 8.75015284328896e-08, + "loss": 0.1661, + "step": 10232 + }, + { + "epoch": 2.72299095263438, + "grad_norm": 0.26401060819625854, + "learning_rate": 8.748477257218214e-08, + "loss": 0.1756, + "step": 10233 + }, + { + "epoch": 2.7232570516232038, + "grad_norm": 0.27416905760765076, + "learning_rate": 8.746801706843445e-08, + "loss": 0.1832, + "step": 10234 + }, + { + "epoch": 2.723523150612028, + "grad_norm": 0.37201404571533203, + "learning_rate": 8.745126192212431e-08, + "loss": 0.1796, + "step": 10235 + }, + { + "epoch": 2.7237892496008516, + "grad_norm": 0.3022174537181854, + "learning_rate": 8.743450713372972e-08, + "loss": 0.1791, + "step": 10236 + }, + { + "epoch": 2.7240553485896752, + "grad_norm": 0.3364136815071106, + "learning_rate": 8.74177527037285e-08, + "loss": 0.1893, + "step": 10237 + }, + { + "epoch": 2.7243214475784994, + "grad_norm": 0.2865677773952484, + "learning_rate": 8.740099863259857e-08, + "loss": 0.1788, + "step": 10238 + }, + { + "epoch": 2.724587546567323, + "grad_norm": 0.2885628938674927, + "learning_rate": 8.738424492081774e-08, + "loss": 0.1721, + "step": 10239 + }, + { + "epoch": 2.7248536455561467, + "grad_norm": 0.43939244747161865, + "learning_rate": 8.736749156886385e-08, + "loss": 0.1884, + "step": 10240 + }, + { + "epoch": 2.725119744544971, + "grad_norm": 0.2995360195636749, + "learning_rate": 8.735073857721483e-08, + "loss": 0.1932, + "step": 10241 + }, + { + "epoch": 2.7253858435337945, + "grad_norm": 0.2684129774570465, + "learning_rate": 8.733398594634841e-08, + "loss": 0.1722, + "step": 10242 + }, + { + "epoch": 2.725651942522618, + "grad_norm": 0.32697856426239014, + "learning_rate": 8.731723367674248e-08, + "loss": 0.1842, + "step": 10243 + }, + { + "epoch": 2.7259180415114423, + "grad_norm": 0.3333415985107422, + "learning_rate": 8.730048176887477e-08, + "loss": 0.1717, + "step": 10244 + }, + { + "epoch": 2.726184140500266, + "grad_norm": 0.28908535838127136, + "learning_rate": 8.72837302232232e-08, + "loss": 0.1835, + "step": 10245 + }, + { + "epoch": 2.7264502394890897, + "grad_norm": 0.28265225887298584, + "learning_rate": 8.726697904026543e-08, + "loss": 0.17, + "step": 10246 + }, + { + "epoch": 2.726716338477914, + "grad_norm": 0.27673810720443726, + "learning_rate": 8.725022822047931e-08, + "loss": 0.1979, + "step": 10247 + }, + { + "epoch": 2.7269824374667375, + "grad_norm": 0.2958580255508423, + "learning_rate": 8.723347776434258e-08, + "loss": 0.1773, + "step": 10248 + }, + { + "epoch": 2.727248536455561, + "grad_norm": 0.4416358768939972, + "learning_rate": 8.721672767233303e-08, + "loss": 0.177, + "step": 10249 + }, + { + "epoch": 2.7275146354443853, + "grad_norm": 0.3145715296268463, + "learning_rate": 8.719997794492837e-08, + "loss": 0.1751, + "step": 10250 + }, + { + "epoch": 2.727780734433209, + "grad_norm": 0.4560079574584961, + "learning_rate": 8.718322858260633e-08, + "loss": 0.1833, + "step": 10251 + }, + { + "epoch": 2.728046833422033, + "grad_norm": 0.38918864727020264, + "learning_rate": 8.716647958584467e-08, + "loss": 0.1759, + "step": 10252 + }, + { + "epoch": 2.728312932410857, + "grad_norm": 0.27456963062286377, + "learning_rate": 8.714973095512112e-08, + "loss": 0.1838, + "step": 10253 + }, + { + "epoch": 2.728579031399681, + "grad_norm": 0.6774786114692688, + "learning_rate": 8.713298269091329e-08, + "loss": 0.1962, + "step": 10254 + }, + { + "epoch": 2.7288451303885046, + "grad_norm": 0.29850462079048157, + "learning_rate": 8.711623479369901e-08, + "loss": 0.1915, + "step": 10255 + }, + { + "epoch": 2.7291112293773283, + "grad_norm": 0.2651542127132416, + "learning_rate": 8.709948726395586e-08, + "loss": 0.1688, + "step": 10256 + }, + { + "epoch": 2.7293773283661524, + "grad_norm": 0.3018924593925476, + "learning_rate": 8.708274010216157e-08, + "loss": 0.1927, + "step": 10257 + }, + { + "epoch": 2.729643427354976, + "grad_norm": 0.2676025927066803, + "learning_rate": 8.706599330879374e-08, + "loss": 0.1824, + "step": 10258 + }, + { + "epoch": 2.7299095263438, + "grad_norm": 0.30266323685646057, + "learning_rate": 8.704924688433008e-08, + "loss": 0.1686, + "step": 10259 + }, + { + "epoch": 2.730175625332624, + "grad_norm": 0.28094035387039185, + "learning_rate": 8.703250082924826e-08, + "loss": 0.187, + "step": 10260 + }, + { + "epoch": 2.7304417243214476, + "grad_norm": 0.28734883666038513, + "learning_rate": 8.701575514402582e-08, + "loss": 0.18, + "step": 10261 + }, + { + "epoch": 2.7307078233102713, + "grad_norm": 0.33451539278030396, + "learning_rate": 8.699900982914044e-08, + "loss": 0.1646, + "step": 10262 + }, + { + "epoch": 2.7309739222990954, + "grad_norm": 0.35799330472946167, + "learning_rate": 8.69822648850697e-08, + "loss": 0.1997, + "step": 10263 + }, + { + "epoch": 2.731240021287919, + "grad_norm": 0.27013099193573, + "learning_rate": 8.696552031229127e-08, + "loss": 0.1848, + "step": 10264 + }, + { + "epoch": 2.7315061202767428, + "grad_norm": 0.24879994988441467, + "learning_rate": 8.694877611128263e-08, + "loss": 0.1749, + "step": 10265 + }, + { + "epoch": 2.731772219265567, + "grad_norm": 0.26015225052833557, + "learning_rate": 8.693203228252147e-08, + "loss": 0.1688, + "step": 10266 + }, + { + "epoch": 2.7320383182543906, + "grad_norm": 0.37655720114707947, + "learning_rate": 8.691528882648527e-08, + "loss": 0.1866, + "step": 10267 + }, + { + "epoch": 2.7323044172432143, + "grad_norm": 0.2954133450984955, + "learning_rate": 8.689854574365163e-08, + "loss": 0.1911, + "step": 10268 + }, + { + "epoch": 2.7325705162320384, + "grad_norm": 0.27129629254341125, + "learning_rate": 8.688180303449807e-08, + "loss": 0.1729, + "step": 10269 + }, + { + "epoch": 2.732836615220862, + "grad_norm": 0.4683985114097595, + "learning_rate": 8.68650606995022e-08, + "loss": 0.205, + "step": 10270 + }, + { + "epoch": 2.7331027142096858, + "grad_norm": 0.28207269310951233, + "learning_rate": 8.684831873914145e-08, + "loss": 0.1754, + "step": 10271 + }, + { + "epoch": 2.73336881319851, + "grad_norm": 0.38934096693992615, + "learning_rate": 8.683157715389337e-08, + "loss": 0.183, + "step": 10272 + }, + { + "epoch": 2.7336349121873336, + "grad_norm": 0.2974493205547333, + "learning_rate": 8.681483594423547e-08, + "loss": 0.1823, + "step": 10273 + }, + { + "epoch": 2.7339010111761577, + "grad_norm": 0.2852097451686859, + "learning_rate": 8.679809511064529e-08, + "loss": 0.1971, + "step": 10274 + }, + { + "epoch": 2.7341671101649814, + "grad_norm": 0.2954697906970978, + "learning_rate": 8.67813546536002e-08, + "loss": 0.1709, + "step": 10275 + }, + { + "epoch": 2.734433209153805, + "grad_norm": 0.299146831035614, + "learning_rate": 8.676461457357776e-08, + "loss": 0.1937, + "step": 10276 + }, + { + "epoch": 2.734699308142629, + "grad_norm": 0.26963260769844055, + "learning_rate": 8.674787487105542e-08, + "loss": 0.1956, + "step": 10277 + }, + { + "epoch": 2.734965407131453, + "grad_norm": 0.27322688698768616, + "learning_rate": 8.673113554651059e-08, + "loss": 0.1969, + "step": 10278 + }, + { + "epoch": 2.735231506120277, + "grad_norm": 0.2801707088947296, + "learning_rate": 8.671439660042076e-08, + "loss": 0.171, + "step": 10279 + }, + { + "epoch": 2.7354976051091007, + "grad_norm": 0.3873008191585541, + "learning_rate": 8.669765803326332e-08, + "loss": 0.1895, + "step": 10280 + }, + { + "epoch": 2.7357637040979244, + "grad_norm": 0.26196226477622986, + "learning_rate": 8.668091984551576e-08, + "loss": 0.1781, + "step": 10281 + }, + { + "epoch": 2.7360298030867485, + "grad_norm": 0.2808561623096466, + "learning_rate": 8.666418203765537e-08, + "loss": 0.1888, + "step": 10282 + }, + { + "epoch": 2.736295902075572, + "grad_norm": 0.3372064530849457, + "learning_rate": 8.664744461015963e-08, + "loss": 0.1906, + "step": 10283 + }, + { + "epoch": 2.736562001064396, + "grad_norm": 0.40297532081604004, + "learning_rate": 8.663070756350589e-08, + "loss": 0.1943, + "step": 10284 + }, + { + "epoch": 2.73682810005322, + "grad_norm": 0.2754114866256714, + "learning_rate": 8.661397089817158e-08, + "loss": 0.1829, + "step": 10285 + }, + { + "epoch": 2.7370941990420437, + "grad_norm": 0.30781739950180054, + "learning_rate": 8.659723461463398e-08, + "loss": 0.1834, + "step": 10286 + }, + { + "epoch": 2.7373602980308673, + "grad_norm": 0.3645215928554535, + "learning_rate": 8.658049871337049e-08, + "loss": 0.1989, + "step": 10287 + }, + { + "epoch": 2.7376263970196915, + "grad_norm": 0.2869147062301636, + "learning_rate": 8.656376319485844e-08, + "loss": 0.1914, + "step": 10288 + }, + { + "epoch": 2.737892496008515, + "grad_norm": 0.25994956493377686, + "learning_rate": 8.654702805957521e-08, + "loss": 0.1769, + "step": 10289 + }, + { + "epoch": 2.738158594997339, + "grad_norm": 0.283097505569458, + "learning_rate": 8.653029330799804e-08, + "loss": 0.1666, + "step": 10290 + }, + { + "epoch": 2.738424693986163, + "grad_norm": 0.38749608397483826, + "learning_rate": 8.651355894060428e-08, + "loss": 0.2071, + "step": 10291 + }, + { + "epoch": 2.7386907929749866, + "grad_norm": 0.2625177502632141, + "learning_rate": 8.649682495787121e-08, + "loss": 0.1627, + "step": 10292 + }, + { + "epoch": 2.7389568919638103, + "grad_norm": 0.3016887307167053, + "learning_rate": 8.648009136027616e-08, + "loss": 0.174, + "step": 10293 + }, + { + "epoch": 2.7392229909526344, + "grad_norm": 0.27141454815864563, + "learning_rate": 8.646335814829636e-08, + "loss": 0.1693, + "step": 10294 + }, + { + "epoch": 2.739489089941458, + "grad_norm": 0.25714218616485596, + "learning_rate": 8.644662532240908e-08, + "loss": 0.1761, + "step": 10295 + }, + { + "epoch": 2.739755188930282, + "grad_norm": 0.34422630071640015, + "learning_rate": 8.642989288309165e-08, + "loss": 0.1948, + "step": 10296 + }, + { + "epoch": 2.740021287919106, + "grad_norm": 0.28459441661834717, + "learning_rate": 8.641316083082119e-08, + "loss": 0.1773, + "step": 10297 + }, + { + "epoch": 2.7402873869079296, + "grad_norm": 0.3453534245491028, + "learning_rate": 8.639642916607502e-08, + "loss": 0.1851, + "step": 10298 + }, + { + "epoch": 2.7405534858967537, + "grad_norm": 1.3169769048690796, + "learning_rate": 8.637969788933031e-08, + "loss": 0.1923, + "step": 10299 + }, + { + "epoch": 2.7408195848855774, + "grad_norm": 0.3462636470794678, + "learning_rate": 8.636296700106434e-08, + "loss": 0.1755, + "step": 10300 + }, + { + "epoch": 2.7410856838744015, + "grad_norm": 0.27524685859680176, + "learning_rate": 8.634623650175423e-08, + "loss": 0.1897, + "step": 10301 + }, + { + "epoch": 2.7413517828632252, + "grad_norm": 0.280376136302948, + "learning_rate": 8.63295063918772e-08, + "loss": 0.1825, + "step": 10302 + }, + { + "epoch": 2.741617881852049, + "grad_norm": 0.2674706280231476, + "learning_rate": 8.631277667191043e-08, + "loss": 0.1658, + "step": 10303 + }, + { + "epoch": 2.741883980840873, + "grad_norm": 0.29353994131088257, + "learning_rate": 8.629604734233108e-08, + "loss": 0.156, + "step": 10304 + }, + { + "epoch": 2.7421500798296967, + "grad_norm": 0.27997156977653503, + "learning_rate": 8.627931840361631e-08, + "loss": 0.1819, + "step": 10305 + }, + { + "epoch": 2.7424161788185204, + "grad_norm": 0.3603297173976898, + "learning_rate": 8.626258985624328e-08, + "loss": 0.1718, + "step": 10306 + }, + { + "epoch": 2.7426822778073445, + "grad_norm": 0.25919464230537415, + "learning_rate": 8.624586170068908e-08, + "loss": 0.1817, + "step": 10307 + }, + { + "epoch": 2.742948376796168, + "grad_norm": 0.3433176279067993, + "learning_rate": 8.622913393743086e-08, + "loss": 0.1875, + "step": 10308 + }, + { + "epoch": 2.743214475784992, + "grad_norm": 0.2856091856956482, + "learning_rate": 8.621240656694572e-08, + "loss": 0.1971, + "step": 10309 + }, + { + "epoch": 2.743480574773816, + "grad_norm": 0.27777451276779175, + "learning_rate": 8.61956795897108e-08, + "loss": 0.1765, + "step": 10310 + }, + { + "epoch": 2.7437466737626397, + "grad_norm": 0.338666707277298, + "learning_rate": 8.61789530062031e-08, + "loss": 0.1997, + "step": 10311 + }, + { + "epoch": 2.7440127727514634, + "grad_norm": 0.6035832762718201, + "learning_rate": 8.616222681689976e-08, + "loss": 0.1941, + "step": 10312 + }, + { + "epoch": 2.7442788717402875, + "grad_norm": 0.2722722887992859, + "learning_rate": 8.614550102227783e-08, + "loss": 0.1693, + "step": 10313 + }, + { + "epoch": 2.744544970729111, + "grad_norm": 0.2731606066226959, + "learning_rate": 8.612877562281435e-08, + "loss": 0.1622, + "step": 10314 + }, + { + "epoch": 2.744811069717935, + "grad_norm": 0.27494558691978455, + "learning_rate": 8.611205061898642e-08, + "loss": 0.1862, + "step": 10315 + }, + { + "epoch": 2.745077168706759, + "grad_norm": 0.4376981556415558, + "learning_rate": 8.609532601127096e-08, + "loss": 0.1944, + "step": 10316 + }, + { + "epoch": 2.7453432676955827, + "grad_norm": 0.37484797835350037, + "learning_rate": 8.607860180014514e-08, + "loss": 0.1965, + "step": 10317 + }, + { + "epoch": 2.7456093666844064, + "grad_norm": 0.3881039619445801, + "learning_rate": 8.606187798608584e-08, + "loss": 0.1908, + "step": 10318 + }, + { + "epoch": 2.7458754656732305, + "grad_norm": 0.2943107783794403, + "learning_rate": 8.604515456957012e-08, + "loss": 0.1661, + "step": 10319 + }, + { + "epoch": 2.746141564662054, + "grad_norm": 0.3508729934692383, + "learning_rate": 8.602843155107492e-08, + "loss": 0.1859, + "step": 10320 + }, + { + "epoch": 2.746407663650878, + "grad_norm": 0.2724536955356598, + "learning_rate": 8.601170893107729e-08, + "loss": 0.1698, + "step": 10321 + }, + { + "epoch": 2.746673762639702, + "grad_norm": 0.34160521626472473, + "learning_rate": 8.599498671005411e-08, + "loss": 0.1882, + "step": 10322 + }, + { + "epoch": 2.7469398616285257, + "grad_norm": 0.30196988582611084, + "learning_rate": 8.59782648884824e-08, + "loss": 0.186, + "step": 10323 + }, + { + "epoch": 2.74720596061735, + "grad_norm": 0.25606468319892883, + "learning_rate": 8.596154346683901e-08, + "loss": 0.1697, + "step": 10324 + }, + { + "epoch": 2.7474720596061735, + "grad_norm": 0.26879772543907166, + "learning_rate": 8.5944822445601e-08, + "loss": 0.1694, + "step": 10325 + }, + { + "epoch": 2.7477381585949976, + "grad_norm": 0.30136293172836304, + "learning_rate": 8.592810182524516e-08, + "loss": 0.1809, + "step": 10326 + }, + { + "epoch": 2.7480042575838213, + "grad_norm": 0.32402563095092773, + "learning_rate": 8.591138160624849e-08, + "loss": 0.1796, + "step": 10327 + }, + { + "epoch": 2.748270356572645, + "grad_norm": 0.37273484468460083, + "learning_rate": 8.589466178908782e-08, + "loss": 0.1902, + "step": 10328 + }, + { + "epoch": 2.748536455561469, + "grad_norm": 0.26499757170677185, + "learning_rate": 8.587794237424009e-08, + "loss": 0.1689, + "step": 10329 + }, + { + "epoch": 2.7488025545502928, + "grad_norm": 0.33487996459007263, + "learning_rate": 8.58612233621821e-08, + "loss": 0.1883, + "step": 10330 + }, + { + "epoch": 2.7490686535391164, + "grad_norm": 0.2865464687347412, + "learning_rate": 8.584450475339073e-08, + "loss": 0.1877, + "step": 10331 + }, + { + "epoch": 2.7493347525279406, + "grad_norm": 0.31548240780830383, + "learning_rate": 8.582778654834292e-08, + "loss": 0.1679, + "step": 10332 + }, + { + "epoch": 2.7496008515167643, + "grad_norm": 0.2848540246486664, + "learning_rate": 8.58110687475154e-08, + "loss": 0.1819, + "step": 10333 + }, + { + "epoch": 2.749866950505588, + "grad_norm": 0.27880042791366577, + "learning_rate": 8.579435135138504e-08, + "loss": 0.1707, + "step": 10334 + }, + { + "epoch": 2.750133049494412, + "grad_norm": 0.2700616419315338, + "learning_rate": 8.577763436042864e-08, + "loss": 0.1635, + "step": 10335 + }, + { + "epoch": 2.7503991484832357, + "grad_norm": 0.3823104202747345, + "learning_rate": 8.576091777512304e-08, + "loss": 0.1774, + "step": 10336 + }, + { + "epoch": 2.7506652474720594, + "grad_norm": 0.30813032388687134, + "learning_rate": 8.574420159594496e-08, + "loss": 0.2031, + "step": 10337 + }, + { + "epoch": 2.7509313464608836, + "grad_norm": 0.43764615058898926, + "learning_rate": 8.572748582337125e-08, + "loss": 0.1879, + "step": 10338 + }, + { + "epoch": 2.7511974454497072, + "grad_norm": 0.2886647582054138, + "learning_rate": 8.571077045787861e-08, + "loss": 0.1901, + "step": 10339 + }, + { + "epoch": 2.751463544438531, + "grad_norm": 0.24159595370292664, + "learning_rate": 8.569405549994389e-08, + "loss": 0.1553, + "step": 10340 + }, + { + "epoch": 2.751729643427355, + "grad_norm": 0.5920830965042114, + "learning_rate": 8.56773409500437e-08, + "loss": 0.1741, + "step": 10341 + }, + { + "epoch": 2.7519957424161787, + "grad_norm": 0.33965831995010376, + "learning_rate": 8.566062680865494e-08, + "loss": 0.1844, + "step": 10342 + }, + { + "epoch": 2.7522618414050024, + "grad_norm": 0.2668326795101166, + "learning_rate": 8.564391307625417e-08, + "loss": 0.1914, + "step": 10343 + }, + { + "epoch": 2.7525279403938265, + "grad_norm": 0.27687862515449524, + "learning_rate": 8.562719975331821e-08, + "loss": 0.1878, + "step": 10344 + }, + { + "epoch": 2.75279403938265, + "grad_norm": 0.2632240653038025, + "learning_rate": 8.56104868403237e-08, + "loss": 0.166, + "step": 10345 + }, + { + "epoch": 2.7530601383714743, + "grad_norm": 0.2836097180843353, + "learning_rate": 8.559377433774738e-08, + "loss": 0.165, + "step": 10346 + }, + { + "epoch": 2.753326237360298, + "grad_norm": 0.2619103491306305, + "learning_rate": 8.557706224606586e-08, + "loss": 0.157, + "step": 10347 + }, + { + "epoch": 2.753592336349122, + "grad_norm": 0.2668003439903259, + "learning_rate": 8.556035056575585e-08, + "loss": 0.1776, + "step": 10348 + }, + { + "epoch": 2.753858435337946, + "grad_norm": 0.2638910114765167, + "learning_rate": 8.554363929729396e-08, + "loss": 0.1785, + "step": 10349 + }, + { + "epoch": 2.7541245343267695, + "grad_norm": 0.28776484727859497, + "learning_rate": 8.552692844115686e-08, + "loss": 0.1857, + "step": 10350 + }, + { + "epoch": 2.7543906333155936, + "grad_norm": 1.6781558990478516, + "learning_rate": 8.551021799782119e-08, + "loss": 0.1734, + "step": 10351 + }, + { + "epoch": 2.7546567323044173, + "grad_norm": 0.3055911660194397, + "learning_rate": 8.549350796776354e-08, + "loss": 0.1715, + "step": 10352 + }, + { + "epoch": 2.754922831293241, + "grad_norm": 0.24533876776695251, + "learning_rate": 8.547679835146052e-08, + "loss": 0.1557, + "step": 10353 + }, + { + "epoch": 2.755188930282065, + "grad_norm": 0.3341943323612213, + "learning_rate": 8.54600891493887e-08, + "loss": 0.1763, + "step": 10354 + }, + { + "epoch": 2.755455029270889, + "grad_norm": 0.3580697178840637, + "learning_rate": 8.544338036202472e-08, + "loss": 0.1851, + "step": 10355 + }, + { + "epoch": 2.7557211282597125, + "grad_norm": 0.3141325116157532, + "learning_rate": 8.542667198984508e-08, + "loss": 0.1924, + "step": 10356 + }, + { + "epoch": 2.7559872272485366, + "grad_norm": 0.29069334268569946, + "learning_rate": 8.540996403332642e-08, + "loss": 0.1665, + "step": 10357 + }, + { + "epoch": 2.7562533262373603, + "grad_norm": 0.29978471994400024, + "learning_rate": 8.53932564929452e-08, + "loss": 0.2084, + "step": 10358 + }, + { + "epoch": 2.756519425226184, + "grad_norm": 0.27986377477645874, + "learning_rate": 8.537654936917799e-08, + "loss": 0.1766, + "step": 10359 + }, + { + "epoch": 2.756785524215008, + "grad_norm": 0.2813527286052704, + "learning_rate": 8.535984266250129e-08, + "loss": 0.1714, + "step": 10360 + }, + { + "epoch": 2.757051623203832, + "grad_norm": 0.27554088830947876, + "learning_rate": 8.534313637339167e-08, + "loss": 0.1762, + "step": 10361 + }, + { + "epoch": 2.7573177221926555, + "grad_norm": 0.2794727087020874, + "learning_rate": 8.532643050232555e-08, + "loss": 0.1951, + "step": 10362 + }, + { + "epoch": 2.7575838211814796, + "grad_norm": 0.30636993050575256, + "learning_rate": 8.530972504977947e-08, + "loss": 0.1782, + "step": 10363 + }, + { + "epoch": 2.7578499201703033, + "grad_norm": 0.2917519211769104, + "learning_rate": 8.529302001622987e-08, + "loss": 0.1901, + "step": 10364 + }, + { + "epoch": 2.758116019159127, + "grad_norm": 0.25990113615989685, + "learning_rate": 8.527631540215325e-08, + "loss": 0.171, + "step": 10365 + }, + { + "epoch": 2.758382118147951, + "grad_norm": 0.32614076137542725, + "learning_rate": 8.5259611208026e-08, + "loss": 0.1815, + "step": 10366 + }, + { + "epoch": 2.7586482171367748, + "grad_norm": 0.45086878538131714, + "learning_rate": 8.524290743432461e-08, + "loss": 0.1875, + "step": 10367 + }, + { + "epoch": 2.7589143161255985, + "grad_norm": 0.29891231656074524, + "learning_rate": 8.522620408152548e-08, + "loss": 0.1736, + "step": 10368 + }, + { + "epoch": 2.7591804151144226, + "grad_norm": 0.25706860423088074, + "learning_rate": 8.520950115010501e-08, + "loss": 0.1694, + "step": 10369 + }, + { + "epoch": 2.7594465141032463, + "grad_norm": 0.2800769507884979, + "learning_rate": 8.519279864053966e-08, + "loss": 0.1815, + "step": 10370 + }, + { + "epoch": 2.7597126130920704, + "grad_norm": 0.5480833053588867, + "learning_rate": 8.517609655330576e-08, + "loss": 0.1838, + "step": 10371 + }, + { + "epoch": 2.759978712080894, + "grad_norm": 0.28009846806526184, + "learning_rate": 8.515939488887975e-08, + "loss": 0.1938, + "step": 10372 + }, + { + "epoch": 2.760244811069718, + "grad_norm": 0.2858147621154785, + "learning_rate": 8.514269364773791e-08, + "loss": 0.1869, + "step": 10373 + }, + { + "epoch": 2.760510910058542, + "grad_norm": 0.35482102632522583, + "learning_rate": 8.512599283035666e-08, + "loss": 0.1832, + "step": 10374 + }, + { + "epoch": 2.7607770090473656, + "grad_norm": 0.3505927622318268, + "learning_rate": 8.510929243721229e-08, + "loss": 0.171, + "step": 10375 + }, + { + "epoch": 2.7610431080361897, + "grad_norm": 0.36756962537765503, + "learning_rate": 8.509259246878122e-08, + "loss": 0.1962, + "step": 10376 + }, + { + "epoch": 2.7613092070250134, + "grad_norm": 0.2789543867111206, + "learning_rate": 8.507589292553966e-08, + "loss": 0.1789, + "step": 10377 + }, + { + "epoch": 2.761575306013837, + "grad_norm": 0.3638564646244049, + "learning_rate": 8.505919380796398e-08, + "loss": 0.1703, + "step": 10378 + }, + { + "epoch": 2.761841405002661, + "grad_norm": 0.32793495059013367, + "learning_rate": 8.504249511653044e-08, + "loss": 0.1876, + "step": 10379 + }, + { + "epoch": 2.762107503991485, + "grad_norm": 0.26306700706481934, + "learning_rate": 8.502579685171536e-08, + "loss": 0.1729, + "step": 10380 + }, + { + "epoch": 2.7623736029803085, + "grad_norm": 0.2695697546005249, + "learning_rate": 8.500909901399492e-08, + "loss": 0.1651, + "step": 10381 + }, + { + "epoch": 2.7626397019691327, + "grad_norm": 0.2750864028930664, + "learning_rate": 8.499240160384551e-08, + "loss": 0.1787, + "step": 10382 + }, + { + "epoch": 2.7629058009579563, + "grad_norm": 0.27674245834350586, + "learning_rate": 8.497570462174326e-08, + "loss": 0.1754, + "step": 10383 + }, + { + "epoch": 2.76317189994678, + "grad_norm": 0.29161420464515686, + "learning_rate": 8.495900806816448e-08, + "loss": 0.1764, + "step": 10384 + }, + { + "epoch": 2.763437998935604, + "grad_norm": 0.2718483805656433, + "learning_rate": 8.49423119435853e-08, + "loss": 0.1702, + "step": 10385 + }, + { + "epoch": 2.763704097924428, + "grad_norm": 0.26662692427635193, + "learning_rate": 8.492561624848201e-08, + "loss": 0.1847, + "step": 10386 + }, + { + "epoch": 2.7639701969132515, + "grad_norm": 0.28774362802505493, + "learning_rate": 8.490892098333082e-08, + "loss": 0.1652, + "step": 10387 + }, + { + "epoch": 2.7642362959020756, + "grad_norm": 0.27167296409606934, + "learning_rate": 8.489222614860782e-08, + "loss": 0.1813, + "step": 10388 + }, + { + "epoch": 2.7645023948908993, + "grad_norm": 0.3267362117767334, + "learning_rate": 8.487553174478925e-08, + "loss": 0.1823, + "step": 10389 + }, + { + "epoch": 2.764768493879723, + "grad_norm": 1.1147489547729492, + "learning_rate": 8.485883777235123e-08, + "loss": 0.1937, + "step": 10390 + }, + { + "epoch": 2.765034592868547, + "grad_norm": 0.3610471189022064, + "learning_rate": 8.484214423176997e-08, + "loss": 0.1791, + "step": 10391 + }, + { + "epoch": 2.765300691857371, + "grad_norm": 0.28280961513519287, + "learning_rate": 8.482545112352148e-08, + "loss": 0.1551, + "step": 10392 + }, + { + "epoch": 2.765566790846195, + "grad_norm": 0.2733195722103119, + "learning_rate": 8.480875844808204e-08, + "loss": 0.168, + "step": 10393 + }, + { + "epoch": 2.7658328898350186, + "grad_norm": 0.35967111587524414, + "learning_rate": 8.479206620592763e-08, + "loss": 0.1886, + "step": 10394 + }, + { + "epoch": 2.7660989888238423, + "grad_norm": 0.2750062942504883, + "learning_rate": 8.477537439753441e-08, + "loss": 0.1754, + "step": 10395 + }, + { + "epoch": 2.7663650878126664, + "grad_norm": 0.2799331843852997, + "learning_rate": 8.475868302337844e-08, + "loss": 0.1926, + "step": 10396 + }, + { + "epoch": 2.76663118680149, + "grad_norm": 0.24189211428165436, + "learning_rate": 8.474199208393585e-08, + "loss": 0.1608, + "step": 10397 + }, + { + "epoch": 2.7668972857903142, + "grad_norm": 0.4758145809173584, + "learning_rate": 8.47253015796826e-08, + "loss": 0.1746, + "step": 10398 + }, + { + "epoch": 2.767163384779138, + "grad_norm": 0.4310840368270874, + "learning_rate": 8.470861151109481e-08, + "loss": 0.1783, + "step": 10399 + }, + { + "epoch": 2.7674294837679616, + "grad_norm": 0.29401859641075134, + "learning_rate": 8.469192187864847e-08, + "loss": 0.1922, + "step": 10400 + }, + { + "epoch": 2.7676955827567857, + "grad_norm": 0.27424490451812744, + "learning_rate": 8.467523268281968e-08, + "loss": 0.1703, + "step": 10401 + }, + { + "epoch": 2.7679616817456094, + "grad_norm": 0.258113294839859, + "learning_rate": 8.465854392408433e-08, + "loss": 0.1587, + "step": 10402 + }, + { + "epoch": 2.768227780734433, + "grad_norm": 0.5997978448867798, + "learning_rate": 8.464185560291852e-08, + "loss": 0.1993, + "step": 10403 + }, + { + "epoch": 2.7684938797232572, + "grad_norm": 0.3719400465488434, + "learning_rate": 8.462516771979818e-08, + "loss": 0.1714, + "step": 10404 + }, + { + "epoch": 2.768759978712081, + "grad_norm": 0.27436065673828125, + "learning_rate": 8.460848027519929e-08, + "loss": 0.1798, + "step": 10405 + }, + { + "epoch": 2.7690260777009046, + "grad_norm": 0.2473953664302826, + "learning_rate": 8.459179326959784e-08, + "loss": 0.1582, + "step": 10406 + }, + { + "epoch": 2.7692921766897287, + "grad_norm": 0.27409037947654724, + "learning_rate": 8.457510670346975e-08, + "loss": 0.1844, + "step": 10407 + }, + { + "epoch": 2.7695582756785524, + "grad_norm": 0.278496116399765, + "learning_rate": 8.455842057729099e-08, + "loss": 0.1621, + "step": 10408 + }, + { + "epoch": 2.769824374667376, + "grad_norm": 0.2840573489665985, + "learning_rate": 8.454173489153741e-08, + "loss": 0.1945, + "step": 10409 + }, + { + "epoch": 2.7700904736562, + "grad_norm": 0.3481525182723999, + "learning_rate": 8.4525049646685e-08, + "loss": 0.1735, + "step": 10410 + }, + { + "epoch": 2.770356572645024, + "grad_norm": 0.25926947593688965, + "learning_rate": 8.450836484320959e-08, + "loss": 0.1831, + "step": 10411 + }, + { + "epoch": 2.7706226716338476, + "grad_norm": 0.4239814877510071, + "learning_rate": 8.449168048158715e-08, + "loss": 0.1633, + "step": 10412 + }, + { + "epoch": 2.7708887706226717, + "grad_norm": 0.3158589005470276, + "learning_rate": 8.447499656229343e-08, + "loss": 0.1781, + "step": 10413 + }, + { + "epoch": 2.7711548696114954, + "grad_norm": 0.4179513454437256, + "learning_rate": 8.44583130858044e-08, + "loss": 0.2127, + "step": 10414 + }, + { + "epoch": 2.771420968600319, + "grad_norm": 0.3106321096420288, + "learning_rate": 8.444163005259583e-08, + "loss": 0.1914, + "step": 10415 + }, + { + "epoch": 2.771687067589143, + "grad_norm": 0.2774759531021118, + "learning_rate": 8.442494746314362e-08, + "loss": 0.1667, + "step": 10416 + }, + { + "epoch": 2.771953166577967, + "grad_norm": 0.27024394273757935, + "learning_rate": 8.440826531792354e-08, + "loss": 0.1765, + "step": 10417 + }, + { + "epoch": 2.772219265566791, + "grad_norm": 0.2811765968799591, + "learning_rate": 8.439158361741142e-08, + "loss": 0.1731, + "step": 10418 + }, + { + "epoch": 2.7724853645556147, + "grad_norm": 0.27740031480789185, + "learning_rate": 8.437490236208304e-08, + "loss": 0.1722, + "step": 10419 + }, + { + "epoch": 2.772751463544439, + "grad_norm": 0.2709689140319824, + "learning_rate": 8.435822155241422e-08, + "loss": 0.1708, + "step": 10420 + }, + { + "epoch": 2.7730175625332625, + "grad_norm": 0.3568779528141022, + "learning_rate": 8.434154118888068e-08, + "loss": 0.1928, + "step": 10421 + }, + { + "epoch": 2.773283661522086, + "grad_norm": 0.33729833364486694, + "learning_rate": 8.432486127195819e-08, + "loss": 0.1719, + "step": 10422 + }, + { + "epoch": 2.7735497605109103, + "grad_norm": 0.475970983505249, + "learning_rate": 8.430818180212258e-08, + "loss": 0.1993, + "step": 10423 + }, + { + "epoch": 2.773815859499734, + "grad_norm": 0.2834009528160095, + "learning_rate": 8.429150277984945e-08, + "loss": 0.1916, + "step": 10424 + }, + { + "epoch": 2.7740819584885577, + "grad_norm": 0.3395739793777466, + "learning_rate": 8.427482420561462e-08, + "loss": 0.1614, + "step": 10425 + }, + { + "epoch": 2.774348057477382, + "grad_norm": 0.28718215227127075, + "learning_rate": 8.425814607989372e-08, + "loss": 0.1837, + "step": 10426 + }, + { + "epoch": 2.7746141564662055, + "grad_norm": 0.26930561661720276, + "learning_rate": 8.424146840316255e-08, + "loss": 0.1734, + "step": 10427 + }, + { + "epoch": 2.774880255455029, + "grad_norm": 0.26505404710769653, + "learning_rate": 8.422479117589665e-08, + "loss": 0.1777, + "step": 10428 + }, + { + "epoch": 2.7751463544438533, + "grad_norm": 0.3214995563030243, + "learning_rate": 8.42081143985718e-08, + "loss": 0.1879, + "step": 10429 + }, + { + "epoch": 2.775412453432677, + "grad_norm": 0.2993544638156891, + "learning_rate": 8.41914380716636e-08, + "loss": 0.2061, + "step": 10430 + }, + { + "epoch": 2.7756785524215006, + "grad_norm": 0.4574788808822632, + "learning_rate": 8.417476219564773e-08, + "loss": 0.204, + "step": 10431 + }, + { + "epoch": 2.7759446514103248, + "grad_norm": 0.3588140606880188, + "learning_rate": 8.415808677099978e-08, + "loss": 0.1768, + "step": 10432 + }, + { + "epoch": 2.7762107503991484, + "grad_norm": 0.5174379348754883, + "learning_rate": 8.414141179819543e-08, + "loss": 0.1792, + "step": 10433 + }, + { + "epoch": 2.776476849387972, + "grad_norm": 0.28769171237945557, + "learning_rate": 8.412473727771019e-08, + "loss": 0.1832, + "step": 10434 + }, + { + "epoch": 2.7767429483767962, + "grad_norm": 0.2607186734676361, + "learning_rate": 8.410806321001973e-08, + "loss": 0.1657, + "step": 10435 + }, + { + "epoch": 2.77700904736562, + "grad_norm": 0.39834603667259216, + "learning_rate": 8.409138959559958e-08, + "loss": 0.1874, + "step": 10436 + }, + { + "epoch": 2.7772751463544436, + "grad_norm": 0.29829922318458557, + "learning_rate": 8.407471643492536e-08, + "loss": 0.1685, + "step": 10437 + }, + { + "epoch": 2.7775412453432677, + "grad_norm": 0.3400152921676636, + "learning_rate": 8.405804372847256e-08, + "loss": 0.2185, + "step": 10438 + }, + { + "epoch": 2.7778073443320914, + "grad_norm": 0.2776865065097809, + "learning_rate": 8.404137147671677e-08, + "loss": 0.1747, + "step": 10439 + }, + { + "epoch": 2.778073443320915, + "grad_norm": 0.3262992203235626, + "learning_rate": 8.402469968013348e-08, + "loss": 0.1798, + "step": 10440 + }, + { + "epoch": 2.7783395423097392, + "grad_norm": 0.4853196144104004, + "learning_rate": 8.40080283391982e-08, + "loss": 0.1919, + "step": 10441 + }, + { + "epoch": 2.778605641298563, + "grad_norm": 0.26406970620155334, + "learning_rate": 8.399135745438651e-08, + "loss": 0.1695, + "step": 10442 + }, + { + "epoch": 2.778871740287387, + "grad_norm": 0.2985631227493286, + "learning_rate": 8.397468702617375e-08, + "loss": 0.1655, + "step": 10443 + }, + { + "epoch": 2.7791378392762107, + "grad_norm": 0.4811404347419739, + "learning_rate": 8.395801705503555e-08, + "loss": 0.1954, + "step": 10444 + }, + { + "epoch": 2.779403938265035, + "grad_norm": 0.280945748090744, + "learning_rate": 8.394134754144727e-08, + "loss": 0.1852, + "step": 10445 + }, + { + "epoch": 2.7796700372538585, + "grad_norm": 0.39471811056137085, + "learning_rate": 8.392467848588441e-08, + "loss": 0.171, + "step": 10446 + }, + { + "epoch": 2.779936136242682, + "grad_norm": 0.40026435256004333, + "learning_rate": 8.390800988882236e-08, + "loss": 0.191, + "step": 10447 + }, + { + "epoch": 2.7802022352315063, + "grad_norm": 0.43457022309303284, + "learning_rate": 8.389134175073662e-08, + "loss": 0.1755, + "step": 10448 + }, + { + "epoch": 2.78046833422033, + "grad_norm": 0.3333605229854584, + "learning_rate": 8.38746740721025e-08, + "loss": 0.1847, + "step": 10449 + }, + { + "epoch": 2.7807344332091537, + "grad_norm": 0.39092251658439636, + "learning_rate": 8.385800685339547e-08, + "loss": 0.1912, + "step": 10450 + }, + { + "epoch": 2.781000532197978, + "grad_norm": 0.3985823392868042, + "learning_rate": 8.384134009509085e-08, + "loss": 0.1892, + "step": 10451 + }, + { + "epoch": 2.7812666311868015, + "grad_norm": 0.37943902611732483, + "learning_rate": 8.38246737976641e-08, + "loss": 0.188, + "step": 10452 + }, + { + "epoch": 2.781532730175625, + "grad_norm": 0.2798739969730377, + "learning_rate": 8.380800796159049e-08, + "loss": 0.1721, + "step": 10453 + }, + { + "epoch": 2.7817988291644493, + "grad_norm": 0.2834521532058716, + "learning_rate": 8.37913425873454e-08, + "loss": 0.1746, + "step": 10454 + }, + { + "epoch": 2.782064928153273, + "grad_norm": 0.29558059573173523, + "learning_rate": 8.377467767540415e-08, + "loss": 0.1708, + "step": 10455 + }, + { + "epoch": 2.7823310271420967, + "grad_norm": 0.4335280656814575, + "learning_rate": 8.37580132262421e-08, + "loss": 0.1958, + "step": 10456 + }, + { + "epoch": 2.782597126130921, + "grad_norm": 0.297942578792572, + "learning_rate": 8.374134924033445e-08, + "loss": 0.1833, + "step": 10457 + }, + { + "epoch": 2.7828632251197445, + "grad_norm": 0.3328496813774109, + "learning_rate": 8.372468571815663e-08, + "loss": 0.1736, + "step": 10458 + }, + { + "epoch": 2.783129324108568, + "grad_norm": 0.3238033354282379, + "learning_rate": 8.37080226601838e-08, + "loss": 0.1913, + "step": 10459 + }, + { + "epoch": 2.7833954230973923, + "grad_norm": 0.2616664171218872, + "learning_rate": 8.369136006689126e-08, + "loss": 0.1669, + "step": 10460 + }, + { + "epoch": 2.783661522086216, + "grad_norm": 0.3587585985660553, + "learning_rate": 8.367469793875432e-08, + "loss": 0.1798, + "step": 10461 + }, + { + "epoch": 2.7839276210750397, + "grad_norm": 0.2784512937068939, + "learning_rate": 8.365803627624813e-08, + "loss": 0.1857, + "step": 10462 + }, + { + "epoch": 2.784193720063864, + "grad_norm": 0.28461602330207825, + "learning_rate": 8.3641375079848e-08, + "loss": 0.1895, + "step": 10463 + }, + { + "epoch": 2.7844598190526875, + "grad_norm": 0.3176383972167969, + "learning_rate": 8.362471435002905e-08, + "loss": 0.1765, + "step": 10464 + }, + { + "epoch": 2.7847259180415116, + "grad_norm": 0.27046874165534973, + "learning_rate": 8.360805408726655e-08, + "loss": 0.1698, + "step": 10465 + }, + { + "epoch": 2.7849920170303353, + "grad_norm": 0.2863980233669281, + "learning_rate": 8.359139429203564e-08, + "loss": 0.1948, + "step": 10466 + }, + { + "epoch": 2.7852581160191594, + "grad_norm": 0.303558349609375, + "learning_rate": 8.357473496481154e-08, + "loss": 0.1645, + "step": 10467 + }, + { + "epoch": 2.785524215007983, + "grad_norm": 0.41179171204566956, + "learning_rate": 8.355807610606931e-08, + "loss": 0.2076, + "step": 10468 + }, + { + "epoch": 2.7857903139968068, + "grad_norm": 0.340788871049881, + "learning_rate": 8.354141771628422e-08, + "loss": 0.1808, + "step": 10469 + }, + { + "epoch": 2.786056412985631, + "grad_norm": 0.3812620937824249, + "learning_rate": 8.352475979593132e-08, + "loss": 0.1697, + "step": 10470 + }, + { + "epoch": 2.7863225119744546, + "grad_norm": 0.2871691882610321, + "learning_rate": 8.350810234548576e-08, + "loss": 0.1897, + "step": 10471 + }, + { + "epoch": 2.7865886109632783, + "grad_norm": 0.29765981435775757, + "learning_rate": 8.34914453654226e-08, + "loss": 0.1941, + "step": 10472 + }, + { + "epoch": 2.7868547099521024, + "grad_norm": 0.33916088938713074, + "learning_rate": 8.347478885621704e-08, + "loss": 0.1717, + "step": 10473 + }, + { + "epoch": 2.787120808940926, + "grad_norm": 0.2759247124195099, + "learning_rate": 8.345813281834402e-08, + "loss": 0.1826, + "step": 10474 + }, + { + "epoch": 2.7873869079297497, + "grad_norm": 0.4103658199310303, + "learning_rate": 8.344147725227869e-08, + "loss": 0.1841, + "step": 10475 + }, + { + "epoch": 2.787653006918574, + "grad_norm": 0.2690265476703644, + "learning_rate": 8.342482215849606e-08, + "loss": 0.176, + "step": 10476 + }, + { + "epoch": 2.7879191059073976, + "grad_norm": 0.2924601435661316, + "learning_rate": 8.340816753747118e-08, + "loss": 0.1694, + "step": 10477 + }, + { + "epoch": 2.7881852048962212, + "grad_norm": 0.27653637528419495, + "learning_rate": 8.339151338967911e-08, + "loss": 0.1814, + "step": 10478 + }, + { + "epoch": 2.7884513038850454, + "grad_norm": 0.32584619522094727, + "learning_rate": 8.33748597155948e-08, + "loss": 0.1771, + "step": 10479 + }, + { + "epoch": 2.788717402873869, + "grad_norm": 0.27264803647994995, + "learning_rate": 8.33582065156933e-08, + "loss": 0.1735, + "step": 10480 + }, + { + "epoch": 2.7889835018626927, + "grad_norm": 0.2872890830039978, + "learning_rate": 8.334155379044953e-08, + "loss": 0.162, + "step": 10481 + }, + { + "epoch": 2.789249600851517, + "grad_norm": 0.3332408368587494, + "learning_rate": 8.332490154033853e-08, + "loss": 0.1937, + "step": 10482 + }, + { + "epoch": 2.7895156998403405, + "grad_norm": 0.2694719433784485, + "learning_rate": 8.33082497658352e-08, + "loss": 0.1624, + "step": 10483 + }, + { + "epoch": 2.789781798829164, + "grad_norm": 0.36454400420188904, + "learning_rate": 8.329159846741456e-08, + "loss": 0.191, + "step": 10484 + }, + { + "epoch": 2.7900478978179883, + "grad_norm": 0.31775644421577454, + "learning_rate": 8.327494764555143e-08, + "loss": 0.1749, + "step": 10485 + }, + { + "epoch": 2.790313996806812, + "grad_norm": 0.26824334263801575, + "learning_rate": 8.32582973007208e-08, + "loss": 0.1743, + "step": 10486 + }, + { + "epoch": 2.7905800957956357, + "grad_norm": 0.2974388599395752, + "learning_rate": 8.324164743339754e-08, + "loss": 0.2112, + "step": 10487 + }, + { + "epoch": 2.79084619478446, + "grad_norm": 0.37693485617637634, + "learning_rate": 8.322499804405658e-08, + "loss": 0.1971, + "step": 10488 + }, + { + "epoch": 2.7911122937732835, + "grad_norm": 0.3543013334274292, + "learning_rate": 8.320834913317273e-08, + "loss": 0.1728, + "step": 10489 + }, + { + "epoch": 2.7913783927621076, + "grad_norm": 0.2651215195655823, + "learning_rate": 8.31917007012209e-08, + "loss": 0.1709, + "step": 10490 + }, + { + "epoch": 2.7916444917509313, + "grad_norm": 0.34379544854164124, + "learning_rate": 8.31750527486759e-08, + "loss": 0.1874, + "step": 10491 + }, + { + "epoch": 2.7919105907397554, + "grad_norm": 0.3557575047016144, + "learning_rate": 8.315840527601263e-08, + "loss": 0.1698, + "step": 10492 + }, + { + "epoch": 2.792176689728579, + "grad_norm": 0.3743952214717865, + "learning_rate": 8.314175828370584e-08, + "loss": 0.1835, + "step": 10493 + }, + { + "epoch": 2.792442788717403, + "grad_norm": 0.3372432589530945, + "learning_rate": 8.312511177223036e-08, + "loss": 0.1872, + "step": 10494 + }, + { + "epoch": 2.792708887706227, + "grad_norm": 0.2945190966129303, + "learning_rate": 8.310846574206097e-08, + "loss": 0.2005, + "step": 10495 + }, + { + "epoch": 2.7929749866950506, + "grad_norm": 1.0567848682403564, + "learning_rate": 8.309182019367246e-08, + "loss": 0.1698, + "step": 10496 + }, + { + "epoch": 2.7932410856838743, + "grad_norm": 0.3007723093032837, + "learning_rate": 8.307517512753962e-08, + "loss": 0.1716, + "step": 10497 + }, + { + "epoch": 2.7935071846726984, + "grad_norm": 0.344837486743927, + "learning_rate": 8.305853054413716e-08, + "loss": 0.1587, + "step": 10498 + }, + { + "epoch": 2.793773283661522, + "grad_norm": 0.276276171207428, + "learning_rate": 8.304188644393987e-08, + "loss": 0.1674, + "step": 10499 + }, + { + "epoch": 2.794039382650346, + "grad_norm": 0.2678810656070709, + "learning_rate": 8.302524282742241e-08, + "loss": 0.1774, + "step": 10500 + }, + { + "epoch": 2.79430548163917, + "grad_norm": 0.24742119014263153, + "learning_rate": 8.300859969505953e-08, + "loss": 0.1817, + "step": 10501 + }, + { + "epoch": 2.7945715806279936, + "grad_norm": 0.3271794319152832, + "learning_rate": 8.29919570473259e-08, + "loss": 0.1751, + "step": 10502 + }, + { + "epoch": 2.7948376796168173, + "grad_norm": 0.4007332921028137, + "learning_rate": 8.297531488469626e-08, + "loss": 0.1836, + "step": 10503 + }, + { + "epoch": 2.7951037786056414, + "grad_norm": 0.30453622341156006, + "learning_rate": 8.295867320764518e-08, + "loss": 0.186, + "step": 10504 + }, + { + "epoch": 2.795369877594465, + "grad_norm": 0.26934099197387695, + "learning_rate": 8.29420320166474e-08, + "loss": 0.1634, + "step": 10505 + }, + { + "epoch": 2.7956359765832888, + "grad_norm": 0.2804326117038727, + "learning_rate": 8.292539131217751e-08, + "loss": 0.1713, + "step": 10506 + }, + { + "epoch": 2.795902075572113, + "grad_norm": 0.26690420508384705, + "learning_rate": 8.290875109471021e-08, + "loss": 0.17, + "step": 10507 + }, + { + "epoch": 2.7961681745609366, + "grad_norm": 0.32560473680496216, + "learning_rate": 8.289211136471998e-08, + "loss": 0.1728, + "step": 10508 + }, + { + "epoch": 2.7964342735497603, + "grad_norm": 0.30156633257865906, + "learning_rate": 8.287547212268155e-08, + "loss": 0.1807, + "step": 10509 + }, + { + "epoch": 2.7967003725385844, + "grad_norm": 0.26747003197669983, + "learning_rate": 8.285883336906944e-08, + "loss": 0.176, + "step": 10510 + }, + { + "epoch": 2.796966471527408, + "grad_norm": 0.27022814750671387, + "learning_rate": 8.284219510435825e-08, + "loss": 0.1669, + "step": 10511 + }, + { + "epoch": 2.797232570516232, + "grad_norm": 0.32871636748313904, + "learning_rate": 8.28255573290225e-08, + "loss": 0.2073, + "step": 10512 + }, + { + "epoch": 2.797498669505056, + "grad_norm": 0.31724879145622253, + "learning_rate": 8.280892004353678e-08, + "loss": 0.1674, + "step": 10513 + }, + { + "epoch": 2.7977647684938796, + "grad_norm": 0.35643360018730164, + "learning_rate": 8.279228324837558e-08, + "loss": 0.1849, + "step": 10514 + }, + { + "epoch": 2.7980308674827037, + "grad_norm": 0.2692541182041168, + "learning_rate": 8.27756469440134e-08, + "loss": 0.18, + "step": 10515 + }, + { + "epoch": 2.7982969664715274, + "grad_norm": 0.2987518310546875, + "learning_rate": 8.27590111309248e-08, + "loss": 0.1821, + "step": 10516 + }, + { + "epoch": 2.7985630654603515, + "grad_norm": 0.2713576555252075, + "learning_rate": 8.274237580958423e-08, + "loss": 0.1623, + "step": 10517 + }, + { + "epoch": 2.798829164449175, + "grad_norm": 0.2824353277683258, + "learning_rate": 8.272574098046621e-08, + "loss": 0.1647, + "step": 10518 + }, + { + "epoch": 2.799095263437999, + "grad_norm": 0.2946239113807678, + "learning_rate": 8.270910664404509e-08, + "loss": 0.1848, + "step": 10519 + }, + { + "epoch": 2.799361362426823, + "grad_norm": 0.2547696530818939, + "learning_rate": 8.269247280079546e-08, + "loss": 0.1753, + "step": 10520 + }, + { + "epoch": 2.7996274614156467, + "grad_norm": 0.35617417097091675, + "learning_rate": 8.267583945119161e-08, + "loss": 0.1859, + "step": 10521 + }, + { + "epoch": 2.7998935604044703, + "grad_norm": 0.26957836747169495, + "learning_rate": 8.265920659570806e-08, + "loss": 0.1639, + "step": 10522 + }, + { + "epoch": 2.8001596593932945, + "grad_norm": 0.2727157771587372, + "learning_rate": 8.264257423481916e-08, + "loss": 0.1718, + "step": 10523 + }, + { + "epoch": 2.800425758382118, + "grad_norm": 0.45694056153297424, + "learning_rate": 8.262594236899936e-08, + "loss": 0.1889, + "step": 10524 + }, + { + "epoch": 2.800691857370942, + "grad_norm": 0.305189847946167, + "learning_rate": 8.260931099872292e-08, + "loss": 0.1922, + "step": 10525 + }, + { + "epoch": 2.800957956359766, + "grad_norm": 0.27970245480537415, + "learning_rate": 8.259268012446433e-08, + "loss": 0.1769, + "step": 10526 + }, + { + "epoch": 2.8012240553485896, + "grad_norm": 0.4782428741455078, + "learning_rate": 8.257604974669783e-08, + "loss": 0.1838, + "step": 10527 + }, + { + "epoch": 2.8014901543374133, + "grad_norm": 0.2730681002140045, + "learning_rate": 8.255941986589784e-08, + "loss": 0.1669, + "step": 10528 + }, + { + "epoch": 2.8017562533262375, + "grad_norm": 0.40765050053596497, + "learning_rate": 8.254279048253861e-08, + "loss": 0.1879, + "step": 10529 + }, + { + "epoch": 2.802022352315061, + "grad_norm": 0.2370264083147049, + "learning_rate": 8.252616159709447e-08, + "loss": 0.1494, + "step": 10530 + }, + { + "epoch": 2.802288451303885, + "grad_norm": 0.29171937704086304, + "learning_rate": 8.25095332100397e-08, + "loss": 0.1792, + "step": 10531 + }, + { + "epoch": 2.802554550292709, + "grad_norm": 0.24427616596221924, + "learning_rate": 8.249290532184859e-08, + "loss": 0.1617, + "step": 10532 + }, + { + "epoch": 2.8028206492815326, + "grad_norm": 0.35463351011276245, + "learning_rate": 8.247627793299541e-08, + "loss": 0.1759, + "step": 10533 + }, + { + "epoch": 2.8030867482703563, + "grad_norm": 0.27498194575309753, + "learning_rate": 8.245965104395437e-08, + "loss": 0.1815, + "step": 10534 + }, + { + "epoch": 2.8033528472591804, + "grad_norm": 0.44449251890182495, + "learning_rate": 8.244302465519978e-08, + "loss": 0.1801, + "step": 10535 + }, + { + "epoch": 2.803618946248004, + "grad_norm": 0.3134472668170929, + "learning_rate": 8.242639876720576e-08, + "loss": 0.1638, + "step": 10536 + }, + { + "epoch": 2.8038850452368282, + "grad_norm": 0.3128843903541565, + "learning_rate": 8.240977338044657e-08, + "loss": 0.1854, + "step": 10537 + }, + { + "epoch": 2.804151144225652, + "grad_norm": 0.26711606979370117, + "learning_rate": 8.239314849539637e-08, + "loss": 0.1764, + "step": 10538 + }, + { + "epoch": 2.804417243214476, + "grad_norm": 0.299376517534256, + "learning_rate": 8.23765241125294e-08, + "loss": 0.1844, + "step": 10539 + }, + { + "epoch": 2.8046833422032997, + "grad_norm": 0.27163341641426086, + "learning_rate": 8.235990023231972e-08, + "loss": 0.1853, + "step": 10540 + }, + { + "epoch": 2.8049494411921234, + "grad_norm": 0.2560073733329773, + "learning_rate": 8.234327685524158e-08, + "loss": 0.1711, + "step": 10541 + }, + { + "epoch": 2.8052155401809475, + "grad_norm": 0.3277401030063629, + "learning_rate": 8.232665398176901e-08, + "loss": 0.1682, + "step": 10542 + }, + { + "epoch": 2.8054816391697712, + "grad_norm": 0.37835991382598877, + "learning_rate": 8.231003161237624e-08, + "loss": 0.1848, + "step": 10543 + }, + { + "epoch": 2.805747738158595, + "grad_norm": 0.38146838545799255, + "learning_rate": 8.229340974753728e-08, + "loss": 0.198, + "step": 10544 + }, + { + "epoch": 2.806013837147419, + "grad_norm": 0.36065733432769775, + "learning_rate": 8.227678838772628e-08, + "loss": 0.1812, + "step": 10545 + }, + { + "epoch": 2.8062799361362427, + "grad_norm": 0.3281749486923218, + "learning_rate": 8.226016753341726e-08, + "loss": 0.1827, + "step": 10546 + }, + { + "epoch": 2.8065460351250664, + "grad_norm": 0.316364586353302, + "learning_rate": 8.224354718508433e-08, + "loss": 0.1648, + "step": 10547 + }, + { + "epoch": 2.8068121341138905, + "grad_norm": 0.27309077978134155, + "learning_rate": 8.22269273432015e-08, + "loss": 0.1734, + "step": 10548 + }, + { + "epoch": 2.807078233102714, + "grad_norm": 0.24792872369289398, + "learning_rate": 8.221030800824286e-08, + "loss": 0.1568, + "step": 10549 + }, + { + "epoch": 2.807344332091538, + "grad_norm": 0.35528379678726196, + "learning_rate": 8.219368918068234e-08, + "loss": 0.181, + "step": 10550 + }, + { + "epoch": 2.807610431080362, + "grad_norm": 0.2868589162826538, + "learning_rate": 8.2177070860994e-08, + "loss": 0.1712, + "step": 10551 + }, + { + "epoch": 2.8078765300691857, + "grad_norm": 0.2920004427433014, + "learning_rate": 8.216045304965182e-08, + "loss": 0.1863, + "step": 10552 + }, + { + "epoch": 2.8081426290580094, + "grad_norm": 0.28044769167900085, + "learning_rate": 8.214383574712976e-08, + "loss": 0.1608, + "step": 10553 + }, + { + "epoch": 2.8084087280468335, + "grad_norm": 0.25365760922431946, + "learning_rate": 8.212721895390184e-08, + "loss": 0.1669, + "step": 10554 + }, + { + "epoch": 2.808674827035657, + "grad_norm": 0.3244633376598358, + "learning_rate": 8.21106026704419e-08, + "loss": 0.1636, + "step": 10555 + }, + { + "epoch": 2.808940926024481, + "grad_norm": 0.296663373708725, + "learning_rate": 8.209398689722396e-08, + "loss": 0.1721, + "step": 10556 + }, + { + "epoch": 2.809207025013305, + "grad_norm": 0.26848459243774414, + "learning_rate": 8.207737163472186e-08, + "loss": 0.1816, + "step": 10557 + }, + { + "epoch": 2.8094731240021287, + "grad_norm": 0.35068753361701965, + "learning_rate": 8.206075688340958e-08, + "loss": 0.1889, + "step": 10558 + }, + { + "epoch": 2.809739222990953, + "grad_norm": 0.2656135857105255, + "learning_rate": 8.204414264376094e-08, + "loss": 0.1854, + "step": 10559 + }, + { + "epoch": 2.8100053219797765, + "grad_norm": 0.34284302592277527, + "learning_rate": 8.202752891624988e-08, + "loss": 0.1786, + "step": 10560 + }, + { + "epoch": 2.8102714209686, + "grad_norm": 0.372684508562088, + "learning_rate": 8.201091570135016e-08, + "loss": 0.1955, + "step": 10561 + }, + { + "epoch": 2.8105375199574243, + "grad_norm": 0.26142311096191406, + "learning_rate": 8.199430299953572e-08, + "loss": 0.1613, + "step": 10562 + }, + { + "epoch": 2.810803618946248, + "grad_norm": 0.26300325989723206, + "learning_rate": 8.197769081128033e-08, + "loss": 0.1565, + "step": 10563 + }, + { + "epoch": 2.811069717935072, + "grad_norm": 0.2692282199859619, + "learning_rate": 8.196107913705786e-08, + "loss": 0.176, + "step": 10564 + }, + { + "epoch": 2.811335816923896, + "grad_norm": 0.28618109226226807, + "learning_rate": 8.194446797734203e-08, + "loss": 0.1641, + "step": 10565 + }, + { + "epoch": 2.8116019159127195, + "grad_norm": 0.30429142713546753, + "learning_rate": 8.192785733260668e-08, + "loss": 0.1776, + "step": 10566 + }, + { + "epoch": 2.8118680149015436, + "grad_norm": 0.3170310854911804, + "learning_rate": 8.191124720332554e-08, + "loss": 0.1899, + "step": 10567 + }, + { + "epoch": 2.8121341138903673, + "grad_norm": 0.37730497121810913, + "learning_rate": 8.189463758997242e-08, + "loss": 0.1834, + "step": 10568 + }, + { + "epoch": 2.812400212879191, + "grad_norm": 0.28359442949295044, + "learning_rate": 8.187802849302102e-08, + "loss": 0.1818, + "step": 10569 + }, + { + "epoch": 2.812666311868015, + "grad_norm": 0.32867541909217834, + "learning_rate": 8.186141991294502e-08, + "loss": 0.1839, + "step": 10570 + }, + { + "epoch": 2.8129324108568388, + "grad_norm": 0.2800534665584564, + "learning_rate": 8.184481185021826e-08, + "loss": 0.1726, + "step": 10571 + }, + { + "epoch": 2.8131985098456624, + "grad_norm": 0.30309632420539856, + "learning_rate": 8.18282043053143e-08, + "loss": 0.1781, + "step": 10572 + }, + { + "epoch": 2.8134646088344866, + "grad_norm": 0.3299563229084015, + "learning_rate": 8.181159727870693e-08, + "loss": 0.1811, + "step": 10573 + }, + { + "epoch": 2.8137307078233103, + "grad_norm": 0.37422046065330505, + "learning_rate": 8.179499077086972e-08, + "loss": 0.198, + "step": 10574 + }, + { + "epoch": 2.813996806812134, + "grad_norm": 0.3123112916946411, + "learning_rate": 8.177838478227643e-08, + "loss": 0.2047, + "step": 10575 + }, + { + "epoch": 2.814262905800958, + "grad_norm": 0.3350107967853546, + "learning_rate": 8.176177931340057e-08, + "loss": 0.1668, + "step": 10576 + }, + { + "epoch": 2.8145290047897817, + "grad_norm": 0.4065619111061096, + "learning_rate": 8.174517436471586e-08, + "loss": 0.1785, + "step": 10577 + }, + { + "epoch": 2.8147951037786054, + "grad_norm": 0.26537373661994934, + "learning_rate": 8.172856993669585e-08, + "loss": 0.1698, + "step": 10578 + }, + { + "epoch": 2.8150612027674295, + "grad_norm": 0.25398769974708557, + "learning_rate": 8.171196602981419e-08, + "loss": 0.1611, + "step": 10579 + }, + { + "epoch": 2.8153273017562532, + "grad_norm": 0.3988801836967468, + "learning_rate": 8.16953626445444e-08, + "loss": 0.1754, + "step": 10580 + }, + { + "epoch": 2.815593400745077, + "grad_norm": 0.32178109884262085, + "learning_rate": 8.167875978136006e-08, + "loss": 0.1758, + "step": 10581 + }, + { + "epoch": 2.815859499733901, + "grad_norm": 0.8432336449623108, + "learning_rate": 8.166215744073472e-08, + "loss": 0.1895, + "step": 10582 + }, + { + "epoch": 2.8161255987227247, + "grad_norm": 0.279258668422699, + "learning_rate": 8.164555562314195e-08, + "loss": 0.1852, + "step": 10583 + }, + { + "epoch": 2.816391697711549, + "grad_norm": 0.37325015664100647, + "learning_rate": 8.162895432905516e-08, + "loss": 0.1797, + "step": 10584 + }, + { + "epoch": 2.8166577967003725, + "grad_norm": 0.2662089467048645, + "learning_rate": 8.1612353558948e-08, + "loss": 0.1806, + "step": 10585 + }, + { + "epoch": 2.8169238956891967, + "grad_norm": 0.27609124779701233, + "learning_rate": 8.159575331329383e-08, + "loss": 0.1628, + "step": 10586 + }, + { + "epoch": 2.8171899946780203, + "grad_norm": 0.3655288517475128, + "learning_rate": 8.157915359256618e-08, + "loss": 0.2018, + "step": 10587 + }, + { + "epoch": 2.817456093666844, + "grad_norm": 0.3428556025028229, + "learning_rate": 8.156255439723853e-08, + "loss": 0.1997, + "step": 10588 + }, + { + "epoch": 2.817722192655668, + "grad_norm": 0.3874095380306244, + "learning_rate": 8.154595572778425e-08, + "loss": 0.1889, + "step": 10589 + }, + { + "epoch": 2.817988291644492, + "grad_norm": 0.34003910422325134, + "learning_rate": 8.152935758467688e-08, + "loss": 0.1639, + "step": 10590 + }, + { + "epoch": 2.8182543906333155, + "grad_norm": 0.28527307510375977, + "learning_rate": 8.151275996838971e-08, + "loss": 0.1966, + "step": 10591 + }, + { + "epoch": 2.8185204896221396, + "grad_norm": 0.4845508337020874, + "learning_rate": 8.149616287939622e-08, + "loss": 0.1945, + "step": 10592 + }, + { + "epoch": 2.8187865886109633, + "grad_norm": 0.29114124178886414, + "learning_rate": 8.147956631816974e-08, + "loss": 0.1956, + "step": 10593 + }, + { + "epoch": 2.819052687599787, + "grad_norm": 0.28818270564079285, + "learning_rate": 8.146297028518371e-08, + "loss": 0.1753, + "step": 10594 + }, + { + "epoch": 2.819318786588611, + "grad_norm": 0.333130806684494, + "learning_rate": 8.144637478091139e-08, + "loss": 0.1921, + "step": 10595 + }, + { + "epoch": 2.819584885577435, + "grad_norm": 0.40521952509880066, + "learning_rate": 8.142977980582621e-08, + "loss": 0.1927, + "step": 10596 + }, + { + "epoch": 2.8198509845662585, + "grad_norm": 0.2721473276615143, + "learning_rate": 8.14131853604014e-08, + "loss": 0.1769, + "step": 10597 + }, + { + "epoch": 2.8201170835550826, + "grad_norm": 0.2988218665122986, + "learning_rate": 8.139659144511036e-08, + "loss": 0.1788, + "step": 10598 + }, + { + "epoch": 2.8203831825439063, + "grad_norm": 0.2544262409210205, + "learning_rate": 8.137999806042628e-08, + "loss": 0.1749, + "step": 10599 + }, + { + "epoch": 2.82064928153273, + "grad_norm": 0.27414894104003906, + "learning_rate": 8.136340520682256e-08, + "loss": 0.1676, + "step": 10600 + }, + { + "epoch": 2.820915380521554, + "grad_norm": 0.3396824300289154, + "learning_rate": 8.134681288477234e-08, + "loss": 0.2052, + "step": 10601 + }, + { + "epoch": 2.821181479510378, + "grad_norm": 0.2818636894226074, + "learning_rate": 8.133022109474894e-08, + "loss": 0.1858, + "step": 10602 + }, + { + "epoch": 2.8214475784992015, + "grad_norm": 0.4165731370449066, + "learning_rate": 8.131362983722556e-08, + "loss": 0.1812, + "step": 10603 + }, + { + "epoch": 2.8217136774880256, + "grad_norm": 0.2840396463871002, + "learning_rate": 8.129703911267547e-08, + "loss": 0.2043, + "step": 10604 + }, + { + "epoch": 2.8219797764768493, + "grad_norm": 0.3034663200378418, + "learning_rate": 8.12804489215718e-08, + "loss": 0.1856, + "step": 10605 + }, + { + "epoch": 2.822245875465673, + "grad_norm": 0.25617367029190063, + "learning_rate": 8.126385926438775e-08, + "loss": 0.1606, + "step": 10606 + }, + { + "epoch": 2.822511974454497, + "grad_norm": 0.27608075737953186, + "learning_rate": 8.124727014159654e-08, + "loss": 0.1712, + "step": 10607 + }, + { + "epoch": 2.8227780734433208, + "grad_norm": 0.3333911895751953, + "learning_rate": 8.123068155367125e-08, + "loss": 0.18, + "step": 10608 + }, + { + "epoch": 2.823044172432145, + "grad_norm": 0.2868543267250061, + "learning_rate": 8.12140935010851e-08, + "loss": 0.1859, + "step": 10609 + }, + { + "epoch": 2.8233102714209686, + "grad_norm": 0.2946104109287262, + "learning_rate": 8.119750598431117e-08, + "loss": 0.1976, + "step": 10610 + }, + { + "epoch": 2.8235763704097927, + "grad_norm": 0.43290281295776367, + "learning_rate": 8.11809190038226e-08, + "loss": 0.1834, + "step": 10611 + }, + { + "epoch": 2.8238424693986164, + "grad_norm": 0.291607141494751, + "learning_rate": 8.116433256009243e-08, + "loss": 0.1798, + "step": 10612 + }, + { + "epoch": 2.82410856838744, + "grad_norm": 0.2857663035392761, + "learning_rate": 8.114774665359378e-08, + "loss": 0.1873, + "step": 10613 + }, + { + "epoch": 2.824374667376264, + "grad_norm": 0.26957616209983826, + "learning_rate": 8.11311612847997e-08, + "loss": 0.1642, + "step": 10614 + }, + { + "epoch": 2.824640766365088, + "grad_norm": 0.46370720863342285, + "learning_rate": 8.111457645418328e-08, + "loss": 0.1971, + "step": 10615 + }, + { + "epoch": 2.8249068653539116, + "grad_norm": 0.29146692156791687, + "learning_rate": 8.109799216221748e-08, + "loss": 0.1814, + "step": 10616 + }, + { + "epoch": 2.8251729643427357, + "grad_norm": 0.39024871587753296, + "learning_rate": 8.108140840937536e-08, + "loss": 0.1773, + "step": 10617 + }, + { + "epoch": 2.8254390633315594, + "grad_norm": 0.2593112885951996, + "learning_rate": 8.106482519612992e-08, + "loss": 0.1697, + "step": 10618 + }, + { + "epoch": 2.825705162320383, + "grad_norm": 0.3026682138442993, + "learning_rate": 8.104824252295416e-08, + "loss": 0.2083, + "step": 10619 + }, + { + "epoch": 2.825971261309207, + "grad_norm": 0.25452128052711487, + "learning_rate": 8.1031660390321e-08, + "loss": 0.1599, + "step": 10620 + }, + { + "epoch": 2.826237360298031, + "grad_norm": 0.26124271750450134, + "learning_rate": 8.101507879870346e-08, + "loss": 0.1825, + "step": 10621 + }, + { + "epoch": 2.8265034592868545, + "grad_norm": 0.27776581048965454, + "learning_rate": 8.099849774857442e-08, + "loss": 0.1808, + "step": 10622 + }, + { + "epoch": 2.8267695582756787, + "grad_norm": 0.27809467911720276, + "learning_rate": 8.098191724040684e-08, + "loss": 0.1739, + "step": 10623 + }, + { + "epoch": 2.8270356572645023, + "grad_norm": 0.27420246601104736, + "learning_rate": 8.096533727467366e-08, + "loss": 0.1727, + "step": 10624 + }, + { + "epoch": 2.827301756253326, + "grad_norm": 0.7609682083129883, + "learning_rate": 8.094875785184772e-08, + "loss": 0.1652, + "step": 10625 + }, + { + "epoch": 2.82756785524215, + "grad_norm": 0.2637545168399811, + "learning_rate": 8.093217897240195e-08, + "loss": 0.1695, + "step": 10626 + }, + { + "epoch": 2.827833954230974, + "grad_norm": 0.2563680410385132, + "learning_rate": 8.091560063680914e-08, + "loss": 0.1637, + "step": 10627 + }, + { + "epoch": 2.8281000532197975, + "grad_norm": 0.2667503356933594, + "learning_rate": 8.089902284554221e-08, + "loss": 0.1791, + "step": 10628 + }, + { + "epoch": 2.8283661522086216, + "grad_norm": 0.30598172545433044, + "learning_rate": 8.088244559907393e-08, + "loss": 0.1833, + "step": 10629 + }, + { + "epoch": 2.8286322511974453, + "grad_norm": 0.2715127170085907, + "learning_rate": 8.086586889787721e-08, + "loss": 0.1641, + "step": 10630 + }, + { + "epoch": 2.8288983501862695, + "grad_norm": 0.26324769854545593, + "learning_rate": 8.084929274242473e-08, + "loss": 0.1781, + "step": 10631 + }, + { + "epoch": 2.829164449175093, + "grad_norm": 0.4808658957481384, + "learning_rate": 8.083271713318938e-08, + "loss": 0.199, + "step": 10632 + }, + { + "epoch": 2.829430548163917, + "grad_norm": 0.26110678911209106, + "learning_rate": 8.081614207064386e-08, + "loss": 0.1836, + "step": 10633 + }, + { + "epoch": 2.829696647152741, + "grad_norm": 0.29424959421157837, + "learning_rate": 8.079956755526099e-08, + "loss": 0.176, + "step": 10634 + }, + { + "epoch": 2.8299627461415646, + "grad_norm": 0.2762165665626526, + "learning_rate": 8.07829935875134e-08, + "loss": 0.1615, + "step": 10635 + }, + { + "epoch": 2.8302288451303887, + "grad_norm": 0.3753720819950104, + "learning_rate": 8.076642016787395e-08, + "loss": 0.1743, + "step": 10636 + }, + { + "epoch": 2.8304949441192124, + "grad_norm": 0.28233861923217773, + "learning_rate": 8.074984729681524e-08, + "loss": 0.1706, + "step": 10637 + }, + { + "epoch": 2.830761043108036, + "grad_norm": 0.4562457799911499, + "learning_rate": 8.073327497481002e-08, + "loss": 0.19, + "step": 10638 + }, + { + "epoch": 2.8310271420968602, + "grad_norm": 0.3289196491241455, + "learning_rate": 8.071670320233095e-08, + "loss": 0.1869, + "step": 10639 + }, + { + "epoch": 2.831293241085684, + "grad_norm": 0.2819974720478058, + "learning_rate": 8.070013197985072e-08, + "loss": 0.181, + "step": 10640 + }, + { + "epoch": 2.8315593400745076, + "grad_norm": 0.3678383529186249, + "learning_rate": 8.06835613078419e-08, + "loss": 0.1953, + "step": 10641 + }, + { + "epoch": 2.8318254390633317, + "grad_norm": 0.2580477297306061, + "learning_rate": 8.066699118677715e-08, + "loss": 0.1721, + "step": 10642 + }, + { + "epoch": 2.8320915380521554, + "grad_norm": 0.2818179428577423, + "learning_rate": 8.065042161712914e-08, + "loss": 0.2006, + "step": 10643 + }, + { + "epoch": 2.832357637040979, + "grad_norm": 0.3856876790523529, + "learning_rate": 8.06338525993704e-08, + "loss": 0.186, + "step": 10644 + }, + { + "epoch": 2.832623736029803, + "grad_norm": 0.2569543123245239, + "learning_rate": 8.061728413397358e-08, + "loss": 0.1537, + "step": 10645 + }, + { + "epoch": 2.832889835018627, + "grad_norm": 0.3349413275718689, + "learning_rate": 8.060071622141114e-08, + "loss": 0.1823, + "step": 10646 + }, + { + "epoch": 2.8331559340074506, + "grad_norm": 0.2636508047580719, + "learning_rate": 8.058414886215576e-08, + "loss": 0.19, + "step": 10647 + }, + { + "epoch": 2.8334220329962747, + "grad_norm": 0.4350545108318329, + "learning_rate": 8.056758205667986e-08, + "loss": 0.2031, + "step": 10648 + }, + { + "epoch": 2.8336881319850984, + "grad_norm": 0.25885874032974243, + "learning_rate": 8.055101580545604e-08, + "loss": 0.1677, + "step": 10649 + }, + { + "epoch": 2.833954230973922, + "grad_norm": 0.2588551342487335, + "learning_rate": 8.053445010895674e-08, + "loss": 0.1677, + "step": 10650 + }, + { + "epoch": 2.834220329962746, + "grad_norm": 0.4870411157608032, + "learning_rate": 8.051788496765452e-08, + "loss": 0.2061, + "step": 10651 + }, + { + "epoch": 2.83448642895157, + "grad_norm": 0.28800898790359497, + "learning_rate": 8.050132038202178e-08, + "loss": 0.1902, + "step": 10652 + }, + { + "epoch": 2.8347525279403936, + "grad_norm": 0.2878630459308624, + "learning_rate": 8.048475635253102e-08, + "loss": 0.1781, + "step": 10653 + }, + { + "epoch": 2.8350186269292177, + "grad_norm": 0.3397081196308136, + "learning_rate": 8.046819287965465e-08, + "loss": 0.1911, + "step": 10654 + }, + { + "epoch": 2.8352847259180414, + "grad_norm": 0.2503981590270996, + "learning_rate": 8.045162996386514e-08, + "loss": 0.1628, + "step": 10655 + }, + { + "epoch": 2.8355508249068655, + "grad_norm": 0.3398151397705078, + "learning_rate": 8.043506760563483e-08, + "loss": 0.1816, + "step": 10656 + }, + { + "epoch": 2.835816923895689, + "grad_norm": 0.27111274003982544, + "learning_rate": 8.041850580543618e-08, + "loss": 0.1721, + "step": 10657 + }, + { + "epoch": 2.8360830228845133, + "grad_norm": 0.3760269582271576, + "learning_rate": 8.04019445637415e-08, + "loss": 0.1869, + "step": 10658 + }, + { + "epoch": 2.836349121873337, + "grad_norm": 0.31168749928474426, + "learning_rate": 8.038538388102324e-08, + "loss": 0.1919, + "step": 10659 + }, + { + "epoch": 2.8366152208621607, + "grad_norm": 0.34386754035949707, + "learning_rate": 8.036882375775363e-08, + "loss": 0.1756, + "step": 10660 + }, + { + "epoch": 2.836881319850985, + "grad_norm": 0.35024890303611755, + "learning_rate": 8.035226419440507e-08, + "loss": 0.1752, + "step": 10661 + }, + { + "epoch": 2.8371474188398085, + "grad_norm": 0.3219085931777954, + "learning_rate": 8.03357051914499e-08, + "loss": 0.1879, + "step": 10662 + }, + { + "epoch": 2.837413517828632, + "grad_norm": 0.2657706141471863, + "learning_rate": 8.031914674936034e-08, + "loss": 0.1918, + "step": 10663 + }, + { + "epoch": 2.8376796168174563, + "grad_norm": 0.27296140789985657, + "learning_rate": 8.030258886860873e-08, + "loss": 0.172, + "step": 10664 + }, + { + "epoch": 2.83794571580628, + "grad_norm": 0.2552797496318817, + "learning_rate": 8.02860315496673e-08, + "loss": 0.1571, + "step": 10665 + }, + { + "epoch": 2.8382118147951036, + "grad_norm": 0.2785777151584625, + "learning_rate": 8.026947479300834e-08, + "loss": 0.1908, + "step": 10666 + }, + { + "epoch": 2.8384779137839278, + "grad_norm": 0.36216798424720764, + "learning_rate": 8.025291859910403e-08, + "loss": 0.1766, + "step": 10667 + }, + { + "epoch": 2.8387440127727515, + "grad_norm": 0.326865017414093, + "learning_rate": 8.023636296842662e-08, + "loss": 0.185, + "step": 10668 + }, + { + "epoch": 2.839010111761575, + "grad_norm": 0.30035680532455444, + "learning_rate": 8.021980790144826e-08, + "loss": 0.2024, + "step": 10669 + }, + { + "epoch": 2.8392762107503993, + "grad_norm": 0.28223976492881775, + "learning_rate": 8.020325339864124e-08, + "loss": 0.1869, + "step": 10670 + }, + { + "epoch": 2.839542309739223, + "grad_norm": 0.2711271345615387, + "learning_rate": 8.018669946047761e-08, + "loss": 0.1756, + "step": 10671 + }, + { + "epoch": 2.8398084087280466, + "grad_norm": 0.27274465560913086, + "learning_rate": 8.01701460874296e-08, + "loss": 0.1761, + "step": 10672 + }, + { + "epoch": 2.8400745077168708, + "grad_norm": 0.26775220036506653, + "learning_rate": 8.01535932799693e-08, + "loss": 0.1644, + "step": 10673 + }, + { + "epoch": 2.8403406067056944, + "grad_norm": 0.26409098505973816, + "learning_rate": 8.013704103856887e-08, + "loss": 0.1645, + "step": 10674 + }, + { + "epoch": 2.840606705694518, + "grad_norm": 0.25016161799430847, + "learning_rate": 8.012048936370036e-08, + "loss": 0.1678, + "step": 10675 + }, + { + "epoch": 2.8408728046833422, + "grad_norm": 0.31171026825904846, + "learning_rate": 8.010393825583594e-08, + "loss": 0.1999, + "step": 10676 + }, + { + "epoch": 2.841138903672166, + "grad_norm": 0.3048734962940216, + "learning_rate": 8.008738771544759e-08, + "loss": 0.18, + "step": 10677 + }, + { + "epoch": 2.84140500266099, + "grad_norm": 0.31833121180534363, + "learning_rate": 8.007083774300741e-08, + "loss": 0.1821, + "step": 10678 + }, + { + "epoch": 2.8416711016498137, + "grad_norm": 0.3507077395915985, + "learning_rate": 8.005428833898745e-08, + "loss": 0.1733, + "step": 10679 + }, + { + "epoch": 2.8419372006386374, + "grad_norm": 0.2698923945426941, + "learning_rate": 8.00377395038597e-08, + "loss": 0.1859, + "step": 10680 + }, + { + "epoch": 2.8422032996274615, + "grad_norm": 0.2863818407058716, + "learning_rate": 8.002119123809623e-08, + "loss": 0.1806, + "step": 10681 + }, + { + "epoch": 2.8424693986162852, + "grad_norm": 0.3011619448661804, + "learning_rate": 8.000464354216895e-08, + "loss": 0.1707, + "step": 10682 + }, + { + "epoch": 2.8427354976051094, + "grad_norm": 0.28310438990592957, + "learning_rate": 7.998809641654987e-08, + "loss": 0.1855, + "step": 10683 + }, + { + "epoch": 2.843001596593933, + "grad_norm": 0.28002363443374634, + "learning_rate": 7.997154986171094e-08, + "loss": 0.1731, + "step": 10684 + }, + { + "epoch": 2.8432676955827567, + "grad_norm": 0.32278886437416077, + "learning_rate": 7.995500387812413e-08, + "loss": 0.1804, + "step": 10685 + }, + { + "epoch": 2.843533794571581, + "grad_norm": 0.286295622587204, + "learning_rate": 7.993845846626131e-08, + "loss": 0.1783, + "step": 10686 + }, + { + "epoch": 2.8437998935604045, + "grad_norm": 0.28836721181869507, + "learning_rate": 7.992191362659447e-08, + "loss": 0.1619, + "step": 10687 + }, + { + "epoch": 2.844065992549228, + "grad_norm": 0.2669072449207306, + "learning_rate": 7.990536935959541e-08, + "loss": 0.1578, + "step": 10688 + }, + { + "epoch": 2.8443320915380523, + "grad_norm": 0.3203587532043457, + "learning_rate": 7.988882566573608e-08, + "loss": 0.1775, + "step": 10689 + }, + { + "epoch": 2.844598190526876, + "grad_norm": 0.29406026005744934, + "learning_rate": 7.987228254548828e-08, + "loss": 0.1886, + "step": 10690 + }, + { + "epoch": 2.8448642895156997, + "grad_norm": 0.2851187288761139, + "learning_rate": 7.985573999932393e-08, + "loss": 0.1699, + "step": 10691 + }, + { + "epoch": 2.845130388504524, + "grad_norm": 0.3005323112010956, + "learning_rate": 7.983919802771475e-08, + "loss": 0.1967, + "step": 10692 + }, + { + "epoch": 2.8453964874933475, + "grad_norm": 0.24940745532512665, + "learning_rate": 7.982265663113266e-08, + "loss": 0.1688, + "step": 10693 + }, + { + "epoch": 2.845662586482171, + "grad_norm": 0.3496321737766266, + "learning_rate": 7.980611581004937e-08, + "loss": 0.1791, + "step": 10694 + }, + { + "epoch": 2.8459286854709953, + "grad_norm": 0.3707628548145294, + "learning_rate": 7.978957556493675e-08, + "loss": 0.1747, + "step": 10695 + }, + { + "epoch": 2.846194784459819, + "grad_norm": 0.26607000827789307, + "learning_rate": 7.977303589626642e-08, + "loss": 0.1814, + "step": 10696 + }, + { + "epoch": 2.8464608834486427, + "grad_norm": 0.30571603775024414, + "learning_rate": 7.975649680451023e-08, + "loss": 0.1674, + "step": 10697 + }, + { + "epoch": 2.846726982437467, + "grad_norm": 0.2969324290752411, + "learning_rate": 7.973995829013994e-08, + "loss": 0.1656, + "step": 10698 + }, + { + "epoch": 2.8469930814262905, + "grad_norm": 0.2631176710128784, + "learning_rate": 7.972342035362717e-08, + "loss": 0.1792, + "step": 10699 + }, + { + "epoch": 2.847259180415114, + "grad_norm": 0.3326854109764099, + "learning_rate": 7.970688299544366e-08, + "loss": 0.1866, + "step": 10700 + }, + { + "epoch": 2.8475252794039383, + "grad_norm": 0.31584641337394714, + "learning_rate": 7.969034621606108e-08, + "loss": 0.1774, + "step": 10701 + }, + { + "epoch": 2.847791378392762, + "grad_norm": 0.32813504338264465, + "learning_rate": 7.967381001595114e-08, + "loss": 0.1856, + "step": 10702 + }, + { + "epoch": 2.848057477381586, + "grad_norm": 0.35172516107559204, + "learning_rate": 7.965727439558539e-08, + "loss": 0.1942, + "step": 10703 + }, + { + "epoch": 2.84832357637041, + "grad_norm": 0.28318628668785095, + "learning_rate": 7.964073935543555e-08, + "loss": 0.1737, + "step": 10704 + }, + { + "epoch": 2.848589675359234, + "grad_norm": 0.3279268741607666, + "learning_rate": 7.962420489597316e-08, + "loss": 0.1673, + "step": 10705 + }, + { + "epoch": 2.8488557743480576, + "grad_norm": 0.2767091393470764, + "learning_rate": 7.960767101766989e-08, + "loss": 0.2041, + "step": 10706 + }, + { + "epoch": 2.8491218733368813, + "grad_norm": 0.34573647379875183, + "learning_rate": 7.959113772099726e-08, + "loss": 0.187, + "step": 10707 + }, + { + "epoch": 2.8493879723257054, + "grad_norm": 0.3676900565624237, + "learning_rate": 7.957460500642687e-08, + "loss": 0.1811, + "step": 10708 + }, + { + "epoch": 2.849654071314529, + "grad_norm": 0.25531235337257385, + "learning_rate": 7.955807287443021e-08, + "loss": 0.1793, + "step": 10709 + }, + { + "epoch": 2.8499201703033528, + "grad_norm": 0.29974353313446045, + "learning_rate": 7.954154132547892e-08, + "loss": 0.1761, + "step": 10710 + }, + { + "epoch": 2.850186269292177, + "grad_norm": 0.30755072832107544, + "learning_rate": 7.952501036004437e-08, + "loss": 0.1593, + "step": 10711 + }, + { + "epoch": 2.8504523682810006, + "grad_norm": 0.34835106134414673, + "learning_rate": 7.95084799785982e-08, + "loss": 0.1807, + "step": 10712 + }, + { + "epoch": 2.8507184672698243, + "grad_norm": 0.3883298635482788, + "learning_rate": 7.949195018161176e-08, + "loss": 0.189, + "step": 10713 + }, + { + "epoch": 2.8509845662586484, + "grad_norm": 0.35225579142570496, + "learning_rate": 7.947542096955661e-08, + "loss": 0.2023, + "step": 10714 + }, + { + "epoch": 2.851250665247472, + "grad_norm": 0.2667955458164215, + "learning_rate": 7.945889234290415e-08, + "loss": 0.1765, + "step": 10715 + }, + { + "epoch": 2.8515167642362957, + "grad_norm": 0.30048373341560364, + "learning_rate": 7.944236430212581e-08, + "loss": 0.1654, + "step": 10716 + }, + { + "epoch": 2.85178286322512, + "grad_norm": 0.29515692591667175, + "learning_rate": 7.942583684769306e-08, + "loss": 0.1822, + "step": 10717 + }, + { + "epoch": 2.8520489622139436, + "grad_norm": 0.2852095067501068, + "learning_rate": 7.940930998007722e-08, + "loss": 0.1879, + "step": 10718 + }, + { + "epoch": 2.8523150612027672, + "grad_norm": 0.2676110565662384, + "learning_rate": 7.939278369974972e-08, + "loss": 0.1657, + "step": 10719 + }, + { + "epoch": 2.8525811601915914, + "grad_norm": 0.41387197375297546, + "learning_rate": 7.937625800718187e-08, + "loss": 0.1934, + "step": 10720 + }, + { + "epoch": 2.852847259180415, + "grad_norm": 0.3270620107650757, + "learning_rate": 7.93597329028451e-08, + "loss": 0.1799, + "step": 10721 + }, + { + "epoch": 2.8531133581692387, + "grad_norm": 0.2878035008907318, + "learning_rate": 7.934320838721063e-08, + "loss": 0.1811, + "step": 10722 + }, + { + "epoch": 2.853379457158063, + "grad_norm": 0.33621928095817566, + "learning_rate": 7.93266844607499e-08, + "loss": 0.2085, + "step": 10723 + }, + { + "epoch": 2.8536455561468865, + "grad_norm": 0.2849123179912567, + "learning_rate": 7.931016112393408e-08, + "loss": 0.1804, + "step": 10724 + }, + { + "epoch": 2.85391165513571, + "grad_norm": 0.2677036225795746, + "learning_rate": 7.929363837723455e-08, + "loss": 0.1793, + "step": 10725 + }, + { + "epoch": 2.8541777541245343, + "grad_norm": 0.2738090455532074, + "learning_rate": 7.92771162211225e-08, + "loss": 0.1732, + "step": 10726 + }, + { + "epoch": 2.854443853113358, + "grad_norm": 0.27254149317741394, + "learning_rate": 7.926059465606924e-08, + "loss": 0.1687, + "step": 10727 + }, + { + "epoch": 2.854709952102182, + "grad_norm": 0.30782350897789, + "learning_rate": 7.924407368254593e-08, + "loss": 0.1759, + "step": 10728 + }, + { + "epoch": 2.854976051091006, + "grad_norm": 0.32774609327316284, + "learning_rate": 7.922755330102384e-08, + "loss": 0.1845, + "step": 10729 + }, + { + "epoch": 2.85524215007983, + "grad_norm": 0.33499324321746826, + "learning_rate": 7.921103351197412e-08, + "loss": 0.1914, + "step": 10730 + }, + { + "epoch": 2.8555082490686536, + "grad_norm": 0.48181912302970886, + "learning_rate": 7.919451431586802e-08, + "loss": 0.1821, + "step": 10731 + }, + { + "epoch": 2.8557743480574773, + "grad_norm": 0.3720126748085022, + "learning_rate": 7.91779957131766e-08, + "loss": 0.2108, + "step": 10732 + }, + { + "epoch": 2.8560404470463014, + "grad_norm": 0.2968122959136963, + "learning_rate": 7.916147770437105e-08, + "loss": 0.1729, + "step": 10733 + }, + { + "epoch": 2.856306546035125, + "grad_norm": 0.35554274916648865, + "learning_rate": 7.914496028992254e-08, + "loss": 0.1818, + "step": 10734 + }, + { + "epoch": 2.856572645023949, + "grad_norm": 0.3019019663333893, + "learning_rate": 7.91284434703021e-08, + "loss": 0.1618, + "step": 10735 + }, + { + "epoch": 2.856838744012773, + "grad_norm": 0.39675402641296387, + "learning_rate": 7.91119272459809e-08, + "loss": 0.1808, + "step": 10736 + }, + { + "epoch": 2.8571048430015966, + "grad_norm": 0.2515259385108948, + "learning_rate": 7.909541161742995e-08, + "loss": 0.1638, + "step": 10737 + }, + { + "epoch": 2.8573709419904203, + "grad_norm": 0.25277599692344666, + "learning_rate": 7.907889658512039e-08, + "loss": 0.1658, + "step": 10738 + }, + { + "epoch": 2.8576370409792444, + "grad_norm": 0.3788427412509918, + "learning_rate": 7.906238214952317e-08, + "loss": 0.1744, + "step": 10739 + }, + { + "epoch": 2.857903139968068, + "grad_norm": 0.2780704200267792, + "learning_rate": 7.904586831110938e-08, + "loss": 0.1791, + "step": 10740 + }, + { + "epoch": 2.858169238956892, + "grad_norm": 0.4064999520778656, + "learning_rate": 7.902935507034998e-08, + "loss": 0.1658, + "step": 10741 + }, + { + "epoch": 2.858435337945716, + "grad_norm": 0.4016541838645935, + "learning_rate": 7.901284242771604e-08, + "loss": 0.1764, + "step": 10742 + }, + { + "epoch": 2.8587014369345396, + "grad_norm": 0.2776281535625458, + "learning_rate": 7.899633038367843e-08, + "loss": 0.1696, + "step": 10743 + }, + { + "epoch": 2.8589675359233633, + "grad_norm": 0.4420602023601532, + "learning_rate": 7.897981893870818e-08, + "loss": 0.1926, + "step": 10744 + }, + { + "epoch": 2.8592336349121874, + "grad_norm": 0.28763577342033386, + "learning_rate": 7.89633080932762e-08, + "loss": 0.1855, + "step": 10745 + }, + { + "epoch": 2.859499733901011, + "grad_norm": 0.30307096242904663, + "learning_rate": 7.894679784785345e-08, + "loss": 0.1815, + "step": 10746 + }, + { + "epoch": 2.8597658328898348, + "grad_norm": 0.29368191957473755, + "learning_rate": 7.893028820291077e-08, + "loss": 0.1689, + "step": 10747 + }, + { + "epoch": 2.860031931878659, + "grad_norm": 0.2661118805408478, + "learning_rate": 7.891377915891911e-08, + "loss": 0.1667, + "step": 10748 + }, + { + "epoch": 2.8602980308674826, + "grad_norm": 0.32203245162963867, + "learning_rate": 7.88972707163493e-08, + "loss": 0.1769, + "step": 10749 + }, + { + "epoch": 2.8605641298563067, + "grad_norm": 0.3300568759441376, + "learning_rate": 7.888076287567223e-08, + "loss": 0.1755, + "step": 10750 + }, + { + "epoch": 2.8608302288451304, + "grad_norm": 0.28651878237724304, + "learning_rate": 7.886425563735869e-08, + "loss": 0.1917, + "step": 10751 + }, + { + "epoch": 2.861096327833954, + "grad_norm": 0.3771492540836334, + "learning_rate": 7.884774900187953e-08, + "loss": 0.1914, + "step": 10752 + }, + { + "epoch": 2.861362426822778, + "grad_norm": 0.3354969918727875, + "learning_rate": 7.88312429697056e-08, + "loss": 0.1895, + "step": 10753 + }, + { + "epoch": 2.861628525811602, + "grad_norm": 0.3412073850631714, + "learning_rate": 7.881473754130761e-08, + "loss": 0.185, + "step": 10754 + }, + { + "epoch": 2.861894624800426, + "grad_norm": 0.3107288181781769, + "learning_rate": 7.879823271715635e-08, + "loss": 0.1773, + "step": 10755 + }, + { + "epoch": 2.8621607237892497, + "grad_norm": 0.25930002331733704, + "learning_rate": 7.878172849772257e-08, + "loss": 0.1555, + "step": 10756 + }, + { + "epoch": 2.8624268227780734, + "grad_norm": 0.3192456066608429, + "learning_rate": 7.876522488347705e-08, + "loss": 0.161, + "step": 10757 + }, + { + "epoch": 2.8626929217668975, + "grad_norm": 0.39214423298835754, + "learning_rate": 7.874872187489044e-08, + "loss": 0.1839, + "step": 10758 + }, + { + "epoch": 2.862959020755721, + "grad_norm": 0.3078918159008026, + "learning_rate": 7.873221947243347e-08, + "loss": 0.174, + "step": 10759 + }, + { + "epoch": 2.863225119744545, + "grad_norm": 0.44434428215026855, + "learning_rate": 7.871571767657679e-08, + "loss": 0.1731, + "step": 10760 + }, + { + "epoch": 2.863491218733369, + "grad_norm": 0.4656757712364197, + "learning_rate": 7.869921648779116e-08, + "loss": 0.1871, + "step": 10761 + }, + { + "epoch": 2.8637573177221927, + "grad_norm": 0.32854709029197693, + "learning_rate": 7.868271590654707e-08, + "loss": 0.1925, + "step": 10762 + }, + { + "epoch": 2.8640234167110163, + "grad_norm": 0.2671881914138794, + "learning_rate": 7.866621593331533e-08, + "loss": 0.1634, + "step": 10763 + }, + { + "epoch": 2.8642895156998405, + "grad_norm": 0.30433788895606995, + "learning_rate": 7.86497165685664e-08, + "loss": 0.1838, + "step": 10764 + }, + { + "epoch": 2.864555614688664, + "grad_norm": 0.26679477095603943, + "learning_rate": 7.863321781277097e-08, + "loss": 0.1798, + "step": 10765 + }, + { + "epoch": 2.864821713677488, + "grad_norm": 0.26083093881607056, + "learning_rate": 7.861671966639958e-08, + "loss": 0.1713, + "step": 10766 + }, + { + "epoch": 2.865087812666312, + "grad_norm": 0.3513680100440979, + "learning_rate": 7.860022212992282e-08, + "loss": 0.1771, + "step": 10767 + }, + { + "epoch": 2.8653539116551356, + "grad_norm": 0.2974477708339691, + "learning_rate": 7.858372520381118e-08, + "loss": 0.1814, + "step": 10768 + }, + { + "epoch": 2.8656200106439593, + "grad_norm": 0.29427585005760193, + "learning_rate": 7.856722888853524e-08, + "loss": 0.1909, + "step": 10769 + }, + { + "epoch": 2.8658861096327835, + "grad_norm": 0.27203282713890076, + "learning_rate": 7.855073318456546e-08, + "loss": 0.1752, + "step": 10770 + }, + { + "epoch": 2.866152208621607, + "grad_norm": 0.34831175208091736, + "learning_rate": 7.853423809237235e-08, + "loss": 0.1796, + "step": 10771 + }, + { + "epoch": 2.866418307610431, + "grad_norm": 0.41880714893341064, + "learning_rate": 7.851774361242644e-08, + "loss": 0.1975, + "step": 10772 + }, + { + "epoch": 2.866684406599255, + "grad_norm": 0.3294469118118286, + "learning_rate": 7.850124974519807e-08, + "loss": 0.1818, + "step": 10773 + }, + { + "epoch": 2.8669505055880786, + "grad_norm": 0.2835482656955719, + "learning_rate": 7.84847564911578e-08, + "loss": 0.1947, + "step": 10774 + }, + { + "epoch": 2.8672166045769027, + "grad_norm": 0.42415717244148254, + "learning_rate": 7.846826385077597e-08, + "loss": 0.201, + "step": 10775 + }, + { + "epoch": 2.8674827035657264, + "grad_norm": 0.3527891933917999, + "learning_rate": 7.845177182452303e-08, + "loss": 0.1857, + "step": 10776 + }, + { + "epoch": 2.8677488025545506, + "grad_norm": 0.32263460755348206, + "learning_rate": 7.843528041286932e-08, + "loss": 0.1822, + "step": 10777 + }, + { + "epoch": 2.8680149015433742, + "grad_norm": 0.33006155490875244, + "learning_rate": 7.841878961628528e-08, + "loss": 0.182, + "step": 10778 + }, + { + "epoch": 2.868281000532198, + "grad_norm": 0.2664697766304016, + "learning_rate": 7.840229943524116e-08, + "loss": 0.1688, + "step": 10779 + }, + { + "epoch": 2.868547099521022, + "grad_norm": 0.27780601382255554, + "learning_rate": 7.838580987020738e-08, + "loss": 0.1825, + "step": 10780 + }, + { + "epoch": 2.8688131985098457, + "grad_norm": 0.3284033536911011, + "learning_rate": 7.836932092165421e-08, + "loss": 0.175, + "step": 10781 + }, + { + "epoch": 2.8690792974986694, + "grad_norm": 0.44812625646591187, + "learning_rate": 7.835283259005199e-08, + "loss": 0.1828, + "step": 10782 + }, + { + "epoch": 2.8693453964874935, + "grad_norm": 0.2785613238811493, + "learning_rate": 7.833634487587094e-08, + "loss": 0.1787, + "step": 10783 + }, + { + "epoch": 2.869611495476317, + "grad_norm": 0.2697860300540924, + "learning_rate": 7.831985777958137e-08, + "loss": 0.1734, + "step": 10784 + }, + { + "epoch": 2.869877594465141, + "grad_norm": 0.27024227380752563, + "learning_rate": 7.830337130165349e-08, + "loss": 0.1576, + "step": 10785 + }, + { + "epoch": 2.870143693453965, + "grad_norm": 0.3038247227668762, + "learning_rate": 7.828688544255761e-08, + "loss": 0.1959, + "step": 10786 + }, + { + "epoch": 2.8704097924427887, + "grad_norm": 0.3116103410720825, + "learning_rate": 7.827040020276381e-08, + "loss": 0.1725, + "step": 10787 + }, + { + "epoch": 2.8706758914316124, + "grad_norm": 0.28749880194664, + "learning_rate": 7.825391558274237e-08, + "loss": 0.1801, + "step": 10788 + }, + { + "epoch": 2.8709419904204365, + "grad_norm": 0.28613510727882385, + "learning_rate": 7.823743158296348e-08, + "loss": 0.1821, + "step": 10789 + }, + { + "epoch": 2.87120808940926, + "grad_norm": 0.3611481785774231, + "learning_rate": 7.822094820389725e-08, + "loss": 0.1738, + "step": 10790 + }, + { + "epoch": 2.871474188398084, + "grad_norm": 0.28871527314186096, + "learning_rate": 7.820446544601382e-08, + "loss": 0.177, + "step": 10791 + }, + { + "epoch": 2.871740287386908, + "grad_norm": 0.299005389213562, + "learning_rate": 7.818798330978334e-08, + "loss": 0.1848, + "step": 10792 + }, + { + "epoch": 2.8720063863757317, + "grad_norm": 0.41918957233428955, + "learning_rate": 7.817150179567592e-08, + "loss": 0.1862, + "step": 10793 + }, + { + "epoch": 2.8722724853645554, + "grad_norm": 0.3059704899787903, + "learning_rate": 7.81550209041616e-08, + "loss": 0.1966, + "step": 10794 + }, + { + "epoch": 2.8725385843533795, + "grad_norm": 0.3556734025478363, + "learning_rate": 7.813854063571048e-08, + "loss": 0.1845, + "step": 10795 + }, + { + "epoch": 2.872804683342203, + "grad_norm": 0.2630341649055481, + "learning_rate": 7.812206099079259e-08, + "loss": 0.1647, + "step": 10796 + }, + { + "epoch": 2.8730707823310273, + "grad_norm": 0.2824721932411194, + "learning_rate": 7.810558196987802e-08, + "loss": 0.191, + "step": 10797 + }, + { + "epoch": 2.873336881319851, + "grad_norm": 0.29613572359085083, + "learning_rate": 7.80891035734367e-08, + "loss": 0.1855, + "step": 10798 + }, + { + "epoch": 2.8736029803086747, + "grad_norm": 0.29674625396728516, + "learning_rate": 7.807262580193869e-08, + "loss": 0.1951, + "step": 10799 + }, + { + "epoch": 2.873869079297499, + "grad_norm": 0.4414457380771637, + "learning_rate": 7.805614865585395e-08, + "loss": 0.1782, + "step": 10800 + }, + { + "epoch": 2.8741351782863225, + "grad_norm": 0.24321302771568298, + "learning_rate": 7.803967213565245e-08, + "loss": 0.1599, + "step": 10801 + }, + { + "epoch": 2.8744012772751466, + "grad_norm": 0.2802196443080902, + "learning_rate": 7.802319624180409e-08, + "loss": 0.1913, + "step": 10802 + }, + { + "epoch": 2.8746673762639703, + "grad_norm": 0.37871459126472473, + "learning_rate": 7.800672097477889e-08, + "loss": 0.1897, + "step": 10803 + }, + { + "epoch": 2.874933475252794, + "grad_norm": 0.2761460542678833, + "learning_rate": 7.799024633504667e-08, + "loss": 0.1711, + "step": 10804 + }, + { + "epoch": 2.875199574241618, + "grad_norm": 0.36535680294036865, + "learning_rate": 7.797377232307735e-08, + "loss": 0.185, + "step": 10805 + }, + { + "epoch": 2.8754656732304418, + "grad_norm": 0.5302501320838928, + "learning_rate": 7.795729893934078e-08, + "loss": 0.1711, + "step": 10806 + }, + { + "epoch": 2.8757317722192655, + "grad_norm": 0.2496652901172638, + "learning_rate": 7.794082618430685e-08, + "loss": 0.1634, + "step": 10807 + }, + { + "epoch": 2.8759978712080896, + "grad_norm": 0.2522977590560913, + "learning_rate": 7.792435405844544e-08, + "loss": 0.1635, + "step": 10808 + }, + { + "epoch": 2.8762639701969133, + "grad_norm": 0.27042171359062195, + "learning_rate": 7.790788256222625e-08, + "loss": 0.169, + "step": 10809 + }, + { + "epoch": 2.876530069185737, + "grad_norm": 0.27289676666259766, + "learning_rate": 7.789141169611915e-08, + "loss": 0.165, + "step": 10810 + }, + { + "epoch": 2.876796168174561, + "grad_norm": 0.2813898026943207, + "learning_rate": 7.787494146059391e-08, + "loss": 0.1651, + "step": 10811 + }, + { + "epoch": 2.8770622671633848, + "grad_norm": 0.2882651388645172, + "learning_rate": 7.785847185612034e-08, + "loss": 0.1776, + "step": 10812 + }, + { + "epoch": 2.8773283661522084, + "grad_norm": 0.2660444676876068, + "learning_rate": 7.784200288316808e-08, + "loss": 0.1864, + "step": 10813 + }, + { + "epoch": 2.8775944651410326, + "grad_norm": 0.29299232363700867, + "learning_rate": 7.782553454220701e-08, + "loss": 0.1734, + "step": 10814 + }, + { + "epoch": 2.8778605641298562, + "grad_norm": 0.29865220189094543, + "learning_rate": 7.780906683370669e-08, + "loss": 0.1756, + "step": 10815 + }, + { + "epoch": 2.87812666311868, + "grad_norm": 0.3835856020450592, + "learning_rate": 7.779259975813691e-08, + "loss": 0.1928, + "step": 10816 + }, + { + "epoch": 2.878392762107504, + "grad_norm": 0.35403773188591003, + "learning_rate": 7.777613331596728e-08, + "loss": 0.1798, + "step": 10817 + }, + { + "epoch": 2.8786588610963277, + "grad_norm": 0.3250838816165924, + "learning_rate": 7.775966750766754e-08, + "loss": 0.1784, + "step": 10818 + }, + { + "epoch": 2.8789249600851514, + "grad_norm": 0.2750466763973236, + "learning_rate": 7.774320233370724e-08, + "loss": 0.1767, + "step": 10819 + }, + { + "epoch": 2.8791910590739755, + "grad_norm": 0.2711116373538971, + "learning_rate": 7.772673779455606e-08, + "loss": 0.1714, + "step": 10820 + }, + { + "epoch": 2.8794571580627992, + "grad_norm": 0.33012866973876953, + "learning_rate": 7.771027389068354e-08, + "loss": 0.2093, + "step": 10821 + }, + { + "epoch": 2.8797232570516234, + "grad_norm": 0.31064698100090027, + "learning_rate": 7.769381062255936e-08, + "loss": 0.1876, + "step": 10822 + }, + { + "epoch": 2.879989356040447, + "grad_norm": 0.35300466418266296, + "learning_rate": 7.767734799065298e-08, + "loss": 0.1912, + "step": 10823 + }, + { + "epoch": 2.880255455029271, + "grad_norm": 0.3146970272064209, + "learning_rate": 7.766088599543398e-08, + "loss": 0.1899, + "step": 10824 + }, + { + "epoch": 2.880521554018095, + "grad_norm": 0.3046693801879883, + "learning_rate": 7.764442463737197e-08, + "loss": 0.1885, + "step": 10825 + }, + { + "epoch": 2.8807876530069185, + "grad_norm": 0.27994564175605774, + "learning_rate": 7.762796391693636e-08, + "loss": 0.1638, + "step": 10826 + }, + { + "epoch": 2.8810537519957427, + "grad_norm": 0.2995885908603668, + "learning_rate": 7.76115038345967e-08, + "loss": 0.201, + "step": 10827 + }, + { + "epoch": 2.8813198509845663, + "grad_norm": 0.2851881980895996, + "learning_rate": 7.759504439082242e-08, + "loss": 0.1897, + "step": 10828 + }, + { + "epoch": 2.88158594997339, + "grad_norm": 0.23587007820606232, + "learning_rate": 7.757858558608305e-08, + "loss": 0.1575, + "step": 10829 + }, + { + "epoch": 2.881852048962214, + "grad_norm": 0.27364224195480347, + "learning_rate": 7.756212742084794e-08, + "loss": 0.1713, + "step": 10830 + }, + { + "epoch": 2.882118147951038, + "grad_norm": 0.26258137822151184, + "learning_rate": 7.754566989558656e-08, + "loss": 0.1645, + "step": 10831 + }, + { + "epoch": 2.8823842469398615, + "grad_norm": 0.26861393451690674, + "learning_rate": 7.752921301076829e-08, + "loss": 0.1684, + "step": 10832 + }, + { + "epoch": 2.8826503459286856, + "grad_norm": 0.2624102532863617, + "learning_rate": 7.751275676686258e-08, + "loss": 0.1638, + "step": 10833 + }, + { + "epoch": 2.8829164449175093, + "grad_norm": 0.7598856687545776, + "learning_rate": 7.74963011643387e-08, + "loss": 0.1973, + "step": 10834 + }, + { + "epoch": 2.883182543906333, + "grad_norm": 0.5326431393623352, + "learning_rate": 7.747984620366605e-08, + "loss": 0.2192, + "step": 10835 + }, + { + "epoch": 2.883448642895157, + "grad_norm": 0.28531795740127563, + "learning_rate": 7.746339188531394e-08, + "loss": 0.1751, + "step": 10836 + }, + { + "epoch": 2.883714741883981, + "grad_norm": 0.282336950302124, + "learning_rate": 7.744693820975173e-08, + "loss": 0.1815, + "step": 10837 + }, + { + "epoch": 2.8839808408728045, + "grad_norm": 0.28510582447052, + "learning_rate": 7.74304851774486e-08, + "loss": 0.1938, + "step": 10838 + }, + { + "epoch": 2.8842469398616286, + "grad_norm": 0.3067948520183563, + "learning_rate": 7.741403278887396e-08, + "loss": 0.1778, + "step": 10839 + }, + { + "epoch": 2.8845130388504523, + "grad_norm": 0.29236453771591187, + "learning_rate": 7.739758104449696e-08, + "loss": 0.1878, + "step": 10840 + }, + { + "epoch": 2.884779137839276, + "grad_norm": 0.2893909513950348, + "learning_rate": 7.738112994478692e-08, + "loss": 0.1892, + "step": 10841 + }, + { + "epoch": 2.8850452368281, + "grad_norm": 0.2671110928058624, + "learning_rate": 7.736467949021296e-08, + "loss": 0.1726, + "step": 10842 + }, + { + "epoch": 2.885311335816924, + "grad_norm": 0.27988678216934204, + "learning_rate": 7.734822968124438e-08, + "loss": 0.1616, + "step": 10843 + }, + { + "epoch": 2.8855774348057475, + "grad_norm": 0.43911752104759216, + "learning_rate": 7.733178051835033e-08, + "loss": 0.1867, + "step": 10844 + }, + { + "epoch": 2.8858435337945716, + "grad_norm": 0.3383169174194336, + "learning_rate": 7.731533200199994e-08, + "loss": 0.1769, + "step": 10845 + }, + { + "epoch": 2.8861096327833953, + "grad_norm": 0.344913512468338, + "learning_rate": 7.729888413266239e-08, + "loss": 0.188, + "step": 10846 + }, + { + "epoch": 2.8863757317722194, + "grad_norm": 0.31704291701316833, + "learning_rate": 7.728243691080677e-08, + "loss": 0.1706, + "step": 10847 + }, + { + "epoch": 2.886641830761043, + "grad_norm": 0.32184794545173645, + "learning_rate": 7.726599033690226e-08, + "loss": 0.1764, + "step": 10848 + }, + { + "epoch": 2.886907929749867, + "grad_norm": 0.3236120343208313, + "learning_rate": 7.724954441141787e-08, + "loss": 0.1905, + "step": 10849 + }, + { + "epoch": 2.887174028738691, + "grad_norm": 0.27476584911346436, + "learning_rate": 7.723309913482273e-08, + "loss": 0.1828, + "step": 10850 + }, + { + "epoch": 2.8874401277275146, + "grad_norm": 0.3223634362220764, + "learning_rate": 7.721665450758583e-08, + "loss": 0.1779, + "step": 10851 + }, + { + "epoch": 2.8877062267163387, + "grad_norm": 0.2876710295677185, + "learning_rate": 7.720021053017626e-08, + "loss": 0.1782, + "step": 10852 + }, + { + "epoch": 2.8879723257051624, + "grad_norm": 0.2732040584087372, + "learning_rate": 7.718376720306301e-08, + "loss": 0.1764, + "step": 10853 + }, + { + "epoch": 2.888238424693986, + "grad_norm": 0.2796255052089691, + "learning_rate": 7.716732452671512e-08, + "loss": 0.1903, + "step": 10854 + }, + { + "epoch": 2.88850452368281, + "grad_norm": 0.2722060978412628, + "learning_rate": 7.715088250160149e-08, + "loss": 0.1939, + "step": 10855 + }, + { + "epoch": 2.888770622671634, + "grad_norm": 0.288821280002594, + "learning_rate": 7.713444112819114e-08, + "loss": 0.1868, + "step": 10856 + }, + { + "epoch": 2.8890367216604576, + "grad_norm": 0.3079127371311188, + "learning_rate": 7.711800040695297e-08, + "loss": 0.1796, + "step": 10857 + }, + { + "epoch": 2.8893028206492817, + "grad_norm": 0.26209020614624023, + "learning_rate": 7.710156033835598e-08, + "loss": 0.1719, + "step": 10858 + }, + { + "epoch": 2.8895689196381054, + "grad_norm": 0.40605974197387695, + "learning_rate": 7.708512092286899e-08, + "loss": 0.1656, + "step": 10859 + }, + { + "epoch": 2.889835018626929, + "grad_norm": 0.27099183201789856, + "learning_rate": 7.706868216096091e-08, + "loss": 0.185, + "step": 10860 + }, + { + "epoch": 2.890101117615753, + "grad_norm": 0.27924787998199463, + "learning_rate": 7.705224405310061e-08, + "loss": 0.1647, + "step": 10861 + }, + { + "epoch": 2.890367216604577, + "grad_norm": 0.3689245879650116, + "learning_rate": 7.703580659975695e-08, + "loss": 0.1877, + "step": 10862 + }, + { + "epoch": 2.8906333155934005, + "grad_norm": 0.27210646867752075, + "learning_rate": 7.701936980139875e-08, + "loss": 0.1797, + "step": 10863 + }, + { + "epoch": 2.8908994145822247, + "grad_norm": 0.8427523970603943, + "learning_rate": 7.700293365849482e-08, + "loss": 0.1765, + "step": 10864 + }, + { + "epoch": 2.8911655135710483, + "grad_norm": 0.2550513744354248, + "learning_rate": 7.698649817151399e-08, + "loss": 0.1712, + "step": 10865 + }, + { + "epoch": 2.891431612559872, + "grad_norm": 0.28943848609924316, + "learning_rate": 7.697006334092494e-08, + "loss": 0.1944, + "step": 10866 + }, + { + "epoch": 2.891697711548696, + "grad_norm": 0.2899145483970642, + "learning_rate": 7.695362916719651e-08, + "loss": 0.1591, + "step": 10867 + }, + { + "epoch": 2.89196381053752, + "grad_norm": 0.2633129358291626, + "learning_rate": 7.69371956507974e-08, + "loss": 0.1651, + "step": 10868 + }, + { + "epoch": 2.892229909526344, + "grad_norm": 0.2628125846385956, + "learning_rate": 7.692076279219637e-08, + "loss": 0.1658, + "step": 10869 + }, + { + "epoch": 2.8924960085151676, + "grad_norm": 0.31410178542137146, + "learning_rate": 7.690433059186205e-08, + "loss": 0.1699, + "step": 10870 + }, + { + "epoch": 2.8927621075039913, + "grad_norm": 0.3494108319282532, + "learning_rate": 7.688789905026317e-08, + "loss": 0.1666, + "step": 10871 + }, + { + "epoch": 2.8930282064928154, + "grad_norm": 0.27175432443618774, + "learning_rate": 7.687146816786836e-08, + "loss": 0.1821, + "step": 10872 + }, + { + "epoch": 2.893294305481639, + "grad_norm": 0.45157885551452637, + "learning_rate": 7.68550379451463e-08, + "loss": 0.2125, + "step": 10873 + }, + { + "epoch": 2.8935604044704633, + "grad_norm": 0.29307258129119873, + "learning_rate": 7.683860838256558e-08, + "loss": 0.185, + "step": 10874 + }, + { + "epoch": 2.893826503459287, + "grad_norm": 0.3067690134048462, + "learning_rate": 7.682217948059481e-08, + "loss": 0.1987, + "step": 10875 + }, + { + "epoch": 2.8940926024481106, + "grad_norm": 0.30017557740211487, + "learning_rate": 7.680575123970257e-08, + "loss": 0.1656, + "step": 10876 + }, + { + "epoch": 2.8943587014369347, + "grad_norm": 0.37197941541671753, + "learning_rate": 7.678932366035745e-08, + "loss": 0.1758, + "step": 10877 + }, + { + "epoch": 2.8946248004257584, + "grad_norm": 0.29378461837768555, + "learning_rate": 7.677289674302797e-08, + "loss": 0.1724, + "step": 10878 + }, + { + "epoch": 2.894890899414582, + "grad_norm": 0.29401740431785583, + "learning_rate": 7.675647048818267e-08, + "loss": 0.1742, + "step": 10879 + }, + { + "epoch": 2.8951569984034062, + "grad_norm": 0.2724069356918335, + "learning_rate": 7.674004489629011e-08, + "loss": 0.1756, + "step": 10880 + }, + { + "epoch": 2.89542309739223, + "grad_norm": 0.3640657961368561, + "learning_rate": 7.672361996781871e-08, + "loss": 0.1821, + "step": 10881 + }, + { + "epoch": 2.8956891963810536, + "grad_norm": 0.4258633553981781, + "learning_rate": 7.670719570323697e-08, + "loss": 0.1807, + "step": 10882 + }, + { + "epoch": 2.8959552953698777, + "grad_norm": 0.29914772510528564, + "learning_rate": 7.669077210301333e-08, + "loss": 0.1859, + "step": 10883 + }, + { + "epoch": 2.8962213943587014, + "grad_norm": 0.2969382107257843, + "learning_rate": 7.667434916761626e-08, + "loss": 0.1744, + "step": 10884 + }, + { + "epoch": 2.896487493347525, + "grad_norm": 0.4010900855064392, + "learning_rate": 7.665792689751413e-08, + "loss": 0.1861, + "step": 10885 + }, + { + "epoch": 2.896753592336349, + "grad_norm": 0.3393593728542328, + "learning_rate": 7.664150529317538e-08, + "loss": 0.1804, + "step": 10886 + }, + { + "epoch": 2.897019691325173, + "grad_norm": 0.4850502908229828, + "learning_rate": 7.662508435506834e-08, + "loss": 0.1887, + "step": 10887 + }, + { + "epoch": 2.8972857903139966, + "grad_norm": 0.34549400210380554, + "learning_rate": 7.660866408366144e-08, + "loss": 0.191, + "step": 10888 + }, + { + "epoch": 2.8975518893028207, + "grad_norm": 0.2986025810241699, + "learning_rate": 7.65922444794229e-08, + "loss": 0.1771, + "step": 10889 + }, + { + "epoch": 2.8978179882916444, + "grad_norm": 0.299163818359375, + "learning_rate": 7.657582554282118e-08, + "loss": 0.1946, + "step": 10890 + }, + { + "epoch": 2.898084087280468, + "grad_norm": 0.32673853635787964, + "learning_rate": 7.655940727432447e-08, + "loss": 0.1903, + "step": 10891 + }, + { + "epoch": 2.898350186269292, + "grad_norm": 0.3314315378665924, + "learning_rate": 7.654298967440113e-08, + "loss": 0.1938, + "step": 10892 + }, + { + "epoch": 2.898616285258116, + "grad_norm": 0.2693154215812683, + "learning_rate": 7.652657274351937e-08, + "loss": 0.1728, + "step": 10893 + }, + { + "epoch": 2.89888238424694, + "grad_norm": 0.3052581250667572, + "learning_rate": 7.65101564821475e-08, + "loss": 0.1658, + "step": 10894 + }, + { + "epoch": 2.8991484832357637, + "grad_norm": 0.3333931565284729, + "learning_rate": 7.649374089075364e-08, + "loss": 0.1836, + "step": 10895 + }, + { + "epoch": 2.899414582224588, + "grad_norm": 0.32253313064575195, + "learning_rate": 7.647732596980609e-08, + "loss": 0.1619, + "step": 10896 + }, + { + "epoch": 2.8996806812134115, + "grad_norm": 0.33315691351890564, + "learning_rate": 7.646091171977298e-08, + "loss": 0.1709, + "step": 10897 + }, + { + "epoch": 2.899946780202235, + "grad_norm": 0.36415934562683105, + "learning_rate": 7.64444981411225e-08, + "loss": 0.1764, + "step": 10898 + }, + { + "epoch": 2.9002128791910593, + "grad_norm": 0.2767408788204193, + "learning_rate": 7.642808523432286e-08, + "loss": 0.1841, + "step": 10899 + }, + { + "epoch": 2.900478978179883, + "grad_norm": 0.2879583239555359, + "learning_rate": 7.641167299984204e-08, + "loss": 0.1664, + "step": 10900 + }, + { + "epoch": 2.9007450771687067, + "grad_norm": 0.30046170949935913, + "learning_rate": 7.639526143814833e-08, + "loss": 0.1753, + "step": 10901 + }, + { + "epoch": 2.901011176157531, + "grad_norm": 0.3283626437187195, + "learning_rate": 7.637885054970968e-08, + "loss": 0.1689, + "step": 10902 + }, + { + "epoch": 2.9012772751463545, + "grad_norm": 0.3555300235748291, + "learning_rate": 7.636244033499421e-08, + "loss": 0.1775, + "step": 10903 + }, + { + "epoch": 2.901543374135178, + "grad_norm": 0.4060944616794586, + "learning_rate": 7.634603079446999e-08, + "loss": 0.1943, + "step": 10904 + }, + { + "epoch": 2.9018094731240023, + "grad_norm": 0.4724177420139313, + "learning_rate": 7.632962192860506e-08, + "loss": 0.1611, + "step": 10905 + }, + { + "epoch": 2.902075572112826, + "grad_norm": 0.3226189911365509, + "learning_rate": 7.631321373786736e-08, + "loss": 0.1992, + "step": 10906 + }, + { + "epoch": 2.9023416711016496, + "grad_norm": 0.3160596191883087, + "learning_rate": 7.629680622272498e-08, + "loss": 0.1948, + "step": 10907 + }, + { + "epoch": 2.9026077700904738, + "grad_norm": 0.3424290120601654, + "learning_rate": 7.628039938364584e-08, + "loss": 0.1692, + "step": 10908 + }, + { + "epoch": 2.9028738690792975, + "grad_norm": 0.2818063199520111, + "learning_rate": 7.626399322109793e-08, + "loss": 0.1774, + "step": 10909 + }, + { + "epoch": 2.903139968068121, + "grad_norm": 0.26121991872787476, + "learning_rate": 7.624758773554914e-08, + "loss": 0.1849, + "step": 10910 + }, + { + "epoch": 2.9034060670569453, + "grad_norm": 0.42540374398231506, + "learning_rate": 7.623118292746742e-08, + "loss": 0.1689, + "step": 10911 + }, + { + "epoch": 2.903672166045769, + "grad_norm": 0.35236093401908875, + "learning_rate": 7.621477879732065e-08, + "loss": 0.1711, + "step": 10912 + }, + { + "epoch": 2.9039382650345926, + "grad_norm": 0.36756622791290283, + "learning_rate": 7.619837534557678e-08, + "loss": 0.1686, + "step": 10913 + }, + { + "epoch": 2.9042043640234168, + "grad_norm": 0.272882878780365, + "learning_rate": 7.618197257270353e-08, + "loss": 0.166, + "step": 10914 + }, + { + "epoch": 2.9044704630122404, + "grad_norm": 0.2608303427696228, + "learning_rate": 7.616557047916889e-08, + "loss": 0.1652, + "step": 10915 + }, + { + "epoch": 2.9047365620010646, + "grad_norm": 0.3107356131076813, + "learning_rate": 7.614916906544058e-08, + "loss": 0.2012, + "step": 10916 + }, + { + "epoch": 2.9050026609898882, + "grad_norm": 0.3536480665206909, + "learning_rate": 7.613276833198643e-08, + "loss": 0.1791, + "step": 10917 + }, + { + "epoch": 2.905268759978712, + "grad_norm": 0.3954502046108246, + "learning_rate": 7.611636827927424e-08, + "loss": 0.2071, + "step": 10918 + }, + { + "epoch": 2.905534858967536, + "grad_norm": 0.26999008655548096, + "learning_rate": 7.609996890777175e-08, + "loss": 0.1649, + "step": 10919 + }, + { + "epoch": 2.9058009579563597, + "grad_norm": 0.2842584252357483, + "learning_rate": 7.608357021794676e-08, + "loss": 0.1835, + "step": 10920 + }, + { + "epoch": 2.906067056945184, + "grad_norm": 0.38268783688545227, + "learning_rate": 7.60671722102669e-08, + "loss": 0.1688, + "step": 10921 + }, + { + "epoch": 2.9063331559340075, + "grad_norm": 0.26900145411491394, + "learning_rate": 7.605077488519995e-08, + "loss": 0.1681, + "step": 10922 + }, + { + "epoch": 2.906599254922831, + "grad_norm": 0.26798075437545776, + "learning_rate": 7.603437824321353e-08, + "loss": 0.1797, + "step": 10923 + }, + { + "epoch": 2.9068653539116553, + "grad_norm": 0.4361986517906189, + "learning_rate": 7.601798228477539e-08, + "loss": 0.1948, + "step": 10924 + }, + { + "epoch": 2.907131452900479, + "grad_norm": 0.2683769464492798, + "learning_rate": 7.600158701035309e-08, + "loss": 0.1954, + "step": 10925 + }, + { + "epoch": 2.9073975518893027, + "grad_norm": 0.28301316499710083, + "learning_rate": 7.598519242041432e-08, + "loss": 0.1891, + "step": 10926 + }, + { + "epoch": 2.907663650878127, + "grad_norm": 0.2711033523082733, + "learning_rate": 7.596879851542665e-08, + "loss": 0.1699, + "step": 10927 + }, + { + "epoch": 2.9079297498669505, + "grad_norm": 0.26825445890426636, + "learning_rate": 7.595240529585768e-08, + "loss": 0.1808, + "step": 10928 + }, + { + "epoch": 2.908195848855774, + "grad_norm": 0.39202553033828735, + "learning_rate": 7.593601276217497e-08, + "loss": 0.2025, + "step": 10929 + }, + { + "epoch": 2.9084619478445983, + "grad_norm": 0.2867361009120941, + "learning_rate": 7.591962091484612e-08, + "loss": 0.1918, + "step": 10930 + }, + { + "epoch": 2.908728046833422, + "grad_norm": 0.29476165771484375, + "learning_rate": 7.590322975433856e-08, + "loss": 0.1718, + "step": 10931 + }, + { + "epoch": 2.9089941458222457, + "grad_norm": 0.37478339672088623, + "learning_rate": 7.58868392811199e-08, + "loss": 0.1832, + "step": 10932 + }, + { + "epoch": 2.90926024481107, + "grad_norm": 0.5184133052825928, + "learning_rate": 7.587044949565755e-08, + "loss": 0.199, + "step": 10933 + }, + { + "epoch": 2.9095263437998935, + "grad_norm": 0.26582005620002747, + "learning_rate": 7.5854060398419e-08, + "loss": 0.1754, + "step": 10934 + }, + { + "epoch": 2.909792442788717, + "grad_norm": 0.3891560733318329, + "learning_rate": 7.583767198987177e-08, + "loss": 0.1801, + "step": 10935 + }, + { + "epoch": 2.9100585417775413, + "grad_norm": 0.355021208524704, + "learning_rate": 7.582128427048319e-08, + "loss": 0.1688, + "step": 10936 + }, + { + "epoch": 2.910324640766365, + "grad_norm": 0.2822258770465851, + "learning_rate": 7.580489724072073e-08, + "loss": 0.168, + "step": 10937 + }, + { + "epoch": 2.9105907397551887, + "grad_norm": 0.26679331064224243, + "learning_rate": 7.578851090105176e-08, + "loss": 0.1634, + "step": 10938 + }, + { + "epoch": 2.910856838744013, + "grad_norm": 0.26859578490257263, + "learning_rate": 7.577212525194368e-08, + "loss": 0.1708, + "step": 10939 + }, + { + "epoch": 2.9111229377328365, + "grad_norm": 0.40181347727775574, + "learning_rate": 7.575574029386379e-08, + "loss": 0.1815, + "step": 10940 + }, + { + "epoch": 2.9113890367216606, + "grad_norm": 0.3684786856174469, + "learning_rate": 7.57393560272795e-08, + "loss": 0.1945, + "step": 10941 + }, + { + "epoch": 2.9116551357104843, + "grad_norm": 0.2611311078071594, + "learning_rate": 7.572297245265805e-08, + "loss": 0.1789, + "step": 10942 + }, + { + "epoch": 2.9119212346993084, + "grad_norm": 0.28356054425239563, + "learning_rate": 7.570658957046675e-08, + "loss": 0.1663, + "step": 10943 + }, + { + "epoch": 2.912187333688132, + "grad_norm": 0.2821877598762512, + "learning_rate": 7.569020738117288e-08, + "loss": 0.1859, + "step": 10944 + }, + { + "epoch": 2.9124534326769558, + "grad_norm": 0.445758193731308, + "learning_rate": 7.567382588524374e-08, + "loss": 0.1784, + "step": 10945 + }, + { + "epoch": 2.91271953166578, + "grad_norm": 0.27560290694236755, + "learning_rate": 7.565744508314646e-08, + "loss": 0.1702, + "step": 10946 + }, + { + "epoch": 2.9129856306546036, + "grad_norm": 0.24663382768630981, + "learning_rate": 7.564106497534834e-08, + "loss": 0.1603, + "step": 10947 + }, + { + "epoch": 2.9132517296434273, + "grad_norm": 0.26892974972724915, + "learning_rate": 7.562468556231653e-08, + "loss": 0.1829, + "step": 10948 + }, + { + "epoch": 2.9135178286322514, + "grad_norm": 0.40861210227012634, + "learning_rate": 7.560830684451826e-08, + "loss": 0.1785, + "step": 10949 + }, + { + "epoch": 2.913783927621075, + "grad_norm": 0.34881681203842163, + "learning_rate": 7.55919288224206e-08, + "loss": 0.1733, + "step": 10950 + }, + { + "epoch": 2.9140500266098988, + "grad_norm": 0.2749529480934143, + "learning_rate": 7.557555149649076e-08, + "loss": 0.1679, + "step": 10951 + }, + { + "epoch": 2.914316125598723, + "grad_norm": 0.27808836102485657, + "learning_rate": 7.55591748671958e-08, + "loss": 0.1638, + "step": 10952 + }, + { + "epoch": 2.9145822245875466, + "grad_norm": 0.27574723958969116, + "learning_rate": 7.554279893500284e-08, + "loss": 0.1769, + "step": 10953 + }, + { + "epoch": 2.9148483235763702, + "grad_norm": 0.33249542117118835, + "learning_rate": 7.552642370037897e-08, + "loss": 0.1695, + "step": 10954 + }, + { + "epoch": 2.9151144225651944, + "grad_norm": 0.2798928916454315, + "learning_rate": 7.551004916379121e-08, + "loss": 0.1764, + "step": 10955 + }, + { + "epoch": 2.915380521554018, + "grad_norm": 0.2953132688999176, + "learning_rate": 7.549367532570667e-08, + "loss": 0.1831, + "step": 10956 + }, + { + "epoch": 2.9156466205428417, + "grad_norm": 0.2687094509601593, + "learning_rate": 7.547730218659224e-08, + "loss": 0.1638, + "step": 10957 + }, + { + "epoch": 2.915912719531666, + "grad_norm": 0.2707637846469879, + "learning_rate": 7.546092974691503e-08, + "loss": 0.1807, + "step": 10958 + }, + { + "epoch": 2.9161788185204895, + "grad_norm": 0.3804275691509247, + "learning_rate": 7.544455800714193e-08, + "loss": 0.1855, + "step": 10959 + }, + { + "epoch": 2.9164449175093132, + "grad_norm": 0.272461861371994, + "learning_rate": 7.542818696773998e-08, + "loss": 0.1714, + "step": 10960 + }, + { + "epoch": 2.9167110164981374, + "grad_norm": 0.31090110540390015, + "learning_rate": 7.541181662917603e-08, + "loss": 0.1812, + "step": 10961 + }, + { + "epoch": 2.916977115486961, + "grad_norm": 0.49069473147392273, + "learning_rate": 7.539544699191706e-08, + "loss": 0.1803, + "step": 10962 + }, + { + "epoch": 2.9172432144757847, + "grad_norm": 0.37120649218559265, + "learning_rate": 7.537907805642991e-08, + "loss": 0.1758, + "step": 10963 + }, + { + "epoch": 2.917509313464609, + "grad_norm": 0.3312615752220154, + "learning_rate": 7.536270982318153e-08, + "loss": 0.1723, + "step": 10964 + }, + { + "epoch": 2.9177754124534325, + "grad_norm": 0.270477294921875, + "learning_rate": 7.534634229263867e-08, + "loss": 0.1808, + "step": 10965 + }, + { + "epoch": 2.9180415114422567, + "grad_norm": 0.2854403555393219, + "learning_rate": 7.53299754652683e-08, + "loss": 0.1901, + "step": 10966 + }, + { + "epoch": 2.9183076104310803, + "grad_norm": 0.37698808312416077, + "learning_rate": 7.531360934153712e-08, + "loss": 0.1885, + "step": 10967 + }, + { + "epoch": 2.9185737094199045, + "grad_norm": 0.4097413420677185, + "learning_rate": 7.529724392191198e-08, + "loss": 0.169, + "step": 10968 + }, + { + "epoch": 2.918839808408728, + "grad_norm": 0.2673322558403015, + "learning_rate": 7.528087920685962e-08, + "loss": 0.1762, + "step": 10969 + }, + { + "epoch": 2.919105907397552, + "grad_norm": 0.3690468668937683, + "learning_rate": 7.526451519684685e-08, + "loss": 0.1931, + "step": 10970 + }, + { + "epoch": 2.919372006386376, + "grad_norm": 0.34479308128356934, + "learning_rate": 7.524815189234035e-08, + "loss": 0.184, + "step": 10971 + }, + { + "epoch": 2.9196381053751996, + "grad_norm": 0.2712118327617645, + "learning_rate": 7.523178929380685e-08, + "loss": 0.1895, + "step": 10972 + }, + { + "epoch": 2.9199042043640233, + "grad_norm": 0.3157205581665039, + "learning_rate": 7.521542740171307e-08, + "loss": 0.1955, + "step": 10973 + }, + { + "epoch": 2.9201703033528474, + "grad_norm": 0.26466265320777893, + "learning_rate": 7.519906621652563e-08, + "loss": 0.1663, + "step": 10974 + }, + { + "epoch": 2.920436402341671, + "grad_norm": 0.431514710187912, + "learning_rate": 7.518270573871127e-08, + "loss": 0.178, + "step": 10975 + }, + { + "epoch": 2.920702501330495, + "grad_norm": 0.32568085193634033, + "learning_rate": 7.516634596873653e-08, + "loss": 0.182, + "step": 10976 + }, + { + "epoch": 2.920968600319319, + "grad_norm": 0.2943209409713745, + "learning_rate": 7.51499869070681e-08, + "loss": 0.1784, + "step": 10977 + }, + { + "epoch": 2.9212346993081426, + "grad_norm": 0.27634990215301514, + "learning_rate": 7.513362855417251e-08, + "loss": 0.1726, + "step": 10978 + }, + { + "epoch": 2.9215007982969663, + "grad_norm": 0.2768780589103699, + "learning_rate": 7.511727091051638e-08, + "loss": 0.1595, + "step": 10979 + }, + { + "epoch": 2.9217668972857904, + "grad_norm": 0.44378483295440674, + "learning_rate": 7.510091397656624e-08, + "loss": 0.2063, + "step": 10980 + }, + { + "epoch": 2.922032996274614, + "grad_norm": 0.27426064014434814, + "learning_rate": 7.508455775278867e-08, + "loss": 0.1803, + "step": 10981 + }, + { + "epoch": 2.922299095263438, + "grad_norm": 0.2533174157142639, + "learning_rate": 7.50682022396501e-08, + "loss": 0.1681, + "step": 10982 + }, + { + "epoch": 2.922565194252262, + "grad_norm": 0.25904345512390137, + "learning_rate": 7.505184743761709e-08, + "loss": 0.1639, + "step": 10983 + }, + { + "epoch": 2.9228312932410856, + "grad_norm": 0.29131564497947693, + "learning_rate": 7.503549334715607e-08, + "loss": 0.1674, + "step": 10984 + }, + { + "epoch": 2.9230973922299093, + "grad_norm": 0.2902502715587616, + "learning_rate": 7.501913996873354e-08, + "loss": 0.1905, + "step": 10985 + }, + { + "epoch": 2.9233634912187334, + "grad_norm": 0.3799320459365845, + "learning_rate": 7.500278730281586e-08, + "loss": 0.1713, + "step": 10986 + }, + { + "epoch": 2.923629590207557, + "grad_norm": 0.2837916910648346, + "learning_rate": 7.49864353498695e-08, + "loss": 0.1853, + "step": 10987 + }, + { + "epoch": 2.923895689196381, + "grad_norm": 0.27312129735946655, + "learning_rate": 7.497008411036083e-08, + "loss": 0.1818, + "step": 10988 + }, + { + "epoch": 2.924161788185205, + "grad_norm": 0.3466397523880005, + "learning_rate": 7.49537335847562e-08, + "loss": 0.2003, + "step": 10989 + }, + { + "epoch": 2.9244278871740286, + "grad_norm": 0.41741490364074707, + "learning_rate": 7.493738377352203e-08, + "loss": 0.202, + "step": 10990 + }, + { + "epoch": 2.9246939861628527, + "grad_norm": 0.35362738370895386, + "learning_rate": 7.492103467712459e-08, + "loss": 0.1833, + "step": 10991 + }, + { + "epoch": 2.9249600851516764, + "grad_norm": 0.2547684609889984, + "learning_rate": 7.490468629603021e-08, + "loss": 0.1694, + "step": 10992 + }, + { + "epoch": 2.9252261841405005, + "grad_norm": 0.30729609727859497, + "learning_rate": 7.488833863070516e-08, + "loss": 0.1937, + "step": 10993 + }, + { + "epoch": 2.925492283129324, + "grad_norm": 0.34865015745162964, + "learning_rate": 7.487199168161573e-08, + "loss": 0.1785, + "step": 10994 + }, + { + "epoch": 2.925758382118148, + "grad_norm": 0.312100350856781, + "learning_rate": 7.485564544922816e-08, + "loss": 0.1716, + "step": 10995 + }, + { + "epoch": 2.926024481106972, + "grad_norm": 0.33421093225479126, + "learning_rate": 7.48392999340087e-08, + "loss": 0.1862, + "step": 10996 + }, + { + "epoch": 2.9262905800957957, + "grad_norm": 0.34656423330307007, + "learning_rate": 7.482295513642349e-08, + "loss": 0.1669, + "step": 10997 + }, + { + "epoch": 2.9265566790846194, + "grad_norm": 0.2956163287162781, + "learning_rate": 7.480661105693878e-08, + "loss": 0.2043, + "step": 10998 + }, + { + "epoch": 2.9268227780734435, + "grad_norm": 0.30840545892715454, + "learning_rate": 7.479026769602071e-08, + "loss": 0.1873, + "step": 10999 + }, + { + "epoch": 2.927088877062267, + "grad_norm": 0.26427289843559265, + "learning_rate": 7.477392505413546e-08, + "loss": 0.1776, + "step": 11000 + }, + { + "epoch": 2.927354976051091, + "grad_norm": 0.34797435998916626, + "learning_rate": 7.475758313174912e-08, + "loss": 0.1694, + "step": 11001 + }, + { + "epoch": 2.927621075039915, + "grad_norm": 0.2857874631881714, + "learning_rate": 7.47412419293278e-08, + "loss": 0.191, + "step": 11002 + }, + { + "epoch": 2.9278871740287387, + "grad_norm": 0.32573601603507996, + "learning_rate": 7.472490144733757e-08, + "loss": 0.1877, + "step": 11003 + }, + { + "epoch": 2.9281532730175623, + "grad_norm": 0.2719838619232178, + "learning_rate": 7.470856168624453e-08, + "loss": 0.1618, + "step": 11004 + }, + { + "epoch": 2.9284193720063865, + "grad_norm": 0.27262553572654724, + "learning_rate": 7.46922226465147e-08, + "loss": 0.181, + "step": 11005 + }, + { + "epoch": 2.92868547099521, + "grad_norm": 0.3663425147533417, + "learning_rate": 7.467588432861414e-08, + "loss": 0.186, + "step": 11006 + }, + { + "epoch": 2.928951569984034, + "grad_norm": 0.289768785238266, + "learning_rate": 7.465954673300876e-08, + "loss": 0.1793, + "step": 11007 + }, + { + "epoch": 2.929217668972858, + "grad_norm": 0.3273252248764038, + "learning_rate": 7.464320986016462e-08, + "loss": 0.1845, + "step": 11008 + }, + { + "epoch": 2.9294837679616816, + "grad_norm": 0.2625802159309387, + "learning_rate": 7.462687371054768e-08, + "loss": 0.1632, + "step": 11009 + }, + { + "epoch": 2.9297498669505053, + "grad_norm": 0.2763980031013489, + "learning_rate": 7.461053828462385e-08, + "loss": 0.1874, + "step": 11010 + }, + { + "epoch": 2.9300159659393294, + "grad_norm": 0.27964097261428833, + "learning_rate": 7.459420358285909e-08, + "loss": 0.1708, + "step": 11011 + }, + { + "epoch": 2.930282064928153, + "grad_norm": 0.3606792092323303, + "learning_rate": 7.457786960571924e-08, + "loss": 0.1828, + "step": 11012 + }, + { + "epoch": 2.9305481639169773, + "grad_norm": 0.26058024168014526, + "learning_rate": 7.456153635367022e-08, + "loss": 0.1662, + "step": 11013 + }, + { + "epoch": 2.930814262905801, + "grad_norm": 0.35128095746040344, + "learning_rate": 7.454520382717786e-08, + "loss": 0.1961, + "step": 11014 + }, + { + "epoch": 2.931080361894625, + "grad_norm": 0.3300124704837799, + "learning_rate": 7.452887202670805e-08, + "loss": 0.1867, + "step": 11015 + }, + { + "epoch": 2.9313464608834487, + "grad_norm": 0.3508167266845703, + "learning_rate": 7.451254095272652e-08, + "loss": 0.1747, + "step": 11016 + }, + { + "epoch": 2.9316125598722724, + "grad_norm": 0.294653981924057, + "learning_rate": 7.449621060569917e-08, + "loss": 0.1925, + "step": 11017 + }, + { + "epoch": 2.9318786588610966, + "grad_norm": 0.9360138773918152, + "learning_rate": 7.447988098609169e-08, + "loss": 0.1863, + "step": 11018 + }, + { + "epoch": 2.9321447578499202, + "grad_norm": 0.4193425178527832, + "learning_rate": 7.446355209436987e-08, + "loss": 0.1837, + "step": 11019 + }, + { + "epoch": 2.932410856838744, + "grad_norm": 0.28256380558013916, + "learning_rate": 7.444722393099942e-08, + "loss": 0.1761, + "step": 11020 + }, + { + "epoch": 2.932676955827568, + "grad_norm": 0.3451201915740967, + "learning_rate": 7.443089649644612e-08, + "loss": 0.1972, + "step": 11021 + }, + { + "epoch": 2.9329430548163917, + "grad_norm": 0.333486407995224, + "learning_rate": 7.441456979117555e-08, + "loss": 0.1961, + "step": 11022 + }, + { + "epoch": 2.9332091538052154, + "grad_norm": 0.398857444524765, + "learning_rate": 7.439824381565347e-08, + "loss": 0.1762, + "step": 11023 + }, + { + "epoch": 2.9334752527940395, + "grad_norm": 0.2626563608646393, + "learning_rate": 7.438191857034548e-08, + "loss": 0.1802, + "step": 11024 + }, + { + "epoch": 2.933741351782863, + "grad_norm": 0.26065608859062195, + "learning_rate": 7.436559405571723e-08, + "loss": 0.1814, + "step": 11025 + }, + { + "epoch": 2.934007450771687, + "grad_norm": 0.3719266951084137, + "learning_rate": 7.434927027223439e-08, + "loss": 0.1859, + "step": 11026 + }, + { + "epoch": 2.934273549760511, + "grad_norm": 0.27191293239593506, + "learning_rate": 7.433294722036241e-08, + "loss": 0.1681, + "step": 11027 + }, + { + "epoch": 2.9345396487493347, + "grad_norm": 0.2897367775440216, + "learning_rate": 7.431662490056698e-08, + "loss": 0.1727, + "step": 11028 + }, + { + "epoch": 2.9348057477381584, + "grad_norm": 0.2894541323184967, + "learning_rate": 7.430030331331358e-08, + "loss": 0.1775, + "step": 11029 + }, + { + "epoch": 2.9350718467269825, + "grad_norm": 0.3472191393375397, + "learning_rate": 7.428398245906776e-08, + "loss": 0.1926, + "step": 11030 + }, + { + "epoch": 2.935337945715806, + "grad_norm": 0.2968413233757019, + "learning_rate": 7.4267662338295e-08, + "loss": 0.1599, + "step": 11031 + }, + { + "epoch": 2.93560404470463, + "grad_norm": 0.2627648711204529, + "learning_rate": 7.425134295146085e-08, + "loss": 0.1679, + "step": 11032 + }, + { + "epoch": 2.935870143693454, + "grad_norm": 0.4085259735584259, + "learning_rate": 7.423502429903069e-08, + "loss": 0.1745, + "step": 11033 + }, + { + "epoch": 2.9361362426822777, + "grad_norm": 0.2704102694988251, + "learning_rate": 7.421870638146999e-08, + "loss": 0.1849, + "step": 11034 + }, + { + "epoch": 2.936402341671102, + "grad_norm": 0.255359947681427, + "learning_rate": 7.420238919924416e-08, + "loss": 0.173, + "step": 11035 + }, + { + "epoch": 2.9366684406599255, + "grad_norm": 0.2708660364151001, + "learning_rate": 7.418607275281867e-08, + "loss": 0.1631, + "step": 11036 + }, + { + "epoch": 2.936934539648749, + "grad_norm": 0.3839189410209656, + "learning_rate": 7.416975704265879e-08, + "loss": 0.1896, + "step": 11037 + }, + { + "epoch": 2.9372006386375733, + "grad_norm": 0.3345904052257538, + "learning_rate": 7.415344206922994e-08, + "loss": 0.1764, + "step": 11038 + }, + { + "epoch": 2.937466737626397, + "grad_norm": 0.2628374695777893, + "learning_rate": 7.413712783299743e-08, + "loss": 0.1832, + "step": 11039 + }, + { + "epoch": 2.937732836615221, + "grad_norm": 0.3245428204536438, + "learning_rate": 7.412081433442665e-08, + "loss": 0.1855, + "step": 11040 + }, + { + "epoch": 2.937998935604045, + "grad_norm": 0.4561520218849182, + "learning_rate": 7.410450157398275e-08, + "loss": 0.2063, + "step": 11041 + }, + { + "epoch": 2.9382650345928685, + "grad_norm": 0.2719150185585022, + "learning_rate": 7.408818955213117e-08, + "loss": 0.1808, + "step": 11042 + }, + { + "epoch": 2.9385311335816926, + "grad_norm": 0.2911061644554138, + "learning_rate": 7.4071878269337e-08, + "loss": 0.1846, + "step": 11043 + }, + { + "epoch": 2.9387972325705163, + "grad_norm": 0.3307470977306366, + "learning_rate": 7.405556772606558e-08, + "loss": 0.1884, + "step": 11044 + }, + { + "epoch": 2.93906333155934, + "grad_norm": 0.2530484199523926, + "learning_rate": 7.403925792278212e-08, + "loss": 0.1658, + "step": 11045 + }, + { + "epoch": 2.939329430548164, + "grad_norm": 0.26701366901397705, + "learning_rate": 7.402294885995173e-08, + "loss": 0.1804, + "step": 11046 + }, + { + "epoch": 2.9395955295369878, + "grad_norm": 0.3342253863811493, + "learning_rate": 7.400664053803968e-08, + "loss": 0.159, + "step": 11047 + }, + { + "epoch": 2.9398616285258115, + "grad_norm": 0.33235132694244385, + "learning_rate": 7.399033295751102e-08, + "loss": 0.1767, + "step": 11048 + }, + { + "epoch": 2.9401277275146356, + "grad_norm": 0.27587810158729553, + "learning_rate": 7.397402611883094e-08, + "loss": 0.1696, + "step": 11049 + }, + { + "epoch": 2.9403938265034593, + "grad_norm": 0.326406329870224, + "learning_rate": 7.395772002246451e-08, + "loss": 0.1832, + "step": 11050 + }, + { + "epoch": 2.940659925492283, + "grad_norm": 0.5529881715774536, + "learning_rate": 7.394141466887687e-08, + "loss": 0.1926, + "step": 11051 + }, + { + "epoch": 2.940926024481107, + "grad_norm": 0.3127451241016388, + "learning_rate": 7.392511005853296e-08, + "loss": 0.1878, + "step": 11052 + }, + { + "epoch": 2.9411921234699308, + "grad_norm": 0.3678264915943146, + "learning_rate": 7.390880619189796e-08, + "loss": 0.1676, + "step": 11053 + }, + { + "epoch": 2.9414582224587544, + "grad_norm": 0.25603345036506653, + "learning_rate": 7.389250306943679e-08, + "loss": 0.1708, + "step": 11054 + }, + { + "epoch": 2.9417243214475786, + "grad_norm": 0.4712657928466797, + "learning_rate": 7.387620069161451e-08, + "loss": 0.2051, + "step": 11055 + }, + { + "epoch": 2.9419904204364022, + "grad_norm": 0.264342337846756, + "learning_rate": 7.385989905889605e-08, + "loss": 0.1707, + "step": 11056 + }, + { + "epoch": 2.942256519425226, + "grad_norm": 0.2667854428291321, + "learning_rate": 7.384359817174642e-08, + "loss": 0.1787, + "step": 11057 + }, + { + "epoch": 2.94252261841405, + "grad_norm": 0.27487242221832275, + "learning_rate": 7.38272980306305e-08, + "loss": 0.1731, + "step": 11058 + }, + { + "epoch": 2.9427887174028737, + "grad_norm": 0.3649277091026306, + "learning_rate": 7.381099863601323e-08, + "loss": 0.1903, + "step": 11059 + }, + { + "epoch": 2.943054816391698, + "grad_norm": 0.43564116954803467, + "learning_rate": 7.379469998835948e-08, + "loss": 0.2048, + "step": 11060 + }, + { + "epoch": 2.9433209153805215, + "grad_norm": 0.2888026833534241, + "learning_rate": 7.377840208813418e-08, + "loss": 0.1916, + "step": 11061 + }, + { + "epoch": 2.9435870143693457, + "grad_norm": 0.4089389443397522, + "learning_rate": 7.376210493580211e-08, + "loss": 0.1922, + "step": 11062 + }, + { + "epoch": 2.9438531133581693, + "grad_norm": 0.27452409267425537, + "learning_rate": 7.374580853182811e-08, + "loss": 0.178, + "step": 11063 + }, + { + "epoch": 2.944119212346993, + "grad_norm": 0.32413920760154724, + "learning_rate": 7.372951287667701e-08, + "loss": 0.1817, + "step": 11064 + }, + { + "epoch": 2.944385311335817, + "grad_norm": 0.35027727484703064, + "learning_rate": 7.371321797081359e-08, + "loss": 0.1912, + "step": 11065 + }, + { + "epoch": 2.944651410324641, + "grad_norm": 0.27700358629226685, + "learning_rate": 7.369692381470263e-08, + "loss": 0.1887, + "step": 11066 + }, + { + "epoch": 2.9449175093134645, + "grad_norm": 0.34878331422805786, + "learning_rate": 7.368063040880882e-08, + "loss": 0.1978, + "step": 11067 + }, + { + "epoch": 2.9451836083022886, + "grad_norm": 0.28116145730018616, + "learning_rate": 7.366433775359696e-08, + "loss": 0.1812, + "step": 11068 + }, + { + "epoch": 2.9454497072911123, + "grad_norm": 0.443296879529953, + "learning_rate": 7.364804584953166e-08, + "loss": 0.18, + "step": 11069 + }, + { + "epoch": 2.945715806279936, + "grad_norm": 0.2905201315879822, + "learning_rate": 7.363175469707768e-08, + "loss": 0.169, + "step": 11070 + }, + { + "epoch": 2.94598190526876, + "grad_norm": 0.284313827753067, + "learning_rate": 7.361546429669959e-08, + "loss": 0.1921, + "step": 11071 + }, + { + "epoch": 2.946248004257584, + "grad_norm": 0.2760848104953766, + "learning_rate": 7.359917464886214e-08, + "loss": 0.1756, + "step": 11072 + }, + { + "epoch": 2.9465141032464075, + "grad_norm": 0.4047538936138153, + "learning_rate": 7.358288575402983e-08, + "loss": 0.1864, + "step": 11073 + }, + { + "epoch": 2.9467802022352316, + "grad_norm": 0.4152393937110901, + "learning_rate": 7.35665976126673e-08, + "loss": 0.1813, + "step": 11074 + }, + { + "epoch": 2.9470463012240553, + "grad_norm": 0.3716292977333069, + "learning_rate": 7.355031022523912e-08, + "loss": 0.1995, + "step": 11075 + }, + { + "epoch": 2.947312400212879, + "grad_norm": 0.27409398555755615, + "learning_rate": 7.35340235922099e-08, + "loss": 0.1787, + "step": 11076 + }, + { + "epoch": 2.947578499201703, + "grad_norm": 0.27697446942329407, + "learning_rate": 7.351773771404403e-08, + "loss": 0.1884, + "step": 11077 + }, + { + "epoch": 2.947844598190527, + "grad_norm": 0.3430256247520447, + "learning_rate": 7.350145259120614e-08, + "loss": 0.1835, + "step": 11078 + }, + { + "epoch": 2.9481106971793505, + "grad_norm": 0.3596770763397217, + "learning_rate": 7.348516822416064e-08, + "loss": 0.1873, + "step": 11079 + }, + { + "epoch": 2.9483767961681746, + "grad_norm": 0.2835884094238281, + "learning_rate": 7.346888461337202e-08, + "loss": 0.1641, + "step": 11080 + }, + { + "epoch": 2.9486428951569983, + "grad_norm": 0.2646729350090027, + "learning_rate": 7.345260175930476e-08, + "loss": 0.1687, + "step": 11081 + }, + { + "epoch": 2.948908994145822, + "grad_norm": 0.27036938071250916, + "learning_rate": 7.34363196624232e-08, + "loss": 0.1793, + "step": 11082 + }, + { + "epoch": 2.949175093134646, + "grad_norm": 0.3543289005756378, + "learning_rate": 7.342003832319186e-08, + "loss": 0.1919, + "step": 11083 + }, + { + "epoch": 2.94944119212347, + "grad_norm": 0.3719744086265564, + "learning_rate": 7.340375774207497e-08, + "loss": 0.1951, + "step": 11084 + }, + { + "epoch": 2.949707291112294, + "grad_norm": 0.38826894760131836, + "learning_rate": 7.338747791953699e-08, + "loss": 0.177, + "step": 11085 + }, + { + "epoch": 2.9499733901011176, + "grad_norm": 0.4100255072116852, + "learning_rate": 7.33711988560422e-08, + "loss": 0.1735, + "step": 11086 + }, + { + "epoch": 2.9502394890899417, + "grad_norm": 0.30021828413009644, + "learning_rate": 7.335492055205495e-08, + "loss": 0.1864, + "step": 11087 + }, + { + "epoch": 2.9505055880787654, + "grad_norm": 0.2713022530078888, + "learning_rate": 7.333864300803949e-08, + "loss": 0.1658, + "step": 11088 + }, + { + "epoch": 2.950771687067589, + "grad_norm": 0.2841747999191284, + "learning_rate": 7.332236622446014e-08, + "loss": 0.1775, + "step": 11089 + }, + { + "epoch": 2.951037786056413, + "grad_norm": 0.3769529461860657, + "learning_rate": 7.330609020178107e-08, + "loss": 0.182, + "step": 11090 + }, + { + "epoch": 2.951303885045237, + "grad_norm": 0.2561681866645813, + "learning_rate": 7.32898149404666e-08, + "loss": 0.1641, + "step": 11091 + }, + { + "epoch": 2.9515699840340606, + "grad_norm": 0.4381997585296631, + "learning_rate": 7.327354044098083e-08, + "loss": 0.1968, + "step": 11092 + }, + { + "epoch": 2.9518360830228847, + "grad_norm": 0.2673215866088867, + "learning_rate": 7.325726670378806e-08, + "loss": 0.1785, + "step": 11093 + }, + { + "epoch": 2.9521021820117084, + "grad_norm": 0.33481815457344055, + "learning_rate": 7.324099372935235e-08, + "loss": 0.1841, + "step": 11094 + }, + { + "epoch": 2.952368281000532, + "grad_norm": 0.2690214216709137, + "learning_rate": 7.322472151813788e-08, + "loss": 0.166, + "step": 11095 + }, + { + "epoch": 2.952634379989356, + "grad_norm": 0.3921566605567932, + "learning_rate": 7.320845007060875e-08, + "loss": 0.2027, + "step": 11096 + }, + { + "epoch": 2.95290047897818, + "grad_norm": 0.35480839014053345, + "learning_rate": 7.319217938722912e-08, + "loss": 0.1805, + "step": 11097 + }, + { + "epoch": 2.9531665779670035, + "grad_norm": 0.3590143024921417, + "learning_rate": 7.317590946846295e-08, + "loss": 0.176, + "step": 11098 + }, + { + "epoch": 2.9534326769558277, + "grad_norm": 0.2680712640285492, + "learning_rate": 7.315964031477436e-08, + "loss": 0.1695, + "step": 11099 + }, + { + "epoch": 2.9536987759446514, + "grad_norm": 0.40839654207229614, + "learning_rate": 7.31433719266274e-08, + "loss": 0.1979, + "step": 11100 + }, + { + "epoch": 2.953964874933475, + "grad_norm": 0.4544067680835724, + "learning_rate": 7.312710430448601e-08, + "loss": 0.1882, + "step": 11101 + }, + { + "epoch": 2.954230973922299, + "grad_norm": 0.28667426109313965, + "learning_rate": 7.311083744881427e-08, + "loss": 0.1731, + "step": 11102 + }, + { + "epoch": 2.954497072911123, + "grad_norm": 0.40015867352485657, + "learning_rate": 7.309457136007603e-08, + "loss": 0.183, + "step": 11103 + }, + { + "epoch": 2.9547631718999465, + "grad_norm": 0.27972549200057983, + "learning_rate": 7.307830603873533e-08, + "loss": 0.1758, + "step": 11104 + }, + { + "epoch": 2.9550292708887707, + "grad_norm": 0.4022723138332367, + "learning_rate": 7.306204148525602e-08, + "loss": 0.1913, + "step": 11105 + }, + { + "epoch": 2.9552953698775943, + "grad_norm": 0.3025408983230591, + "learning_rate": 7.304577770010204e-08, + "loss": 0.1935, + "step": 11106 + }, + { + "epoch": 2.9555614688664185, + "grad_norm": 0.2887934148311615, + "learning_rate": 7.302951468373723e-08, + "loss": 0.1835, + "step": 11107 + }, + { + "epoch": 2.955827567855242, + "grad_norm": 0.2679644823074341, + "learning_rate": 7.301325243662553e-08, + "loss": 0.1837, + "step": 11108 + }, + { + "epoch": 2.9560936668440663, + "grad_norm": 0.29290589690208435, + "learning_rate": 7.299699095923066e-08, + "loss": 0.1902, + "step": 11109 + }, + { + "epoch": 2.95635976583289, + "grad_norm": 0.27778661251068115, + "learning_rate": 7.298073025201649e-08, + "loss": 0.1864, + "step": 11110 + }, + { + "epoch": 2.9566258648217136, + "grad_norm": 0.3118220567703247, + "learning_rate": 7.29644703154468e-08, + "loss": 0.1899, + "step": 11111 + }, + { + "epoch": 2.9568919638105378, + "grad_norm": 0.3674958050251007, + "learning_rate": 7.294821114998538e-08, + "loss": 0.1632, + "step": 11112 + }, + { + "epoch": 2.9571580627993614, + "grad_norm": 0.43813031911849976, + "learning_rate": 7.293195275609589e-08, + "loss": 0.2124, + "step": 11113 + }, + { + "epoch": 2.957424161788185, + "grad_norm": 0.3400527834892273, + "learning_rate": 7.291569513424216e-08, + "loss": 0.1983, + "step": 11114 + }, + { + "epoch": 2.9576902607770093, + "grad_norm": 0.3890751004219055, + "learning_rate": 7.289943828488783e-08, + "loss": 0.1864, + "step": 11115 + }, + { + "epoch": 2.957956359765833, + "grad_norm": 0.26367855072021484, + "learning_rate": 7.288318220849661e-08, + "loss": 0.1797, + "step": 11116 + }, + { + "epoch": 2.9582224587546566, + "grad_norm": 0.33383363485336304, + "learning_rate": 7.286692690553208e-08, + "loss": 0.1744, + "step": 11117 + }, + { + "epoch": 2.9584885577434807, + "grad_norm": 0.42285528779029846, + "learning_rate": 7.285067237645797e-08, + "loss": 0.1921, + "step": 11118 + }, + { + "epoch": 2.9587546567323044, + "grad_norm": 0.32711490988731384, + "learning_rate": 7.283441862173788e-08, + "loss": 0.1649, + "step": 11119 + }, + { + "epoch": 2.959020755721128, + "grad_norm": 0.39997658133506775, + "learning_rate": 7.281816564183534e-08, + "loss": 0.2015, + "step": 11120 + }, + { + "epoch": 2.9592868547099522, + "grad_norm": 0.2630200684070587, + "learning_rate": 7.280191343721396e-08, + "loss": 0.1713, + "step": 11121 + }, + { + "epoch": 2.959552953698776, + "grad_norm": 0.35118457674980164, + "learning_rate": 7.278566200833727e-08, + "loss": 0.202, + "step": 11122 + }, + { + "epoch": 2.9598190526875996, + "grad_norm": 0.28241240978240967, + "learning_rate": 7.276941135566883e-08, + "loss": 0.1831, + "step": 11123 + }, + { + "epoch": 2.9600851516764237, + "grad_norm": 0.27326419949531555, + "learning_rate": 7.275316147967208e-08, + "loss": 0.193, + "step": 11124 + }, + { + "epoch": 2.9603512506652474, + "grad_norm": 0.42993998527526855, + "learning_rate": 7.273691238081055e-08, + "loss": 0.2167, + "step": 11125 + }, + { + "epoch": 2.960617349654071, + "grad_norm": 0.2655320465564728, + "learning_rate": 7.272066405954767e-08, + "loss": 0.1659, + "step": 11126 + }, + { + "epoch": 2.960883448642895, + "grad_norm": 0.33142390847206116, + "learning_rate": 7.270441651634692e-08, + "loss": 0.1714, + "step": 11127 + }, + { + "epoch": 2.961149547631719, + "grad_norm": 0.26440131664276123, + "learning_rate": 7.268816975167164e-08, + "loss": 0.1563, + "step": 11128 + }, + { + "epoch": 2.9614156466205426, + "grad_norm": 0.2518135607242584, + "learning_rate": 7.267192376598528e-08, + "loss": 0.1614, + "step": 11129 + }, + { + "epoch": 2.9616817456093667, + "grad_norm": 0.40051397681236267, + "learning_rate": 7.265567855975116e-08, + "loss": 0.1733, + "step": 11130 + }, + { + "epoch": 2.9619478445981904, + "grad_norm": 0.2863367795944214, + "learning_rate": 7.263943413343267e-08, + "loss": 0.18, + "step": 11131 + }, + { + "epoch": 2.9622139435870145, + "grad_norm": 0.3106626570224762, + "learning_rate": 7.26231904874931e-08, + "loss": 0.1856, + "step": 11132 + }, + { + "epoch": 2.962480042575838, + "grad_norm": 1.5991548299789429, + "learning_rate": 7.260694762239582e-08, + "loss": 0.1939, + "step": 11133 + }, + { + "epoch": 2.9627461415646623, + "grad_norm": 0.33037546277046204, + "learning_rate": 7.259070553860401e-08, + "loss": 0.1985, + "step": 11134 + }, + { + "epoch": 2.963012240553486, + "grad_norm": 0.2824364900588989, + "learning_rate": 7.257446423658097e-08, + "loss": 0.1911, + "step": 11135 + }, + { + "epoch": 2.9632783395423097, + "grad_norm": 0.286933034658432, + "learning_rate": 7.255822371679e-08, + "loss": 0.1906, + "step": 11136 + }, + { + "epoch": 2.963544438531134, + "grad_norm": 0.43505892157554626, + "learning_rate": 7.254198397969418e-08, + "loss": 0.1893, + "step": 11137 + }, + { + "epoch": 2.9638105375199575, + "grad_norm": 0.27578631043434143, + "learning_rate": 7.252574502575685e-08, + "loss": 0.1795, + "step": 11138 + }, + { + "epoch": 2.964076636508781, + "grad_norm": 0.2920863926410675, + "learning_rate": 7.250950685544105e-08, + "loss": 0.1712, + "step": 11139 + }, + { + "epoch": 2.9643427354976053, + "grad_norm": 0.3798171877861023, + "learning_rate": 7.249326946920999e-08, + "loss": 0.1992, + "step": 11140 + }, + { + "epoch": 2.964608834486429, + "grad_norm": 0.2635485827922821, + "learning_rate": 7.247703286752678e-08, + "loss": 0.1705, + "step": 11141 + }, + { + "epoch": 2.9648749334752527, + "grad_norm": 0.29520469903945923, + "learning_rate": 7.246079705085456e-08, + "loss": 0.1963, + "step": 11142 + }, + { + "epoch": 2.965141032464077, + "grad_norm": 0.34290921688079834, + "learning_rate": 7.24445620196563e-08, + "loss": 0.1815, + "step": 11143 + }, + { + "epoch": 2.9654071314529005, + "grad_norm": 0.47195255756378174, + "learning_rate": 7.242832777439519e-08, + "loss": 0.1964, + "step": 11144 + }, + { + "epoch": 2.965673230441724, + "grad_norm": 0.2590904235839844, + "learning_rate": 7.241209431553417e-08, + "loss": 0.1665, + "step": 11145 + }, + { + "epoch": 2.9659393294305483, + "grad_norm": 0.29148411750793457, + "learning_rate": 7.239586164353628e-08, + "loss": 0.1807, + "step": 11146 + }, + { + "epoch": 2.966205428419372, + "grad_norm": 0.27417224645614624, + "learning_rate": 7.23796297588645e-08, + "loss": 0.1784, + "step": 11147 + }, + { + "epoch": 2.9664715274081956, + "grad_norm": 0.3327831029891968, + "learning_rate": 7.236339866198184e-08, + "loss": 0.1938, + "step": 11148 + }, + { + "epoch": 2.9667376263970198, + "grad_norm": 0.27578088641166687, + "learning_rate": 7.23471683533512e-08, + "loss": 0.181, + "step": 11149 + }, + { + "epoch": 2.9670037253858434, + "grad_norm": 0.3271535336971283, + "learning_rate": 7.23309388334355e-08, + "loss": 0.1911, + "step": 11150 + }, + { + "epoch": 2.967269824374667, + "grad_norm": 0.26412901282310486, + "learning_rate": 7.231471010269764e-08, + "loss": 0.1637, + "step": 11151 + }, + { + "epoch": 2.9675359233634913, + "grad_norm": 0.3147240877151489, + "learning_rate": 7.229848216160055e-08, + "loss": 0.1707, + "step": 11152 + }, + { + "epoch": 2.967802022352315, + "grad_norm": 0.33888378739356995, + "learning_rate": 7.2282255010607e-08, + "loss": 0.1819, + "step": 11153 + }, + { + "epoch": 2.968068121341139, + "grad_norm": 0.4302504360675812, + "learning_rate": 7.226602865017983e-08, + "loss": 0.1919, + "step": 11154 + }, + { + "epoch": 2.9683342203299627, + "grad_norm": 0.37139469385147095, + "learning_rate": 7.224980308078197e-08, + "loss": 0.1847, + "step": 11155 + }, + { + "epoch": 2.9686003193187864, + "grad_norm": 0.2596697509288788, + "learning_rate": 7.223357830287605e-08, + "loss": 0.174, + "step": 11156 + }, + { + "epoch": 2.9688664183076106, + "grad_norm": 0.26248791813850403, + "learning_rate": 7.221735431692493e-08, + "loss": 0.1612, + "step": 11157 + }, + { + "epoch": 2.9691325172964342, + "grad_norm": 0.34463047981262207, + "learning_rate": 7.22011311233913e-08, + "loss": 0.1836, + "step": 11158 + }, + { + "epoch": 2.9693986162852584, + "grad_norm": 0.26263153553009033, + "learning_rate": 7.218490872273795e-08, + "loss": 0.1717, + "step": 11159 + }, + { + "epoch": 2.969664715274082, + "grad_norm": 0.27347949147224426, + "learning_rate": 7.216868711542747e-08, + "loss": 0.1838, + "step": 11160 + }, + { + "epoch": 2.9699308142629057, + "grad_norm": 0.2805301547050476, + "learning_rate": 7.215246630192262e-08, + "loss": 0.1817, + "step": 11161 + }, + { + "epoch": 2.97019691325173, + "grad_norm": 0.38960129022598267, + "learning_rate": 7.2136246282686e-08, + "loss": 0.1804, + "step": 11162 + }, + { + "epoch": 2.9704630122405535, + "grad_norm": 0.42130047082901, + "learning_rate": 7.21200270581803e-08, + "loss": 0.2097, + "step": 11163 + }, + { + "epoch": 2.970729111229377, + "grad_norm": 0.3739185333251953, + "learning_rate": 7.210380862886804e-08, + "loss": 0.1901, + "step": 11164 + }, + { + "epoch": 2.9709952102182013, + "grad_norm": 0.3032660484313965, + "learning_rate": 7.208759099521187e-08, + "loss": 0.1806, + "step": 11165 + }, + { + "epoch": 2.971261309207025, + "grad_norm": 0.26638278365135193, + "learning_rate": 7.207137415767429e-08, + "loss": 0.1681, + "step": 11166 + }, + { + "epoch": 2.9715274081958487, + "grad_norm": 0.27865979075431824, + "learning_rate": 7.205515811671793e-08, + "loss": 0.1891, + "step": 11167 + }, + { + "epoch": 2.971793507184673, + "grad_norm": 0.24183253943920135, + "learning_rate": 7.203894287280517e-08, + "loss": 0.157, + "step": 11168 + }, + { + "epoch": 2.9720596061734965, + "grad_norm": 0.32912278175354004, + "learning_rate": 7.202272842639866e-08, + "loss": 0.1721, + "step": 11169 + }, + { + "epoch": 2.97232570516232, + "grad_norm": 0.35488542914390564, + "learning_rate": 7.200651477796073e-08, + "loss": 0.1916, + "step": 11170 + }, + { + "epoch": 2.9725918041511443, + "grad_norm": 0.28952157497406006, + "learning_rate": 7.199030192795393e-08, + "loss": 0.1742, + "step": 11171 + }, + { + "epoch": 2.972857903139968, + "grad_norm": 0.28900468349456787, + "learning_rate": 7.197408987684058e-08, + "loss": 0.1834, + "step": 11172 + }, + { + "epoch": 2.9731240021287917, + "grad_norm": 0.348275363445282, + "learning_rate": 7.195787862508315e-08, + "loss": 0.1799, + "step": 11173 + }, + { + "epoch": 2.973390101117616, + "grad_norm": 0.350002259016037, + "learning_rate": 7.194166817314407e-08, + "loss": 0.1722, + "step": 11174 + }, + { + "epoch": 2.9736562001064395, + "grad_norm": 0.30955970287323, + "learning_rate": 7.192545852148557e-08, + "loss": 0.1964, + "step": 11175 + }, + { + "epoch": 2.973922299095263, + "grad_norm": 0.4223420023918152, + "learning_rate": 7.190924967057006e-08, + "loss": 0.1903, + "step": 11176 + }, + { + "epoch": 2.9741883980840873, + "grad_norm": 0.2857452929019928, + "learning_rate": 7.189304162085983e-08, + "loss": 0.187, + "step": 11177 + }, + { + "epoch": 2.974454497072911, + "grad_norm": 0.2501344084739685, + "learning_rate": 7.187683437281719e-08, + "loss": 0.1543, + "step": 11178 + }, + { + "epoch": 2.974720596061735, + "grad_norm": 0.36208412051200867, + "learning_rate": 7.186062792690435e-08, + "loss": 0.1867, + "step": 11179 + }, + { + "epoch": 2.974986695050559, + "grad_norm": 0.29509371519088745, + "learning_rate": 7.18444222835836e-08, + "loss": 0.1711, + "step": 11180 + }, + { + "epoch": 2.975252794039383, + "grad_norm": 0.2793835699558258, + "learning_rate": 7.182821744331713e-08, + "loss": 0.1827, + "step": 11181 + }, + { + "epoch": 2.9755188930282066, + "grad_norm": 0.26888537406921387, + "learning_rate": 7.181201340656716e-08, + "loss": 0.1823, + "step": 11182 + }, + { + "epoch": 2.9757849920170303, + "grad_norm": 0.3920501470565796, + "learning_rate": 7.179581017379584e-08, + "loss": 0.2013, + "step": 11183 + }, + { + "epoch": 2.9760510910058544, + "grad_norm": 0.46277448534965515, + "learning_rate": 7.177960774546535e-08, + "loss": 0.2032, + "step": 11184 + }, + { + "epoch": 2.976317189994678, + "grad_norm": 0.28011342883110046, + "learning_rate": 7.176340612203776e-08, + "loss": 0.201, + "step": 11185 + }, + { + "epoch": 2.9765832889835018, + "grad_norm": 0.468174546957016, + "learning_rate": 7.174720530397523e-08, + "loss": 0.1883, + "step": 11186 + }, + { + "epoch": 2.976849387972326, + "grad_norm": 0.2816665768623352, + "learning_rate": 7.17310052917398e-08, + "loss": 0.1746, + "step": 11187 + }, + { + "epoch": 2.9771154869611496, + "grad_norm": 0.29564306139945984, + "learning_rate": 7.171480608579357e-08, + "loss": 0.1801, + "step": 11188 + }, + { + "epoch": 2.9773815859499733, + "grad_norm": 0.2888266444206238, + "learning_rate": 7.169860768659852e-08, + "loss": 0.173, + "step": 11189 + }, + { + "epoch": 2.9776476849387974, + "grad_norm": 0.2621660828590393, + "learning_rate": 7.16824100946167e-08, + "loss": 0.1693, + "step": 11190 + }, + { + "epoch": 2.977913783927621, + "grad_norm": 0.3463725745677948, + "learning_rate": 7.166621331031009e-08, + "loss": 0.1826, + "step": 11191 + }, + { + "epoch": 2.9781798829164448, + "grad_norm": 0.26775696873664856, + "learning_rate": 7.165001733414065e-08, + "loss": 0.1821, + "step": 11192 + }, + { + "epoch": 2.978445981905269, + "grad_norm": 0.2795574963092804, + "learning_rate": 7.163382216657032e-08, + "loss": 0.1826, + "step": 11193 + }, + { + "epoch": 2.9787120808940926, + "grad_norm": 0.2781037390232086, + "learning_rate": 7.161762780806102e-08, + "loss": 0.1761, + "step": 11194 + }, + { + "epoch": 2.9789781798829162, + "grad_norm": 0.562252402305603, + "learning_rate": 7.16014342590747e-08, + "loss": 0.201, + "step": 11195 + }, + { + "epoch": 2.9792442788717404, + "grad_norm": 0.25765106081962585, + "learning_rate": 7.158524152007314e-08, + "loss": 0.1726, + "step": 11196 + }, + { + "epoch": 2.979510377860564, + "grad_norm": 0.2643512189388275, + "learning_rate": 7.156904959151824e-08, + "loss": 0.1788, + "step": 11197 + }, + { + "epoch": 2.9797764768493877, + "grad_norm": 0.2698691189289093, + "learning_rate": 7.155285847387181e-08, + "loss": 0.1642, + "step": 11198 + }, + { + "epoch": 2.980042575838212, + "grad_norm": 0.3090139329433441, + "learning_rate": 7.15366681675957e-08, + "loss": 0.1865, + "step": 11199 + }, + { + "epoch": 2.9803086748270355, + "grad_norm": 0.2659570574760437, + "learning_rate": 7.152047867315162e-08, + "loss": 0.1758, + "step": 11200 + }, + { + "epoch": 2.9805747738158592, + "grad_norm": 0.282043993473053, + "learning_rate": 7.150428999100138e-08, + "loss": 0.1948, + "step": 11201 + }, + { + "epoch": 2.9808408728046834, + "grad_norm": 0.45840322971343994, + "learning_rate": 7.148810212160668e-08, + "loss": 0.1855, + "step": 11202 + }, + { + "epoch": 2.981106971793507, + "grad_norm": 0.37466052174568176, + "learning_rate": 7.147191506542926e-08, + "loss": 0.1933, + "step": 11203 + }, + { + "epoch": 2.981373070782331, + "grad_norm": 0.37121185660362244, + "learning_rate": 7.145572882293078e-08, + "loss": 0.173, + "step": 11204 + }, + { + "epoch": 2.981639169771155, + "grad_norm": 0.2752670347690582, + "learning_rate": 7.143954339457294e-08, + "loss": 0.1748, + "step": 11205 + }, + { + "epoch": 2.981905268759979, + "grad_norm": 0.26805299520492554, + "learning_rate": 7.142335878081733e-08, + "loss": 0.1623, + "step": 11206 + }, + { + "epoch": 2.9821713677488026, + "grad_norm": 0.3369075357913971, + "learning_rate": 7.140717498212562e-08, + "loss": 0.1801, + "step": 11207 + }, + { + "epoch": 2.9824374667376263, + "grad_norm": 0.5763930678367615, + "learning_rate": 7.139099199895936e-08, + "loss": 0.1838, + "step": 11208 + }, + { + "epoch": 2.9827035657264505, + "grad_norm": 0.25740402936935425, + "learning_rate": 7.137480983178014e-08, + "loss": 0.1684, + "step": 11209 + }, + { + "epoch": 2.982969664715274, + "grad_norm": 0.3488497734069824, + "learning_rate": 7.135862848104957e-08, + "loss": 0.1984, + "step": 11210 + }, + { + "epoch": 2.983235763704098, + "grad_norm": 0.3313016891479492, + "learning_rate": 7.134244794722907e-08, + "loss": 0.2009, + "step": 11211 + }, + { + "epoch": 2.983501862692922, + "grad_norm": 0.27849042415618896, + "learning_rate": 7.132626823078021e-08, + "loss": 0.1885, + "step": 11212 + }, + { + "epoch": 2.9837679616817456, + "grad_norm": 0.2651318311691284, + "learning_rate": 7.131008933216442e-08, + "loss": 0.1673, + "step": 11213 + }, + { + "epoch": 2.9840340606705693, + "grad_norm": 0.365951806306839, + "learning_rate": 7.129391125184324e-08, + "loss": 0.1827, + "step": 11214 + }, + { + "epoch": 2.9843001596593934, + "grad_norm": 0.3850024938583374, + "learning_rate": 7.127773399027799e-08, + "loss": 0.1893, + "step": 11215 + }, + { + "epoch": 2.984566258648217, + "grad_norm": 0.3443423807621002, + "learning_rate": 7.126155754793016e-08, + "loss": 0.1731, + "step": 11216 + }, + { + "epoch": 2.984832357637041, + "grad_norm": 0.27683719992637634, + "learning_rate": 7.124538192526108e-08, + "loss": 0.1776, + "step": 11217 + }, + { + "epoch": 2.985098456625865, + "grad_norm": 0.262460857629776, + "learning_rate": 7.122920712273217e-08, + "loss": 0.1736, + "step": 11218 + }, + { + "epoch": 2.9853645556146886, + "grad_norm": 0.30477091670036316, + "learning_rate": 7.12130331408047e-08, + "loss": 0.1737, + "step": 11219 + }, + { + "epoch": 2.9856306546035123, + "grad_norm": 0.4678972065448761, + "learning_rate": 7.119685997994007e-08, + "loss": 0.1849, + "step": 11220 + }, + { + "epoch": 2.9858967535923364, + "grad_norm": 0.3059116005897522, + "learning_rate": 7.11806876405995e-08, + "loss": 0.1774, + "step": 11221 + }, + { + "epoch": 2.98616285258116, + "grad_norm": 0.31627559661865234, + "learning_rate": 7.116451612324428e-08, + "loss": 0.1799, + "step": 11222 + }, + { + "epoch": 2.986428951569984, + "grad_norm": 0.3056657314300537, + "learning_rate": 7.114834542833566e-08, + "loss": 0.1846, + "step": 11223 + }, + { + "epoch": 2.986695050558808, + "grad_norm": 0.41888558864593506, + "learning_rate": 7.113217555633488e-08, + "loss": 0.1883, + "step": 11224 + }, + { + "epoch": 2.9869611495476316, + "grad_norm": 0.26553934812545776, + "learning_rate": 7.111600650770309e-08, + "loss": 0.1812, + "step": 11225 + }, + { + "epoch": 2.9872272485364557, + "grad_norm": 0.3482203185558319, + "learning_rate": 7.109983828290146e-08, + "loss": 0.1671, + "step": 11226 + }, + { + "epoch": 2.9874933475252794, + "grad_norm": 0.2780846655368805, + "learning_rate": 7.108367088239122e-08, + "loss": 0.1733, + "step": 11227 + }, + { + "epoch": 2.9877594465141035, + "grad_norm": 0.33624786138534546, + "learning_rate": 7.106750430663342e-08, + "loss": 0.1579, + "step": 11228 + }, + { + "epoch": 2.988025545502927, + "grad_norm": 0.41141465306282043, + "learning_rate": 7.105133855608923e-08, + "loss": 0.1939, + "step": 11229 + }, + { + "epoch": 2.988291644491751, + "grad_norm": 0.29538610577583313, + "learning_rate": 7.103517363121965e-08, + "loss": 0.1795, + "step": 11230 + }, + { + "epoch": 2.988557743480575, + "grad_norm": 0.28935539722442627, + "learning_rate": 7.101900953248578e-08, + "loss": 0.1796, + "step": 11231 + }, + { + "epoch": 2.9888238424693987, + "grad_norm": 0.34202128648757935, + "learning_rate": 7.100284626034865e-08, + "loss": 0.1815, + "step": 11232 + }, + { + "epoch": 2.9890899414582224, + "grad_norm": 0.43702423572540283, + "learning_rate": 7.098668381526929e-08, + "loss": 0.1869, + "step": 11233 + }, + { + "epoch": 2.9893560404470465, + "grad_norm": 0.2828308641910553, + "learning_rate": 7.097052219770865e-08, + "loss": 0.1711, + "step": 11234 + }, + { + "epoch": 2.98962213943587, + "grad_norm": 0.2792584300041199, + "learning_rate": 7.095436140812774e-08, + "loss": 0.174, + "step": 11235 + }, + { + "epoch": 2.989888238424694, + "grad_norm": 0.27412715554237366, + "learning_rate": 7.093820144698742e-08, + "loss": 0.172, + "step": 11236 + }, + { + "epoch": 2.990154337413518, + "grad_norm": 0.25905299186706543, + "learning_rate": 7.092204231474868e-08, + "loss": 0.1735, + "step": 11237 + }, + { + "epoch": 2.9904204364023417, + "grad_norm": 0.3966778814792633, + "learning_rate": 7.090588401187236e-08, + "loss": 0.1956, + "step": 11238 + }, + { + "epoch": 2.9906865353911654, + "grad_norm": 0.26800256967544556, + "learning_rate": 7.08897265388194e-08, + "loss": 0.1803, + "step": 11239 + }, + { + "epoch": 2.9909526343799895, + "grad_norm": 0.2747739553451538, + "learning_rate": 7.087356989605053e-08, + "loss": 0.1762, + "step": 11240 + }, + { + "epoch": 2.991218733368813, + "grad_norm": 0.41953539848327637, + "learning_rate": 7.085741408402666e-08, + "loss": 0.1853, + "step": 11241 + }, + { + "epoch": 2.991484832357637, + "grad_norm": 0.31110668182373047, + "learning_rate": 7.084125910320855e-08, + "loss": 0.1866, + "step": 11242 + }, + { + "epoch": 2.991750931346461, + "grad_norm": 0.2881096601486206, + "learning_rate": 7.082510495405702e-08, + "loss": 0.1876, + "step": 11243 + }, + { + "epoch": 2.9920170303352847, + "grad_norm": 0.2711055874824524, + "learning_rate": 7.080895163703273e-08, + "loss": 0.1599, + "step": 11244 + }, + { + "epoch": 2.9922831293241083, + "grad_norm": 0.35942399501800537, + "learning_rate": 7.079279915259648e-08, + "loss": 0.1962, + "step": 11245 + }, + { + "epoch": 2.9925492283129325, + "grad_norm": 0.2733455002307892, + "learning_rate": 7.077664750120898e-08, + "loss": 0.1806, + "step": 11246 + }, + { + "epoch": 2.992815327301756, + "grad_norm": 0.2727212905883789, + "learning_rate": 7.076049668333084e-08, + "loss": 0.1867, + "step": 11247 + }, + { + "epoch": 2.99308142629058, + "grad_norm": 0.2576775848865509, + "learning_rate": 7.074434669942278e-08, + "loss": 0.1628, + "step": 11248 + }, + { + "epoch": 2.993347525279404, + "grad_norm": 0.38098132610321045, + "learning_rate": 7.072819754994536e-08, + "loss": 0.1916, + "step": 11249 + }, + { + "epoch": 2.9936136242682276, + "grad_norm": 0.26022985577583313, + "learning_rate": 7.07120492353593e-08, + "loss": 0.1657, + "step": 11250 + }, + { + "epoch": 2.9938797232570518, + "grad_norm": 0.2927623391151428, + "learning_rate": 7.069590175612505e-08, + "loss": 0.1811, + "step": 11251 + }, + { + "epoch": 2.9941458222458754, + "grad_norm": 0.38146623969078064, + "learning_rate": 7.067975511270326e-08, + "loss": 0.1781, + "step": 11252 + }, + { + "epoch": 2.9944119212346996, + "grad_norm": 0.34248432517051697, + "learning_rate": 7.066360930555442e-08, + "loss": 0.1859, + "step": 11253 + }, + { + "epoch": 2.9946780202235233, + "grad_norm": 0.284010648727417, + "learning_rate": 7.064746433513908e-08, + "loss": 0.1719, + "step": 11254 + }, + { + "epoch": 2.994944119212347, + "grad_norm": 0.3069957494735718, + "learning_rate": 7.063132020191769e-08, + "loss": 0.1887, + "step": 11255 + }, + { + "epoch": 2.995210218201171, + "grad_norm": 0.28541189432144165, + "learning_rate": 7.061517690635072e-08, + "loss": 0.18, + "step": 11256 + }, + { + "epoch": 2.9954763171899947, + "grad_norm": 0.2975207269191742, + "learning_rate": 7.059903444889862e-08, + "loss": 0.1779, + "step": 11257 + }, + { + "epoch": 2.9957424161788184, + "grad_norm": 0.2820189297199249, + "learning_rate": 7.058289283002181e-08, + "loss": 0.183, + "step": 11258 + }, + { + "epoch": 2.9960085151676425, + "grad_norm": 0.3721234202384949, + "learning_rate": 7.056675205018065e-08, + "loss": 0.2098, + "step": 11259 + }, + { + "epoch": 2.9962746141564662, + "grad_norm": 0.37823817133903503, + "learning_rate": 7.05506121098356e-08, + "loss": 0.1617, + "step": 11260 + }, + { + "epoch": 2.99654071314529, + "grad_norm": 0.278285950422287, + "learning_rate": 7.053447300944688e-08, + "loss": 0.1827, + "step": 11261 + }, + { + "epoch": 2.996806812134114, + "grad_norm": 0.4051964581012726, + "learning_rate": 7.051833474947489e-08, + "loss": 0.189, + "step": 11262 + }, + { + "epoch": 2.9970729111229377, + "grad_norm": 0.27948060631752014, + "learning_rate": 7.050219733037986e-08, + "loss": 0.1737, + "step": 11263 + }, + { + "epoch": 2.9973390101117614, + "grad_norm": 0.3940369784832001, + "learning_rate": 7.048606075262212e-08, + "loss": 0.2048, + "step": 11264 + }, + { + "epoch": 2.9976051091005855, + "grad_norm": 0.30806389451026917, + "learning_rate": 7.046992501666195e-08, + "loss": 0.1898, + "step": 11265 + }, + { + "epoch": 2.997871208089409, + "grad_norm": 0.33609458804130554, + "learning_rate": 7.045379012295948e-08, + "loss": 0.1822, + "step": 11266 + }, + { + "epoch": 2.998137307078233, + "grad_norm": 0.3546127676963806, + "learning_rate": 7.043765607197498e-08, + "loss": 0.1913, + "step": 11267 + }, + { + "epoch": 2.998403406067057, + "grad_norm": 0.27766481041908264, + "learning_rate": 7.042152286416858e-08, + "loss": 0.1839, + "step": 11268 + }, + { + "epoch": 2.9986695050558807, + "grad_norm": 0.2669111490249634, + "learning_rate": 7.040539050000049e-08, + "loss": 0.1767, + "step": 11269 + }, + { + "epoch": 2.9989356040447044, + "grad_norm": 0.261749267578125, + "learning_rate": 7.038925897993074e-08, + "loss": 0.1639, + "step": 11270 + }, + { + "epoch": 2.9992017030335285, + "grad_norm": 0.4483630657196045, + "learning_rate": 7.037312830441957e-08, + "loss": 0.1685, + "step": 11271 + }, + { + "epoch": 2.999467802022352, + "grad_norm": 0.2539655864238739, + "learning_rate": 7.035699847392693e-08, + "loss": 0.1758, + "step": 11272 + }, + { + "epoch": 2.9997339010111763, + "grad_norm": 0.3535880744457245, + "learning_rate": 7.034086948891296e-08, + "loss": 0.1838, + "step": 11273 + }, + { + "epoch": 3.0, + "grad_norm": 0.2708391845226288, + "learning_rate": 7.032474134983763e-08, + "loss": 0.1726, + "step": 11274 + }, + { + "epoch": 3.0002660989888237, + "grad_norm": 0.2753804624080658, + "learning_rate": 7.030861405716104e-08, + "loss": 0.1612, + "step": 11275 + }, + { + "epoch": 3.000532197977648, + "grad_norm": 0.27784720063209534, + "learning_rate": 7.029248761134306e-08, + "loss": 0.1738, + "step": 11276 + }, + { + "epoch": 3.0007982969664715, + "grad_norm": 0.36792975664138794, + "learning_rate": 7.027636201284372e-08, + "loss": 0.194, + "step": 11277 + }, + { + "epoch": 3.001064395955295, + "grad_norm": 0.33022958040237427, + "learning_rate": 7.026023726212293e-08, + "loss": 0.193, + "step": 11278 + }, + { + "epoch": 3.0013304949441193, + "grad_norm": 0.25785282254219055, + "learning_rate": 7.024411335964065e-08, + "loss": 0.1555, + "step": 11279 + }, + { + "epoch": 3.001596593932943, + "grad_norm": 0.28061679005622864, + "learning_rate": 7.022799030585666e-08, + "loss": 0.1722, + "step": 11280 + }, + { + "epoch": 3.001862692921767, + "grad_norm": 0.3191794157028198, + "learning_rate": 7.021186810123087e-08, + "loss": 0.1798, + "step": 11281 + }, + { + "epoch": 3.002128791910591, + "grad_norm": 0.2901633083820343, + "learning_rate": 7.019574674622322e-08, + "loss": 0.1813, + "step": 11282 + }, + { + "epoch": 3.0023948908994145, + "grad_norm": 0.384689062833786, + "learning_rate": 7.017962624129337e-08, + "loss": 0.1696, + "step": 11283 + }, + { + "epoch": 3.0026609898882386, + "grad_norm": 0.408191978931427, + "learning_rate": 7.016350658690121e-08, + "loss": 0.1882, + "step": 11284 + }, + { + "epoch": 3.0029270888770623, + "grad_norm": 0.34784895181655884, + "learning_rate": 7.014738778350646e-08, + "loss": 0.1816, + "step": 11285 + }, + { + "epoch": 3.003193187865886, + "grad_norm": 0.4094961881637573, + "learning_rate": 7.01312698315689e-08, + "loss": 0.1888, + "step": 11286 + }, + { + "epoch": 3.00345928685471, + "grad_norm": 0.26420092582702637, + "learning_rate": 7.011515273154818e-08, + "loss": 0.1591, + "step": 11287 + }, + { + "epoch": 3.0037253858435338, + "grad_norm": 0.2946520447731018, + "learning_rate": 7.009903648390406e-08, + "loss": 0.1977, + "step": 11288 + }, + { + "epoch": 3.0039914848323575, + "grad_norm": 0.34914660453796387, + "learning_rate": 7.008292108909616e-08, + "loss": 0.1807, + "step": 11289 + }, + { + "epoch": 3.0042575838211816, + "grad_norm": 0.2555616497993469, + "learning_rate": 7.00668065475842e-08, + "loss": 0.166, + "step": 11290 + }, + { + "epoch": 3.0045236828100053, + "grad_norm": 0.2661423981189728, + "learning_rate": 7.005069285982768e-08, + "loss": 0.1537, + "step": 11291 + }, + { + "epoch": 3.0047897817988294, + "grad_norm": 0.29859569668769836, + "learning_rate": 7.003458002628628e-08, + "loss": 0.167, + "step": 11292 + }, + { + "epoch": 3.005055880787653, + "grad_norm": 0.2736766040325165, + "learning_rate": 7.001846804741954e-08, + "loss": 0.1793, + "step": 11293 + }, + { + "epoch": 3.0053219797764767, + "grad_norm": 0.30332431197166443, + "learning_rate": 7.000235692368706e-08, + "loss": 0.1703, + "step": 11294 + }, + { + "epoch": 3.005588078765301, + "grad_norm": 0.2923865020275116, + "learning_rate": 6.998624665554825e-08, + "loss": 0.1725, + "step": 11295 + }, + { + "epoch": 3.0058541777541246, + "grad_norm": 0.4212912917137146, + "learning_rate": 6.997013724346275e-08, + "loss": 0.198, + "step": 11296 + }, + { + "epoch": 3.0061202767429482, + "grad_norm": 0.2746681571006775, + "learning_rate": 6.995402868788991e-08, + "loss": 0.1833, + "step": 11297 + }, + { + "epoch": 3.0063863757317724, + "grad_norm": 0.6553515195846558, + "learning_rate": 6.993792098928923e-08, + "loss": 0.1924, + "step": 11298 + }, + { + "epoch": 3.006652474720596, + "grad_norm": 0.29433712363243103, + "learning_rate": 6.992181414812014e-08, + "loss": 0.1692, + "step": 11299 + }, + { + "epoch": 3.0069185737094197, + "grad_norm": 0.4065908193588257, + "learning_rate": 6.990570816484203e-08, + "loss": 0.1844, + "step": 11300 + }, + { + "epoch": 3.007184672698244, + "grad_norm": 0.3829757273197174, + "learning_rate": 6.988960303991431e-08, + "loss": 0.1813, + "step": 11301 + }, + { + "epoch": 3.0074507716870675, + "grad_norm": 0.27418074011802673, + "learning_rate": 6.987349877379624e-08, + "loss": 0.1642, + "step": 11302 + }, + { + "epoch": 3.007716870675891, + "grad_norm": 0.2683059275150299, + "learning_rate": 6.985739536694726e-08, + "loss": 0.1737, + "step": 11303 + }, + { + "epoch": 3.0079829696647153, + "grad_norm": 0.2910446226596832, + "learning_rate": 6.984129281982658e-08, + "loss": 0.1691, + "step": 11304 + }, + { + "epoch": 3.008249068653539, + "grad_norm": 0.2534378170967102, + "learning_rate": 6.982519113289356e-08, + "loss": 0.1713, + "step": 11305 + }, + { + "epoch": 3.008515167642363, + "grad_norm": 0.2675822675228119, + "learning_rate": 6.980909030660735e-08, + "loss": 0.1826, + "step": 11306 + }, + { + "epoch": 3.008781266631187, + "grad_norm": 0.29095640778541565, + "learning_rate": 6.979299034142727e-08, + "loss": 0.1938, + "step": 11307 + }, + { + "epoch": 3.0090473656200105, + "grad_norm": 0.28988024592399597, + "learning_rate": 6.977689123781247e-08, + "loss": 0.1726, + "step": 11308 + }, + { + "epoch": 3.0093134646088346, + "grad_norm": 0.2698650658130646, + "learning_rate": 6.976079299622217e-08, + "loss": 0.1677, + "step": 11309 + }, + { + "epoch": 3.0095795635976583, + "grad_norm": 0.2867286205291748, + "learning_rate": 6.974469561711546e-08, + "loss": 0.1747, + "step": 11310 + }, + { + "epoch": 3.009845662586482, + "grad_norm": 0.40488916635513306, + "learning_rate": 6.972859910095159e-08, + "loss": 0.1954, + "step": 11311 + }, + { + "epoch": 3.010111761575306, + "grad_norm": 0.30341219902038574, + "learning_rate": 6.971250344818952e-08, + "loss": 0.1983, + "step": 11312 + }, + { + "epoch": 3.01037786056413, + "grad_norm": 0.3058733344078064, + "learning_rate": 6.969640865928843e-08, + "loss": 0.1861, + "step": 11313 + }, + { + "epoch": 3.0106439595529535, + "grad_norm": 0.27179592847824097, + "learning_rate": 6.968031473470733e-08, + "loss": 0.1839, + "step": 11314 + }, + { + "epoch": 3.0109100585417776, + "grad_norm": 0.3529525697231293, + "learning_rate": 6.966422167490529e-08, + "loss": 0.1911, + "step": 11315 + }, + { + "epoch": 3.0111761575306013, + "grad_norm": 0.34464067220687866, + "learning_rate": 6.964812948034126e-08, + "loss": 0.1764, + "step": 11316 + }, + { + "epoch": 3.0114422565194254, + "grad_norm": 0.37724944949150085, + "learning_rate": 6.963203815147427e-08, + "loss": 0.1906, + "step": 11317 + }, + { + "epoch": 3.011708355508249, + "grad_norm": 0.2623693346977234, + "learning_rate": 6.961594768876325e-08, + "loss": 0.1671, + "step": 11318 + }, + { + "epoch": 3.011974454497073, + "grad_norm": 0.4770945608615875, + "learning_rate": 6.959985809266715e-08, + "loss": 0.196, + "step": 11319 + }, + { + "epoch": 3.012240553485897, + "grad_norm": 0.3858098089694977, + "learning_rate": 6.958376936364488e-08, + "loss": 0.1878, + "step": 11320 + }, + { + "epoch": 3.0125066524747206, + "grad_norm": 0.2685815691947937, + "learning_rate": 6.95676815021553e-08, + "loss": 0.1668, + "step": 11321 + }, + { + "epoch": 3.0127727514635443, + "grad_norm": 0.2916623055934906, + "learning_rate": 6.955159450865734e-08, + "loss": 0.1704, + "step": 11322 + }, + { + "epoch": 3.0130388504523684, + "grad_norm": 0.27865299582481384, + "learning_rate": 6.953550838360973e-08, + "loss": 0.168, + "step": 11323 + }, + { + "epoch": 3.013304949441192, + "grad_norm": 0.2844417989253998, + "learning_rate": 6.951942312747134e-08, + "loss": 0.1823, + "step": 11324 + }, + { + "epoch": 3.0135710484300158, + "grad_norm": 0.31206071376800537, + "learning_rate": 6.950333874070094e-08, + "loss": 0.1846, + "step": 11325 + }, + { + "epoch": 3.01383714741884, + "grad_norm": 0.28926628828048706, + "learning_rate": 6.948725522375731e-08, + "loss": 0.1896, + "step": 11326 + }, + { + "epoch": 3.0141032464076636, + "grad_norm": 0.2594873011112213, + "learning_rate": 6.947117257709914e-08, + "loss": 0.168, + "step": 11327 + }, + { + "epoch": 3.0143693453964877, + "grad_norm": 0.453885942697525, + "learning_rate": 6.945509080118516e-08, + "loss": 0.1876, + "step": 11328 + }, + { + "epoch": 3.0146354443853114, + "grad_norm": 0.28895094990730286, + "learning_rate": 6.943900989647407e-08, + "loss": 0.1808, + "step": 11329 + }, + { + "epoch": 3.014901543374135, + "grad_norm": 0.26419758796691895, + "learning_rate": 6.942292986342455e-08, + "loss": 0.1698, + "step": 11330 + }, + { + "epoch": 3.015167642362959, + "grad_norm": 0.3999006748199463, + "learning_rate": 6.940685070249517e-08, + "loss": 0.1914, + "step": 11331 + }, + { + "epoch": 3.015433741351783, + "grad_norm": 0.353606253862381, + "learning_rate": 6.939077241414459e-08, + "loss": 0.1779, + "step": 11332 + }, + { + "epoch": 3.0156998403406066, + "grad_norm": 0.3085554540157318, + "learning_rate": 6.937469499883138e-08, + "loss": 0.1741, + "step": 11333 + }, + { + "epoch": 3.0159659393294307, + "grad_norm": 0.30376091599464417, + "learning_rate": 6.93586184570141e-08, + "loss": 0.1952, + "step": 11334 + }, + { + "epoch": 3.0162320383182544, + "grad_norm": 0.3803359270095825, + "learning_rate": 6.934254278915128e-08, + "loss": 0.1821, + "step": 11335 + }, + { + "epoch": 3.016498137307078, + "grad_norm": 0.3849638104438782, + "learning_rate": 6.932646799570143e-08, + "loss": 0.1849, + "step": 11336 + }, + { + "epoch": 3.016764236295902, + "grad_norm": 0.35349181294441223, + "learning_rate": 6.93103940771231e-08, + "loss": 0.1899, + "step": 11337 + }, + { + "epoch": 3.017030335284726, + "grad_norm": 0.3399497866630554, + "learning_rate": 6.929432103387462e-08, + "loss": 0.1857, + "step": 11338 + }, + { + "epoch": 3.0172964342735495, + "grad_norm": 0.33459338545799255, + "learning_rate": 6.927824886641456e-08, + "loss": 0.1769, + "step": 11339 + }, + { + "epoch": 3.0175625332623737, + "grad_norm": 0.25926241278648376, + "learning_rate": 6.926217757520124e-08, + "loss": 0.1749, + "step": 11340 + }, + { + "epoch": 3.0178286322511974, + "grad_norm": 0.4154568612575531, + "learning_rate": 6.92461071606931e-08, + "loss": 0.1851, + "step": 11341 + }, + { + "epoch": 3.0180947312400215, + "grad_norm": 0.3334653377532959, + "learning_rate": 6.923003762334845e-08, + "loss": 0.1888, + "step": 11342 + }, + { + "epoch": 3.018360830228845, + "grad_norm": 0.2979173958301544, + "learning_rate": 6.921396896362569e-08, + "loss": 0.1768, + "step": 11343 + }, + { + "epoch": 3.018626929217669, + "grad_norm": 0.390371710062027, + "learning_rate": 6.919790118198305e-08, + "loss": 0.1788, + "step": 11344 + }, + { + "epoch": 3.018893028206493, + "grad_norm": 0.28629207611083984, + "learning_rate": 6.918183427887891e-08, + "loss": 0.1906, + "step": 11345 + }, + { + "epoch": 3.0191591271953166, + "grad_norm": 0.26444557309150696, + "learning_rate": 6.916576825477143e-08, + "loss": 0.163, + "step": 11346 + }, + { + "epoch": 3.0194252261841403, + "grad_norm": 0.36419516801834106, + "learning_rate": 6.914970311011896e-08, + "loss": 0.1655, + "step": 11347 + }, + { + "epoch": 3.0196913251729645, + "grad_norm": 0.2825525999069214, + "learning_rate": 6.913363884537959e-08, + "loss": 0.1642, + "step": 11348 + }, + { + "epoch": 3.019957424161788, + "grad_norm": 0.3770568072795868, + "learning_rate": 6.91175754610116e-08, + "loss": 0.1758, + "step": 11349 + }, + { + "epoch": 3.020223523150612, + "grad_norm": 0.34133458137512207, + "learning_rate": 6.91015129574731e-08, + "loss": 0.1902, + "step": 11350 + }, + { + "epoch": 3.020489622139436, + "grad_norm": 0.2950696349143982, + "learning_rate": 6.908545133522226e-08, + "loss": 0.1943, + "step": 11351 + }, + { + "epoch": 3.0207557211282596, + "grad_norm": 0.38032788038253784, + "learning_rate": 6.906939059471715e-08, + "loss": 0.1724, + "step": 11352 + }, + { + "epoch": 3.0210218201170838, + "grad_norm": 0.46660953760147095, + "learning_rate": 6.905333073641587e-08, + "loss": 0.1846, + "step": 11353 + }, + { + "epoch": 3.0212879191059074, + "grad_norm": 0.3380657434463501, + "learning_rate": 6.903727176077647e-08, + "loss": 0.1734, + "step": 11354 + }, + { + "epoch": 3.021554018094731, + "grad_norm": 0.2997693419456482, + "learning_rate": 6.902121366825702e-08, + "loss": 0.186, + "step": 11355 + }, + { + "epoch": 3.0218201170835552, + "grad_norm": 0.34130051732063293, + "learning_rate": 6.900515645931553e-08, + "loss": 0.1928, + "step": 11356 + }, + { + "epoch": 3.022086216072379, + "grad_norm": 0.3483177125453949, + "learning_rate": 6.898910013440992e-08, + "loss": 0.1885, + "step": 11357 + }, + { + "epoch": 3.0223523150612026, + "grad_norm": 0.3417685329914093, + "learning_rate": 6.897304469399821e-08, + "loss": 0.1689, + "step": 11358 + }, + { + "epoch": 3.0226184140500267, + "grad_norm": 0.46542203426361084, + "learning_rate": 6.89569901385383e-08, + "loss": 0.1958, + "step": 11359 + }, + { + "epoch": 3.0228845130388504, + "grad_norm": 0.3693368434906006, + "learning_rate": 6.894093646848814e-08, + "loss": 0.1645, + "step": 11360 + }, + { + "epoch": 3.023150612027674, + "grad_norm": 0.26825129985809326, + "learning_rate": 6.892488368430556e-08, + "loss": 0.1655, + "step": 11361 + }, + { + "epoch": 3.0234167110164982, + "grad_norm": 0.28481435775756836, + "learning_rate": 6.890883178644849e-08, + "loss": 0.1596, + "step": 11362 + }, + { + "epoch": 3.023682810005322, + "grad_norm": 0.28954577445983887, + "learning_rate": 6.889278077537468e-08, + "loss": 0.1857, + "step": 11363 + }, + { + "epoch": 3.023948908994146, + "grad_norm": 0.28064146637916565, + "learning_rate": 6.887673065154198e-08, + "loss": 0.1939, + "step": 11364 + }, + { + "epoch": 3.0242150079829697, + "grad_norm": 0.4099082052707672, + "learning_rate": 6.886068141540816e-08, + "loss": 0.1799, + "step": 11365 + }, + { + "epoch": 3.0244811069717934, + "grad_norm": 0.33845794200897217, + "learning_rate": 6.884463306743102e-08, + "loss": 0.196, + "step": 11366 + }, + { + "epoch": 3.0247472059606175, + "grad_norm": 0.3677947223186493, + "learning_rate": 6.882858560806821e-08, + "loss": 0.2092, + "step": 11367 + }, + { + "epoch": 3.025013304949441, + "grad_norm": 0.38556379079818726, + "learning_rate": 6.881253903777751e-08, + "loss": 0.172, + "step": 11368 + }, + { + "epoch": 3.025279403938265, + "grad_norm": 0.27108925580978394, + "learning_rate": 6.879649335701656e-08, + "loss": 0.1634, + "step": 11369 + }, + { + "epoch": 3.025545502927089, + "grad_norm": 0.25989824533462524, + "learning_rate": 6.878044856624306e-08, + "loss": 0.1706, + "step": 11370 + }, + { + "epoch": 3.0258116019159127, + "grad_norm": 0.3360441327095032, + "learning_rate": 6.876440466591454e-08, + "loss": 0.1811, + "step": 11371 + }, + { + "epoch": 3.0260777009047364, + "grad_norm": 0.2662251591682434, + "learning_rate": 6.874836165648874e-08, + "loss": 0.1668, + "step": 11372 + }, + { + "epoch": 3.0263437998935605, + "grad_norm": 0.31968173384666443, + "learning_rate": 6.873231953842314e-08, + "loss": 0.1845, + "step": 11373 + }, + { + "epoch": 3.026609898882384, + "grad_norm": 0.3375992178916931, + "learning_rate": 6.871627831217531e-08, + "loss": 0.1743, + "step": 11374 + }, + { + "epoch": 3.0268759978712083, + "grad_norm": 0.29918941855430603, + "learning_rate": 6.870023797820283e-08, + "loss": 0.1742, + "step": 11375 + }, + { + "epoch": 3.027142096860032, + "grad_norm": 0.2795722782611847, + "learning_rate": 6.868419853696313e-08, + "loss": 0.1777, + "step": 11376 + }, + { + "epoch": 3.0274081958488557, + "grad_norm": 0.2811254858970642, + "learning_rate": 6.866815998891378e-08, + "loss": 0.1815, + "step": 11377 + }, + { + "epoch": 3.02767429483768, + "grad_norm": 0.2641691267490387, + "learning_rate": 6.865212233451212e-08, + "loss": 0.16, + "step": 11378 + }, + { + "epoch": 3.0279403938265035, + "grad_norm": 0.2698999345302582, + "learning_rate": 6.863608557421568e-08, + "loss": 0.1624, + "step": 11379 + }, + { + "epoch": 3.028206492815327, + "grad_norm": 0.3485264778137207, + "learning_rate": 6.862004970848176e-08, + "loss": 0.1815, + "step": 11380 + }, + { + "epoch": 3.0284725918041513, + "grad_norm": 0.28547394275665283, + "learning_rate": 6.860401473776785e-08, + "loss": 0.1673, + "step": 11381 + }, + { + "epoch": 3.028738690792975, + "grad_norm": 0.25759243965148926, + "learning_rate": 6.85879806625312e-08, + "loss": 0.169, + "step": 11382 + }, + { + "epoch": 3.0290047897817987, + "grad_norm": 0.2969092130661011, + "learning_rate": 6.857194748322919e-08, + "loss": 0.173, + "step": 11383 + }, + { + "epoch": 3.029270888770623, + "grad_norm": 0.3574209213256836, + "learning_rate": 6.855591520031907e-08, + "loss": 0.1849, + "step": 11384 + }, + { + "epoch": 3.0295369877594465, + "grad_norm": 0.3357173502445221, + "learning_rate": 6.853988381425819e-08, + "loss": 0.1776, + "step": 11385 + }, + { + "epoch": 3.02980308674827, + "grad_norm": 0.3584473431110382, + "learning_rate": 6.852385332550372e-08, + "loss": 0.1808, + "step": 11386 + }, + { + "epoch": 3.0300691857370943, + "grad_norm": 0.29029521346092224, + "learning_rate": 6.850782373451296e-08, + "loss": 0.1723, + "step": 11387 + }, + { + "epoch": 3.030335284725918, + "grad_norm": 0.2583891451358795, + "learning_rate": 6.849179504174301e-08, + "loss": 0.1603, + "step": 11388 + }, + { + "epoch": 3.030601383714742, + "grad_norm": 0.2864086329936981, + "learning_rate": 6.847576724765112e-08, + "loss": 0.1765, + "step": 11389 + }, + { + "epoch": 3.0308674827035658, + "grad_norm": 0.3793181777000427, + "learning_rate": 6.84597403526944e-08, + "loss": 0.1783, + "step": 11390 + }, + { + "epoch": 3.0311335816923894, + "grad_norm": 0.2649883031845093, + "learning_rate": 6.844371435732996e-08, + "loss": 0.1538, + "step": 11391 + }, + { + "epoch": 3.0313996806812136, + "grad_norm": 0.4264189600944519, + "learning_rate": 6.842768926201497e-08, + "loss": 0.1836, + "step": 11392 + }, + { + "epoch": 3.0316657796700373, + "grad_norm": 0.3849910795688629, + "learning_rate": 6.841166506720639e-08, + "loss": 0.1609, + "step": 11393 + }, + { + "epoch": 3.031931878658861, + "grad_norm": 0.2838071882724762, + "learning_rate": 6.839564177336134e-08, + "loss": 0.1701, + "step": 11394 + }, + { + "epoch": 3.032197977647685, + "grad_norm": 0.4075479805469513, + "learning_rate": 6.837961938093676e-08, + "loss": 0.178, + "step": 11395 + }, + { + "epoch": 3.0324640766365087, + "grad_norm": 0.3619602918624878, + "learning_rate": 6.836359789038977e-08, + "loss": 0.169, + "step": 11396 + }, + { + "epoch": 3.0327301756253324, + "grad_norm": 0.28038835525512695, + "learning_rate": 6.834757730217719e-08, + "loss": 0.1693, + "step": 11397 + }, + { + "epoch": 3.0329962746141566, + "grad_norm": 0.35130801796913147, + "learning_rate": 6.833155761675607e-08, + "loss": 0.1919, + "step": 11398 + }, + { + "epoch": 3.0332623736029802, + "grad_norm": 0.27555564045906067, + "learning_rate": 6.831553883458325e-08, + "loss": 0.1551, + "step": 11399 + }, + { + "epoch": 3.0335284725918044, + "grad_norm": 0.3581506013870239, + "learning_rate": 6.829952095611567e-08, + "loss": 0.1858, + "step": 11400 + }, + { + "epoch": 3.033794571580628, + "grad_norm": 0.4124397337436676, + "learning_rate": 6.828350398181013e-08, + "loss": 0.189, + "step": 11401 + }, + { + "epoch": 3.0340606705694517, + "grad_norm": 0.2545134425163269, + "learning_rate": 6.826748791212358e-08, + "loss": 0.1663, + "step": 11402 + }, + { + "epoch": 3.034326769558276, + "grad_norm": 0.2633861303329468, + "learning_rate": 6.825147274751269e-08, + "loss": 0.1562, + "step": 11403 + }, + { + "epoch": 3.0345928685470995, + "grad_norm": 0.2910088896751404, + "learning_rate": 6.823545848843434e-08, + "loss": 0.1903, + "step": 11404 + }, + { + "epoch": 3.034858967535923, + "grad_norm": 0.2622862160205841, + "learning_rate": 6.821944513534526e-08, + "loss": 0.161, + "step": 11405 + }, + { + "epoch": 3.0351250665247473, + "grad_norm": 0.3105517625808716, + "learning_rate": 6.820343268870221e-08, + "loss": 0.1988, + "step": 11406 + }, + { + "epoch": 3.035391165513571, + "grad_norm": 0.3234846591949463, + "learning_rate": 6.818742114896184e-08, + "loss": 0.1641, + "step": 11407 + }, + { + "epoch": 3.0356572645023947, + "grad_norm": 0.25850656628608704, + "learning_rate": 6.817141051658088e-08, + "loss": 0.1747, + "step": 11408 + }, + { + "epoch": 3.035923363491219, + "grad_norm": 0.3436259925365448, + "learning_rate": 6.815540079201596e-08, + "loss": 0.1798, + "step": 11409 + }, + { + "epoch": 3.0361894624800425, + "grad_norm": 0.2566177546977997, + "learning_rate": 6.813939197572371e-08, + "loss": 0.1577, + "step": 11410 + }, + { + "epoch": 3.036455561468866, + "grad_norm": 0.2711499035358429, + "learning_rate": 6.812338406816077e-08, + "loss": 0.1883, + "step": 11411 + }, + { + "epoch": 3.0367216604576903, + "grad_norm": 0.38158726692199707, + "learning_rate": 6.810737706978366e-08, + "loss": 0.1852, + "step": 11412 + }, + { + "epoch": 3.036987759446514, + "grad_norm": 0.2753658890724182, + "learning_rate": 6.809137098104902e-08, + "loss": 0.1761, + "step": 11413 + }, + { + "epoch": 3.037253858435338, + "grad_norm": 0.2983064353466034, + "learning_rate": 6.807536580241328e-08, + "loss": 0.1677, + "step": 11414 + }, + { + "epoch": 3.037519957424162, + "grad_norm": 0.3380524218082428, + "learning_rate": 6.8059361534333e-08, + "loss": 0.1903, + "step": 11415 + }, + { + "epoch": 3.0377860564129855, + "grad_norm": 0.28073787689208984, + "learning_rate": 6.80433581772646e-08, + "loss": 0.1586, + "step": 11416 + }, + { + "epoch": 3.0380521554018096, + "grad_norm": 0.3055996596813202, + "learning_rate": 6.802735573166463e-08, + "loss": 0.192, + "step": 11417 + }, + { + "epoch": 3.0383182543906333, + "grad_norm": 0.287840336561203, + "learning_rate": 6.80113541979894e-08, + "loss": 0.194, + "step": 11418 + }, + { + "epoch": 3.038584353379457, + "grad_norm": 0.3186977803707123, + "learning_rate": 6.799535357669537e-08, + "loss": 0.1747, + "step": 11419 + }, + { + "epoch": 3.038850452368281, + "grad_norm": 0.379061222076416, + "learning_rate": 6.797935386823887e-08, + "loss": 0.1912, + "step": 11420 + }, + { + "epoch": 3.039116551357105, + "grad_norm": 0.27992674708366394, + "learning_rate": 6.79633550730763e-08, + "loss": 0.1846, + "step": 11421 + }, + { + "epoch": 3.0393826503459285, + "grad_norm": 0.3240988850593567, + "learning_rate": 6.794735719166389e-08, + "loss": 0.1644, + "step": 11422 + }, + { + "epoch": 3.0396487493347526, + "grad_norm": 0.35884565114974976, + "learning_rate": 6.793136022445805e-08, + "loss": 0.1824, + "step": 11423 + }, + { + "epoch": 3.0399148483235763, + "grad_norm": 0.3515612483024597, + "learning_rate": 6.791536417191495e-08, + "loss": 0.1644, + "step": 11424 + }, + { + "epoch": 3.0401809473124004, + "grad_norm": 0.4101440906524658, + "learning_rate": 6.789936903449087e-08, + "loss": 0.1728, + "step": 11425 + }, + { + "epoch": 3.040447046301224, + "grad_norm": 0.2677772641181946, + "learning_rate": 6.788337481264199e-08, + "loss": 0.2034, + "step": 11426 + }, + { + "epoch": 3.0407131452900478, + "grad_norm": 0.2559507191181183, + "learning_rate": 6.786738150682453e-08, + "loss": 0.1615, + "step": 11427 + }, + { + "epoch": 3.040979244278872, + "grad_norm": 0.3427935242652893, + "learning_rate": 6.78513891174947e-08, + "loss": 0.1573, + "step": 11428 + }, + { + "epoch": 3.0412453432676956, + "grad_norm": 0.33017808198928833, + "learning_rate": 6.783539764510853e-08, + "loss": 0.1825, + "step": 11429 + }, + { + "epoch": 3.0415114422565193, + "grad_norm": 0.26985979080200195, + "learning_rate": 6.78194070901222e-08, + "loss": 0.1688, + "step": 11430 + }, + { + "epoch": 3.0417775412453434, + "grad_norm": 0.4411875903606415, + "learning_rate": 6.780341745299176e-08, + "loss": 0.1958, + "step": 11431 + }, + { + "epoch": 3.042043640234167, + "grad_norm": 0.47399744391441345, + "learning_rate": 6.778742873417331e-08, + "loss": 0.2115, + "step": 11432 + }, + { + "epoch": 3.0423097392229907, + "grad_norm": 0.29933738708496094, + "learning_rate": 6.777144093412284e-08, + "loss": 0.1892, + "step": 11433 + }, + { + "epoch": 3.042575838211815, + "grad_norm": 0.275136262178421, + "learning_rate": 6.775545405329635e-08, + "loss": 0.1804, + "step": 11434 + }, + { + "epoch": 3.0428419372006386, + "grad_norm": 0.35907015204429626, + "learning_rate": 6.773946809214984e-08, + "loss": 0.1906, + "step": 11435 + }, + { + "epoch": 3.0431080361894627, + "grad_norm": 0.4220466911792755, + "learning_rate": 6.772348305113927e-08, + "loss": 0.2004, + "step": 11436 + }, + { + "epoch": 3.0433741351782864, + "grad_norm": 0.42235246300697327, + "learning_rate": 6.770749893072052e-08, + "loss": 0.1793, + "step": 11437 + }, + { + "epoch": 3.04364023416711, + "grad_norm": 0.3157483637332916, + "learning_rate": 6.769151573134956e-08, + "loss": 0.1766, + "step": 11438 + }, + { + "epoch": 3.043906333155934, + "grad_norm": 0.33846989274024963, + "learning_rate": 6.76755334534822e-08, + "loss": 0.1817, + "step": 11439 + }, + { + "epoch": 3.044172432144758, + "grad_norm": 0.3244176208972931, + "learning_rate": 6.765955209757433e-08, + "loss": 0.1881, + "step": 11440 + }, + { + "epoch": 3.0444385311335815, + "grad_norm": 0.3580017387866974, + "learning_rate": 6.764357166408174e-08, + "loss": 0.1741, + "step": 11441 + }, + { + "epoch": 3.0447046301224057, + "grad_norm": 0.40683436393737793, + "learning_rate": 6.762759215346027e-08, + "loss": 0.1881, + "step": 11442 + }, + { + "epoch": 3.0449707291112293, + "grad_norm": 0.31839293241500854, + "learning_rate": 6.76116135661656e-08, + "loss": 0.1707, + "step": 11443 + }, + { + "epoch": 3.045236828100053, + "grad_norm": 0.3884865939617157, + "learning_rate": 6.759563590265358e-08, + "loss": 0.1825, + "step": 11444 + }, + { + "epoch": 3.045502927088877, + "grad_norm": 0.3333714008331299, + "learning_rate": 6.757965916337983e-08, + "loss": 0.1863, + "step": 11445 + }, + { + "epoch": 3.045769026077701, + "grad_norm": 0.3561297059059143, + "learning_rate": 6.756368334880008e-08, + "loss": 0.1693, + "step": 11446 + }, + { + "epoch": 3.046035125066525, + "grad_norm": 0.3088895082473755, + "learning_rate": 6.754770845937003e-08, + "loss": 0.196, + "step": 11447 + }, + { + "epoch": 3.0463012240553486, + "grad_norm": 0.28274399042129517, + "learning_rate": 6.753173449554526e-08, + "loss": 0.1658, + "step": 11448 + }, + { + "epoch": 3.0465673230441723, + "grad_norm": 0.4183090329170227, + "learning_rate": 6.751576145778142e-08, + "loss": 0.2004, + "step": 11449 + }, + { + "epoch": 3.0468334220329965, + "grad_norm": 0.5249990820884705, + "learning_rate": 6.749978934653405e-08, + "loss": 0.1917, + "step": 11450 + }, + { + "epoch": 3.04709952102182, + "grad_norm": 0.27046483755111694, + "learning_rate": 6.748381816225874e-08, + "loss": 0.1778, + "step": 11451 + }, + { + "epoch": 3.047365620010644, + "grad_norm": 0.25562676787376404, + "learning_rate": 6.746784790541101e-08, + "loss": 0.164, + "step": 11452 + }, + { + "epoch": 3.047631718999468, + "grad_norm": 0.25910255312919617, + "learning_rate": 6.74518785764464e-08, + "loss": 0.1655, + "step": 11453 + }, + { + "epoch": 3.0478978179882916, + "grad_norm": 0.2763303816318512, + "learning_rate": 6.743591017582031e-08, + "loss": 0.1714, + "step": 11454 + }, + { + "epoch": 3.0481639169771153, + "grad_norm": 0.31486329436302185, + "learning_rate": 6.741994270398825e-08, + "loss": 0.1748, + "step": 11455 + }, + { + "epoch": 3.0484300159659394, + "grad_norm": 0.2834235727787018, + "learning_rate": 6.740397616140563e-08, + "loss": 0.1735, + "step": 11456 + }, + { + "epoch": 3.048696114954763, + "grad_norm": 0.3136415183544159, + "learning_rate": 6.738801054852787e-08, + "loss": 0.1923, + "step": 11457 + }, + { + "epoch": 3.048962213943587, + "grad_norm": 0.3492773771286011, + "learning_rate": 6.737204586581029e-08, + "loss": 0.1897, + "step": 11458 + }, + { + "epoch": 3.049228312932411, + "grad_norm": 0.3046285808086395, + "learning_rate": 6.735608211370827e-08, + "loss": 0.1779, + "step": 11459 + }, + { + "epoch": 3.0494944119212346, + "grad_norm": 0.2688966691493988, + "learning_rate": 6.734011929267711e-08, + "loss": 0.1769, + "step": 11460 + }, + { + "epoch": 3.0497605109100587, + "grad_norm": 0.29492413997650146, + "learning_rate": 6.732415740317215e-08, + "loss": 0.1694, + "step": 11461 + }, + { + "epoch": 3.0500266098988824, + "grad_norm": 0.396960973739624, + "learning_rate": 6.730819644564858e-08, + "loss": 0.2067, + "step": 11462 + }, + { + "epoch": 3.050292708887706, + "grad_norm": 0.28778716921806335, + "learning_rate": 6.729223642056175e-08, + "loss": 0.1557, + "step": 11463 + }, + { + "epoch": 3.05055880787653, + "grad_norm": 0.2869500517845154, + "learning_rate": 6.727627732836673e-08, + "loss": 0.1846, + "step": 11464 + }, + { + "epoch": 3.050824906865354, + "grad_norm": 0.4305046498775482, + "learning_rate": 6.72603191695188e-08, + "loss": 0.1925, + "step": 11465 + }, + { + "epoch": 3.0510910058541776, + "grad_norm": 0.29095223546028137, + "learning_rate": 6.72443619444731e-08, + "loss": 0.1833, + "step": 11466 + }, + { + "epoch": 3.0513571048430017, + "grad_norm": 0.2829912602901459, + "learning_rate": 6.722840565368474e-08, + "loss": 0.1782, + "step": 11467 + }, + { + "epoch": 3.0516232038318254, + "grad_norm": 0.26966461539268494, + "learning_rate": 6.721245029760891e-08, + "loss": 0.1906, + "step": 11468 + }, + { + "epoch": 3.051889302820649, + "grad_norm": 0.26177114248275757, + "learning_rate": 6.719649587670058e-08, + "loss": 0.171, + "step": 11469 + }, + { + "epoch": 3.052155401809473, + "grad_norm": 0.34620019793510437, + "learning_rate": 6.718054239141484e-08, + "loss": 0.1884, + "step": 11470 + }, + { + "epoch": 3.052421500798297, + "grad_norm": 0.2802492082118988, + "learning_rate": 6.716458984220672e-08, + "loss": 0.1744, + "step": 11471 + }, + { + "epoch": 3.052687599787121, + "grad_norm": 0.29337960481643677, + "learning_rate": 6.714863822953128e-08, + "loss": 0.1753, + "step": 11472 + }, + { + "epoch": 3.0529536987759447, + "grad_norm": 0.3811142146587372, + "learning_rate": 6.713268755384334e-08, + "loss": 0.1775, + "step": 11473 + }, + { + "epoch": 3.0532197977647684, + "grad_norm": 0.2706407308578491, + "learning_rate": 6.711673781559804e-08, + "loss": 0.1925, + "step": 11474 + }, + { + "epoch": 3.0534858967535925, + "grad_norm": 0.27826452255249023, + "learning_rate": 6.710078901525013e-08, + "loss": 0.1798, + "step": 11475 + }, + { + "epoch": 3.053751995742416, + "grad_norm": 0.46716630458831787, + "learning_rate": 6.708484115325458e-08, + "loss": 0.1841, + "step": 11476 + }, + { + "epoch": 3.05401809473124, + "grad_norm": 0.3537292778491974, + "learning_rate": 6.706889423006626e-08, + "loss": 0.1758, + "step": 11477 + }, + { + "epoch": 3.054284193720064, + "grad_norm": 0.2563280463218689, + "learning_rate": 6.705294824614003e-08, + "loss": 0.1682, + "step": 11478 + }, + { + "epoch": 3.0545502927088877, + "grad_norm": 0.2656494379043579, + "learning_rate": 6.703700320193062e-08, + "loss": 0.167, + "step": 11479 + }, + { + "epoch": 3.0548163916977114, + "grad_norm": 0.3150673508644104, + "learning_rate": 6.702105909789289e-08, + "loss": 0.1519, + "step": 11480 + }, + { + "epoch": 3.0550824906865355, + "grad_norm": 0.26549413800239563, + "learning_rate": 6.700511593448153e-08, + "loss": 0.1802, + "step": 11481 + }, + { + "epoch": 3.055348589675359, + "grad_norm": 0.27086156606674194, + "learning_rate": 6.698917371215134e-08, + "loss": 0.1679, + "step": 11482 + }, + { + "epoch": 3.0556146886641833, + "grad_norm": 0.2826789915561676, + "learning_rate": 6.697323243135702e-08, + "loss": 0.191, + "step": 11483 + }, + { + "epoch": 3.055880787653007, + "grad_norm": 0.2739424407482147, + "learning_rate": 6.695729209255318e-08, + "loss": 0.1853, + "step": 11484 + }, + { + "epoch": 3.0561468866418307, + "grad_norm": 0.2920622229576111, + "learning_rate": 6.694135269619455e-08, + "loss": 0.1855, + "step": 11485 + }, + { + "epoch": 3.0564129856306548, + "grad_norm": 0.2843305766582489, + "learning_rate": 6.692541424273569e-08, + "loss": 0.1771, + "step": 11486 + }, + { + "epoch": 3.0566790846194785, + "grad_norm": 0.3483114242553711, + "learning_rate": 6.690947673263125e-08, + "loss": 0.1823, + "step": 11487 + }, + { + "epoch": 3.056945183608302, + "grad_norm": 0.30826133489608765, + "learning_rate": 6.689354016633575e-08, + "loss": 0.1765, + "step": 11488 + }, + { + "epoch": 3.0572112825971263, + "grad_norm": 0.338001012802124, + "learning_rate": 6.687760454430383e-08, + "loss": 0.1766, + "step": 11489 + }, + { + "epoch": 3.05747738158595, + "grad_norm": 0.2816598117351532, + "learning_rate": 6.686166986698986e-08, + "loss": 0.1823, + "step": 11490 + }, + { + "epoch": 3.0577434805747736, + "grad_norm": 0.306831032037735, + "learning_rate": 6.684573613484842e-08, + "loss": 0.1598, + "step": 11491 + }, + { + "epoch": 3.0580095795635978, + "grad_norm": 0.2706563472747803, + "learning_rate": 6.682980334833397e-08, + "loss": 0.1661, + "step": 11492 + }, + { + "epoch": 3.0582756785524214, + "grad_norm": 0.28401675820350647, + "learning_rate": 6.681387150790097e-08, + "loss": 0.1714, + "step": 11493 + }, + { + "epoch": 3.0585417775412456, + "grad_norm": 0.3706303536891937, + "learning_rate": 6.679794061400374e-08, + "loss": 0.1817, + "step": 11494 + }, + { + "epoch": 3.0588078765300692, + "grad_norm": 0.42236170172691345, + "learning_rate": 6.678201066709674e-08, + "loss": 0.1808, + "step": 11495 + }, + { + "epoch": 3.059073975518893, + "grad_norm": 0.40665575861930847, + "learning_rate": 6.676608166763426e-08, + "loss": 0.183, + "step": 11496 + }, + { + "epoch": 3.059340074507717, + "grad_norm": 0.3916824758052826, + "learning_rate": 6.675015361607072e-08, + "loss": 0.1643, + "step": 11497 + }, + { + "epoch": 3.0596061734965407, + "grad_norm": 0.29449814558029175, + "learning_rate": 6.673422651286031e-08, + "loss": 0.176, + "step": 11498 + }, + { + "epoch": 3.0598722724853644, + "grad_norm": 0.38101956248283386, + "learning_rate": 6.671830035845741e-08, + "loss": 0.1952, + "step": 11499 + }, + { + "epoch": 3.0601383714741885, + "grad_norm": 0.3784218728542328, + "learning_rate": 6.670237515331619e-08, + "loss": 0.1883, + "step": 11500 + }, + { + "epoch": 3.0604044704630122, + "grad_norm": 0.27497297525405884, + "learning_rate": 6.668645089789088e-08, + "loss": 0.1851, + "step": 11501 + }, + { + "epoch": 3.060670569451836, + "grad_norm": 0.39149829745292664, + "learning_rate": 6.66705275926357e-08, + "loss": 0.1809, + "step": 11502 + }, + { + "epoch": 3.06093666844066, + "grad_norm": 0.3628574311733246, + "learning_rate": 6.66546052380048e-08, + "loss": 0.1761, + "step": 11503 + }, + { + "epoch": 3.0612027674294837, + "grad_norm": 0.2568015158176422, + "learning_rate": 6.663868383445234e-08, + "loss": 0.1566, + "step": 11504 + }, + { + "epoch": 3.0614688664183074, + "grad_norm": 0.26384034752845764, + "learning_rate": 6.66227633824324e-08, + "loss": 0.1595, + "step": 11505 + }, + { + "epoch": 3.0617349654071315, + "grad_norm": 0.27351903915405273, + "learning_rate": 6.660684388239908e-08, + "loss": 0.1796, + "step": 11506 + }, + { + "epoch": 3.062001064395955, + "grad_norm": 0.39198359847068787, + "learning_rate": 6.65909253348064e-08, + "loss": 0.182, + "step": 11507 + }, + { + "epoch": 3.0622671633847793, + "grad_norm": 0.2791697382926941, + "learning_rate": 6.657500774010847e-08, + "loss": 0.1756, + "step": 11508 + }, + { + "epoch": 3.062533262373603, + "grad_norm": 0.3248801529407501, + "learning_rate": 6.655909109875919e-08, + "loss": 0.1678, + "step": 11509 + }, + { + "epoch": 3.0627993613624267, + "grad_norm": 0.2755732238292694, + "learning_rate": 6.654317541121262e-08, + "loss": 0.1577, + "step": 11510 + }, + { + "epoch": 3.063065460351251, + "grad_norm": 0.27307188510894775, + "learning_rate": 6.652726067792266e-08, + "loss": 0.1845, + "step": 11511 + }, + { + "epoch": 3.0633315593400745, + "grad_norm": 0.3016586899757385, + "learning_rate": 6.651134689934326e-08, + "loss": 0.1791, + "step": 11512 + }, + { + "epoch": 3.063597658328898, + "grad_norm": 0.25245922803878784, + "learning_rate": 6.649543407592827e-08, + "loss": 0.1629, + "step": 11513 + }, + { + "epoch": 3.0638637573177223, + "grad_norm": 0.2758750319480896, + "learning_rate": 6.647952220813162e-08, + "loss": 0.1815, + "step": 11514 + }, + { + "epoch": 3.064129856306546, + "grad_norm": 0.2930145263671875, + "learning_rate": 6.64636112964071e-08, + "loss": 0.1634, + "step": 11515 + }, + { + "epoch": 3.0643959552953697, + "grad_norm": 0.26474300026893616, + "learning_rate": 6.644770134120854e-08, + "loss": 0.1712, + "step": 11516 + }, + { + "epoch": 3.064662054284194, + "grad_norm": 0.32712942361831665, + "learning_rate": 6.64317923429897e-08, + "loss": 0.173, + "step": 11517 + }, + { + "epoch": 3.0649281532730175, + "grad_norm": 0.25520601868629456, + "learning_rate": 6.641588430220438e-08, + "loss": 0.1674, + "step": 11518 + }, + { + "epoch": 3.0651942522618416, + "grad_norm": 0.27399152517318726, + "learning_rate": 6.639997721930626e-08, + "loss": 0.1772, + "step": 11519 + }, + { + "epoch": 3.0654603512506653, + "grad_norm": 0.27241289615631104, + "learning_rate": 6.638407109474907e-08, + "loss": 0.1692, + "step": 11520 + }, + { + "epoch": 3.065726450239489, + "grad_norm": 0.31003984808921814, + "learning_rate": 6.636816592898651e-08, + "loss": 0.1683, + "step": 11521 + }, + { + "epoch": 3.065992549228313, + "grad_norm": 2.578995943069458, + "learning_rate": 6.635226172247216e-08, + "loss": 0.1733, + "step": 11522 + }, + { + "epoch": 3.066258648217137, + "grad_norm": 0.26626521348953247, + "learning_rate": 6.633635847565975e-08, + "loss": 0.1671, + "step": 11523 + }, + { + "epoch": 3.0665247472059605, + "grad_norm": 0.2852923572063446, + "learning_rate": 6.632045618900271e-08, + "loss": 0.19, + "step": 11524 + }, + { + "epoch": 3.0667908461947846, + "grad_norm": 0.27630552649497986, + "learning_rate": 6.630455486295479e-08, + "loss": 0.1849, + "step": 11525 + }, + { + "epoch": 3.0670569451836083, + "grad_norm": 0.26505687832832336, + "learning_rate": 6.628865449796938e-08, + "loss": 0.1731, + "step": 11526 + }, + { + "epoch": 3.067323044172432, + "grad_norm": 0.2991253137588501, + "learning_rate": 6.627275509450008e-08, + "loss": 0.1663, + "step": 11527 + }, + { + "epoch": 3.067589143161256, + "grad_norm": 0.38557955622673035, + "learning_rate": 6.625685665300032e-08, + "loss": 0.1901, + "step": 11528 + }, + { + "epoch": 3.0678552421500798, + "grad_norm": 0.3135991394519806, + "learning_rate": 6.62409591739236e-08, + "loss": 0.1682, + "step": 11529 + }, + { + "epoch": 3.0681213411389034, + "grad_norm": 0.3076907694339752, + "learning_rate": 6.622506265772329e-08, + "loss": 0.1687, + "step": 11530 + }, + { + "epoch": 3.0683874401277276, + "grad_norm": 0.48492300510406494, + "learning_rate": 6.620916710485288e-08, + "loss": 0.178, + "step": 11531 + }, + { + "epoch": 3.0686535391165513, + "grad_norm": 0.27104511857032776, + "learning_rate": 6.619327251576562e-08, + "loss": 0.1687, + "step": 11532 + }, + { + "epoch": 3.0689196381053754, + "grad_norm": 0.26266905665397644, + "learning_rate": 6.6177378890915e-08, + "loss": 0.167, + "step": 11533 + }, + { + "epoch": 3.069185737094199, + "grad_norm": 0.3472118079662323, + "learning_rate": 6.616148623075422e-08, + "loss": 0.1703, + "step": 11534 + }, + { + "epoch": 3.0694518360830227, + "grad_norm": 0.274699866771698, + "learning_rate": 6.614559453573664e-08, + "loss": 0.1766, + "step": 11535 + }, + { + "epoch": 3.069717935071847, + "grad_norm": 0.279175341129303, + "learning_rate": 6.612970380631546e-08, + "loss": 0.1698, + "step": 11536 + }, + { + "epoch": 3.0699840340606706, + "grad_norm": 0.2971974313259125, + "learning_rate": 6.611381404294397e-08, + "loss": 0.1894, + "step": 11537 + }, + { + "epoch": 3.0702501330494942, + "grad_norm": 0.26665106415748596, + "learning_rate": 6.609792524607539e-08, + "loss": 0.1608, + "step": 11538 + }, + { + "epoch": 3.0705162320383184, + "grad_norm": 0.31075406074523926, + "learning_rate": 6.608203741616284e-08, + "loss": 0.1784, + "step": 11539 + }, + { + "epoch": 3.070782331027142, + "grad_norm": 0.2685352563858032, + "learning_rate": 6.606615055365956e-08, + "loss": 0.1925, + "step": 11540 + }, + { + "epoch": 3.0710484300159657, + "grad_norm": 0.28348681330680847, + "learning_rate": 6.605026465901857e-08, + "loss": 0.1805, + "step": 11541 + }, + { + "epoch": 3.07131452900479, + "grad_norm": 0.3110988736152649, + "learning_rate": 6.603437973269305e-08, + "loss": 0.1872, + "step": 11542 + }, + { + "epoch": 3.0715806279936135, + "grad_norm": 0.28461819887161255, + "learning_rate": 6.601849577513602e-08, + "loss": 0.1715, + "step": 11543 + }, + { + "epoch": 3.0718467269824377, + "grad_norm": 0.315843403339386, + "learning_rate": 6.600261278680059e-08, + "loss": 0.1764, + "step": 11544 + }, + { + "epoch": 3.0721128259712613, + "grad_norm": 0.3335183560848236, + "learning_rate": 6.598673076813967e-08, + "loss": 0.1762, + "step": 11545 + }, + { + "epoch": 3.072378924960085, + "grad_norm": 0.29439109563827515, + "learning_rate": 6.597084971960634e-08, + "loss": 0.1749, + "step": 11546 + }, + { + "epoch": 3.072645023948909, + "grad_norm": 0.4521283209323883, + "learning_rate": 6.595496964165351e-08, + "loss": 0.1953, + "step": 11547 + }, + { + "epoch": 3.072911122937733, + "grad_norm": 0.2794090509414673, + "learning_rate": 6.593909053473416e-08, + "loss": 0.1655, + "step": 11548 + }, + { + "epoch": 3.0731772219265565, + "grad_norm": 0.24812017381191254, + "learning_rate": 6.592321239930111e-08, + "loss": 0.1592, + "step": 11549 + }, + { + "epoch": 3.0734433209153806, + "grad_norm": 0.2822417914867401, + "learning_rate": 6.590733523580734e-08, + "loss": 0.1734, + "step": 11550 + }, + { + "epoch": 3.0737094199042043, + "grad_norm": 0.3310398459434509, + "learning_rate": 6.589145904470562e-08, + "loss": 0.187, + "step": 11551 + }, + { + "epoch": 3.073975518893028, + "grad_norm": 0.3302016258239746, + "learning_rate": 6.587558382644883e-08, + "loss": 0.1911, + "step": 11552 + }, + { + "epoch": 3.074241617881852, + "grad_norm": 0.2755812704563141, + "learning_rate": 6.585970958148969e-08, + "loss": 0.1821, + "step": 11553 + }, + { + "epoch": 3.074507716870676, + "grad_norm": 0.3677579164505005, + "learning_rate": 6.584383631028105e-08, + "loss": 0.1823, + "step": 11554 + }, + { + "epoch": 3.0747738158595, + "grad_norm": 0.3427356481552124, + "learning_rate": 6.582796401327557e-08, + "loss": 0.1789, + "step": 11555 + }, + { + "epoch": 3.0750399148483236, + "grad_norm": 0.34292128682136536, + "learning_rate": 6.5812092690926e-08, + "loss": 0.196, + "step": 11556 + }, + { + "epoch": 3.0753060138371473, + "grad_norm": 0.3629356622695923, + "learning_rate": 6.579622234368505e-08, + "loss": 0.2009, + "step": 11557 + }, + { + "epoch": 3.0755721128259714, + "grad_norm": 0.28353163599967957, + "learning_rate": 6.578035297200532e-08, + "loss": 0.1815, + "step": 11558 + }, + { + "epoch": 3.075838211814795, + "grad_norm": 0.33859848976135254, + "learning_rate": 6.57644845763395e-08, + "loss": 0.1723, + "step": 11559 + }, + { + "epoch": 3.076104310803619, + "grad_norm": 0.3344939649105072, + "learning_rate": 6.574861715714011e-08, + "loss": 0.176, + "step": 11560 + }, + { + "epoch": 3.076370409792443, + "grad_norm": 0.3777334690093994, + "learning_rate": 6.573275071485977e-08, + "loss": 0.179, + "step": 11561 + }, + { + "epoch": 3.0766365087812666, + "grad_norm": 0.3028768301010132, + "learning_rate": 6.5716885249951e-08, + "loss": 0.1804, + "step": 11562 + }, + { + "epoch": 3.0769026077700903, + "grad_norm": 0.25774911046028137, + "learning_rate": 6.570102076286635e-08, + "loss": 0.1653, + "step": 11563 + }, + { + "epoch": 3.0771687067589144, + "grad_norm": 0.266685426235199, + "learning_rate": 6.568515725405828e-08, + "loss": 0.1858, + "step": 11564 + }, + { + "epoch": 3.077434805747738, + "grad_norm": 0.290836364030838, + "learning_rate": 6.566929472397927e-08, + "loss": 0.1965, + "step": 11565 + }, + { + "epoch": 3.077700904736562, + "grad_norm": 0.33020347356796265, + "learning_rate": 6.565343317308171e-08, + "loss": 0.1818, + "step": 11566 + }, + { + "epoch": 3.077967003725386, + "grad_norm": 0.25758683681488037, + "learning_rate": 6.563757260181806e-08, + "loss": 0.181, + "step": 11567 + }, + { + "epoch": 3.0782331027142096, + "grad_norm": 0.34613168239593506, + "learning_rate": 6.562171301064063e-08, + "loss": 0.1808, + "step": 11568 + }, + { + "epoch": 3.0784992017030337, + "grad_norm": 0.2541366219520569, + "learning_rate": 6.560585440000186e-08, + "loss": 0.1653, + "step": 11569 + }, + { + "epoch": 3.0787653006918574, + "grad_norm": 0.36194610595703125, + "learning_rate": 6.558999677035397e-08, + "loss": 0.1752, + "step": 11570 + }, + { + "epoch": 3.079031399680681, + "grad_norm": 0.3331180512905121, + "learning_rate": 6.557414012214932e-08, + "loss": 0.1801, + "step": 11571 + }, + { + "epoch": 3.079297498669505, + "grad_norm": 0.4570329487323761, + "learning_rate": 6.555828445584013e-08, + "loss": 0.2163, + "step": 11572 + }, + { + "epoch": 3.079563597658329, + "grad_norm": 0.2805478870868683, + "learning_rate": 6.554242977187869e-08, + "loss": 0.1903, + "step": 11573 + }, + { + "epoch": 3.0798296966471526, + "grad_norm": 0.34786397218704224, + "learning_rate": 6.552657607071713e-08, + "loss": 0.1705, + "step": 11574 + }, + { + "epoch": 3.0800957956359767, + "grad_norm": 0.27200213074684143, + "learning_rate": 6.551072335280767e-08, + "loss": 0.1803, + "step": 11575 + }, + { + "epoch": 3.0803618946248004, + "grad_norm": 0.6733545660972595, + "learning_rate": 6.549487161860252e-08, + "loss": 0.173, + "step": 11576 + }, + { + "epoch": 3.080627993613624, + "grad_norm": 0.2687086760997772, + "learning_rate": 6.54790208685537e-08, + "loss": 0.1701, + "step": 11577 + }, + { + "epoch": 3.080894092602448, + "grad_norm": 0.607364296913147, + "learning_rate": 6.546317110311338e-08, + "loss": 0.1817, + "step": 11578 + }, + { + "epoch": 3.081160191591272, + "grad_norm": 0.31451958417892456, + "learning_rate": 6.544732232273358e-08, + "loss": 0.2011, + "step": 11579 + }, + { + "epoch": 3.081426290580096, + "grad_norm": 0.288559228181839, + "learning_rate": 6.543147452786642e-08, + "loss": 0.173, + "step": 11580 + }, + { + "epoch": 3.0816923895689197, + "grad_norm": 0.29714643955230713, + "learning_rate": 6.541562771896378e-08, + "loss": 0.1809, + "step": 11581 + }, + { + "epoch": 3.0819584885577433, + "grad_norm": 0.2496899962425232, + "learning_rate": 6.539978189647777e-08, + "loss": 0.173, + "step": 11582 + }, + { + "epoch": 3.0822245875465675, + "grad_norm": 0.3475358188152313, + "learning_rate": 6.538393706086025e-08, + "loss": 0.1944, + "step": 11583 + }, + { + "epoch": 3.082490686535391, + "grad_norm": 0.30644091963768005, + "learning_rate": 6.536809321256322e-08, + "loss": 0.1699, + "step": 11584 + }, + { + "epoch": 3.082756785524215, + "grad_norm": 0.27244603633880615, + "learning_rate": 6.535225035203854e-08, + "loss": 0.174, + "step": 11585 + }, + { + "epoch": 3.083022884513039, + "grad_norm": 0.34177419543266296, + "learning_rate": 6.533640847973808e-08, + "loss": 0.1974, + "step": 11586 + }, + { + "epoch": 3.0832889835018626, + "grad_norm": 0.35164710879325867, + "learning_rate": 6.532056759611367e-08, + "loss": 0.175, + "step": 11587 + }, + { + "epoch": 3.0835550824906863, + "grad_norm": 0.2517128586769104, + "learning_rate": 6.530472770161718e-08, + "loss": 0.1741, + "step": 11588 + }, + { + "epoch": 3.0838211814795105, + "grad_norm": 0.2984473407268524, + "learning_rate": 6.528888879670033e-08, + "loss": 0.1759, + "step": 11589 + }, + { + "epoch": 3.084087280468334, + "grad_norm": 0.3657985031604767, + "learning_rate": 6.527305088181495e-08, + "loss": 0.1635, + "step": 11590 + }, + { + "epoch": 3.0843533794571583, + "grad_norm": 0.28182241320610046, + "learning_rate": 6.525721395741269e-08, + "loss": 0.1893, + "step": 11591 + }, + { + "epoch": 3.084619478445982, + "grad_norm": 0.25014129281044006, + "learning_rate": 6.524137802394528e-08, + "loss": 0.1709, + "step": 11592 + }, + { + "epoch": 3.0848855774348056, + "grad_norm": 0.3409871757030487, + "learning_rate": 6.522554308186443e-08, + "loss": 0.1656, + "step": 11593 + }, + { + "epoch": 3.0851516764236298, + "grad_norm": 0.3593863248825073, + "learning_rate": 6.520970913162174e-08, + "loss": 0.1968, + "step": 11594 + }, + { + "epoch": 3.0854177754124534, + "grad_norm": 0.33951905369758606, + "learning_rate": 6.519387617366887e-08, + "loss": 0.1933, + "step": 11595 + }, + { + "epoch": 3.085683874401277, + "grad_norm": 0.2559398412704468, + "learning_rate": 6.517804420845735e-08, + "loss": 0.1684, + "step": 11596 + }, + { + "epoch": 3.0859499733901012, + "grad_norm": 0.25972187519073486, + "learning_rate": 6.51622132364388e-08, + "loss": 0.1771, + "step": 11597 + }, + { + "epoch": 3.086216072378925, + "grad_norm": 0.2764320373535156, + "learning_rate": 6.514638325806467e-08, + "loss": 0.1747, + "step": 11598 + }, + { + "epoch": 3.0864821713677486, + "grad_norm": 0.2898116409778595, + "learning_rate": 6.513055427378659e-08, + "loss": 0.1778, + "step": 11599 + }, + { + "epoch": 3.0867482703565727, + "grad_norm": 0.4475243091583252, + "learning_rate": 6.511472628405587e-08, + "loss": 0.1904, + "step": 11600 + }, + { + "epoch": 3.0870143693453964, + "grad_norm": 0.3954252302646637, + "learning_rate": 6.509889928932414e-08, + "loss": 0.1866, + "step": 11601 + }, + { + "epoch": 3.0872804683342205, + "grad_norm": 0.3701278269290924, + "learning_rate": 6.508307329004266e-08, + "loss": 0.1972, + "step": 11602 + }, + { + "epoch": 3.0875465673230442, + "grad_norm": 0.37298014760017395, + "learning_rate": 6.506724828666292e-08, + "loss": 0.1872, + "step": 11603 + }, + { + "epoch": 3.087812666311868, + "grad_norm": 0.39380019903182983, + "learning_rate": 6.505142427963621e-08, + "loss": 0.1835, + "step": 11604 + }, + { + "epoch": 3.088078765300692, + "grad_norm": 0.2798852324485779, + "learning_rate": 6.503560126941395e-08, + "loss": 0.1747, + "step": 11605 + }, + { + "epoch": 3.0883448642895157, + "grad_norm": 0.27111104130744934, + "learning_rate": 6.501977925644735e-08, + "loss": 0.1709, + "step": 11606 + }, + { + "epoch": 3.0886109632783394, + "grad_norm": 0.2949060797691345, + "learning_rate": 6.500395824118775e-08, + "loss": 0.1924, + "step": 11607 + }, + { + "epoch": 3.0888770622671635, + "grad_norm": 0.28076615929603577, + "learning_rate": 6.498813822408635e-08, + "loss": 0.1734, + "step": 11608 + }, + { + "epoch": 3.089143161255987, + "grad_norm": 0.2710563838481903, + "learning_rate": 6.497231920559444e-08, + "loss": 0.1796, + "step": 11609 + }, + { + "epoch": 3.089409260244811, + "grad_norm": 0.2821464240550995, + "learning_rate": 6.495650118616313e-08, + "loss": 0.1776, + "step": 11610 + }, + { + "epoch": 3.089675359233635, + "grad_norm": 0.2954208254814148, + "learning_rate": 6.494068416624361e-08, + "loss": 0.1857, + "step": 11611 + }, + { + "epoch": 3.0899414582224587, + "grad_norm": 0.4646206498146057, + "learning_rate": 6.492486814628704e-08, + "loss": 0.1951, + "step": 11612 + }, + { + "epoch": 3.090207557211283, + "grad_norm": 0.2806682884693146, + "learning_rate": 6.49090531267445e-08, + "loss": 0.189, + "step": 11613 + }, + { + "epoch": 3.0904736562001065, + "grad_norm": 0.2576977014541626, + "learning_rate": 6.489323910806709e-08, + "loss": 0.1666, + "step": 11614 + }, + { + "epoch": 3.09073975518893, + "grad_norm": 0.30121639370918274, + "learning_rate": 6.487742609070581e-08, + "loss": 0.1922, + "step": 11615 + }, + { + "epoch": 3.0910058541777543, + "grad_norm": 0.34948593378067017, + "learning_rate": 6.486161407511176e-08, + "loss": 0.2032, + "step": 11616 + }, + { + "epoch": 3.091271953166578, + "grad_norm": 0.4963858723640442, + "learning_rate": 6.484580306173583e-08, + "loss": 0.1829, + "step": 11617 + }, + { + "epoch": 3.0915380521554017, + "grad_norm": 0.37363165616989136, + "learning_rate": 6.482999305102907e-08, + "loss": 0.1888, + "step": 11618 + }, + { + "epoch": 3.091804151144226, + "grad_norm": 0.2641496956348419, + "learning_rate": 6.481418404344234e-08, + "loss": 0.1599, + "step": 11619 + }, + { + "epoch": 3.0920702501330495, + "grad_norm": 0.3542274832725525, + "learning_rate": 6.479837603942664e-08, + "loss": 0.1792, + "step": 11620 + }, + { + "epoch": 3.092336349121873, + "grad_norm": 0.26055827736854553, + "learning_rate": 6.478256903943274e-08, + "loss": 0.1629, + "step": 11621 + }, + { + "epoch": 3.0926024481106973, + "grad_norm": 0.36663326621055603, + "learning_rate": 6.476676304391155e-08, + "loss": 0.1851, + "step": 11622 + }, + { + "epoch": 3.092868547099521, + "grad_norm": 0.2791496515274048, + "learning_rate": 6.475095805331386e-08, + "loss": 0.1771, + "step": 11623 + }, + { + "epoch": 3.0931346460883447, + "grad_norm": 0.25346463918685913, + "learning_rate": 6.473515406809051e-08, + "loss": 0.1691, + "step": 11624 + }, + { + "epoch": 3.093400745077169, + "grad_norm": 0.33070629835128784, + "learning_rate": 6.471935108869217e-08, + "loss": 0.1826, + "step": 11625 + }, + { + "epoch": 3.0936668440659925, + "grad_norm": 0.28958675265312195, + "learning_rate": 6.47035491155697e-08, + "loss": 0.1692, + "step": 11626 + }, + { + "epoch": 3.0939329430548166, + "grad_norm": 0.3342282474040985, + "learning_rate": 6.468774814917369e-08, + "loss": 0.2049, + "step": 11627 + }, + { + "epoch": 3.0941990420436403, + "grad_norm": 0.33448266983032227, + "learning_rate": 6.467194818995486e-08, + "loss": 0.1823, + "step": 11628 + }, + { + "epoch": 3.094465141032464, + "grad_norm": 0.3592608869075775, + "learning_rate": 6.465614923836388e-08, + "loss": 0.176, + "step": 11629 + }, + { + "epoch": 3.094731240021288, + "grad_norm": 0.2957935631275177, + "learning_rate": 6.464035129485133e-08, + "loss": 0.1855, + "step": 11630 + }, + { + "epoch": 3.0949973390101118, + "grad_norm": 0.24898718297481537, + "learning_rate": 6.462455435986785e-08, + "loss": 0.1686, + "step": 11631 + }, + { + "epoch": 3.0952634379989354, + "grad_norm": 0.3134596049785614, + "learning_rate": 6.460875843386391e-08, + "loss": 0.1806, + "step": 11632 + }, + { + "epoch": 3.0955295369877596, + "grad_norm": 0.3941058814525604, + "learning_rate": 6.459296351729013e-08, + "loss": 0.1936, + "step": 11633 + }, + { + "epoch": 3.0957956359765832, + "grad_norm": 0.3200562000274658, + "learning_rate": 6.457716961059698e-08, + "loss": 0.1847, + "step": 11634 + }, + { + "epoch": 3.096061734965407, + "grad_norm": 0.26632025837898254, + "learning_rate": 6.456137671423496e-08, + "loss": 0.1788, + "step": 11635 + }, + { + "epoch": 3.096327833954231, + "grad_norm": 0.4238899350166321, + "learning_rate": 6.454558482865444e-08, + "loss": 0.18, + "step": 11636 + }, + { + "epoch": 3.0965939329430547, + "grad_norm": 0.29418104887008667, + "learning_rate": 6.452979395430591e-08, + "loss": 0.1874, + "step": 11637 + }, + { + "epoch": 3.096860031931879, + "grad_norm": 0.2559467554092407, + "learning_rate": 6.451400409163973e-08, + "loss": 0.1666, + "step": 11638 + }, + { + "epoch": 3.0971261309207025, + "grad_norm": 0.2602996230125427, + "learning_rate": 6.449821524110629e-08, + "loss": 0.1691, + "step": 11639 + }, + { + "epoch": 3.0973922299095262, + "grad_norm": 0.42500993609428406, + "learning_rate": 6.448242740315585e-08, + "loss": 0.183, + "step": 11640 + }, + { + "epoch": 3.0976583288983504, + "grad_norm": 0.26568809151649475, + "learning_rate": 6.44666405782388e-08, + "loss": 0.1709, + "step": 11641 + }, + { + "epoch": 3.097924427887174, + "grad_norm": 0.2556666135787964, + "learning_rate": 6.445085476680533e-08, + "loss": 0.1609, + "step": 11642 + }, + { + "epoch": 3.0981905268759977, + "grad_norm": 0.28445664048194885, + "learning_rate": 6.443506996930576e-08, + "loss": 0.1768, + "step": 11643 + }, + { + "epoch": 3.098456625864822, + "grad_norm": 0.34558483958244324, + "learning_rate": 6.441928618619022e-08, + "loss": 0.1704, + "step": 11644 + }, + { + "epoch": 3.0987227248536455, + "grad_norm": 0.2900159955024719, + "learning_rate": 6.4403503417909e-08, + "loss": 0.1934, + "step": 11645 + }, + { + "epoch": 3.098988823842469, + "grad_norm": 0.42548131942749023, + "learning_rate": 6.438772166491216e-08, + "loss": 0.1787, + "step": 11646 + }, + { + "epoch": 3.0992549228312933, + "grad_norm": 0.29439777135849, + "learning_rate": 6.437194092764985e-08, + "loss": 0.1617, + "step": 11647 + }, + { + "epoch": 3.099521021820117, + "grad_norm": 0.3362158238887787, + "learning_rate": 6.435616120657221e-08, + "loss": 0.178, + "step": 11648 + }, + { + "epoch": 3.0997871208089407, + "grad_norm": 0.2970612347126007, + "learning_rate": 6.434038250212926e-08, + "loss": 0.1798, + "step": 11649 + }, + { + "epoch": 3.100053219797765, + "grad_norm": 0.3501046597957611, + "learning_rate": 6.432460481477111e-08, + "loss": 0.1666, + "step": 11650 + }, + { + "epoch": 3.1003193187865885, + "grad_norm": 0.2699703574180603, + "learning_rate": 6.430882814494767e-08, + "loss": 0.1853, + "step": 11651 + }, + { + "epoch": 3.1005854177754126, + "grad_norm": 0.4240451455116272, + "learning_rate": 6.429305249310904e-08, + "loss": 0.1979, + "step": 11652 + }, + { + "epoch": 3.1008515167642363, + "grad_norm": 0.25617408752441406, + "learning_rate": 6.427727785970508e-08, + "loss": 0.1652, + "step": 11653 + }, + { + "epoch": 3.10111761575306, + "grad_norm": 0.276552677154541, + "learning_rate": 6.426150424518575e-08, + "loss": 0.176, + "step": 11654 + }, + { + "epoch": 3.101383714741884, + "grad_norm": 0.5533624887466431, + "learning_rate": 6.424573165000093e-08, + "loss": 0.1936, + "step": 11655 + }, + { + "epoch": 3.101649813730708, + "grad_norm": 0.2755781412124634, + "learning_rate": 6.422996007460053e-08, + "loss": 0.174, + "step": 11656 + }, + { + "epoch": 3.1019159127195315, + "grad_norm": 0.40405821800231934, + "learning_rate": 6.421418951943433e-08, + "loss": 0.1993, + "step": 11657 + }, + { + "epoch": 3.1021820117083556, + "grad_norm": 0.33252033591270447, + "learning_rate": 6.419841998495219e-08, + "loss": 0.1725, + "step": 11658 + }, + { + "epoch": 3.1024481106971793, + "grad_norm": 0.2571084797382355, + "learning_rate": 6.418265147160382e-08, + "loss": 0.1666, + "step": 11659 + }, + { + "epoch": 3.1027142096860034, + "grad_norm": 0.38267093896865845, + "learning_rate": 6.416688397983908e-08, + "loss": 0.1851, + "step": 11660 + }, + { + "epoch": 3.102980308674827, + "grad_norm": 0.3689437508583069, + "learning_rate": 6.415111751010758e-08, + "loss": 0.1878, + "step": 11661 + }, + { + "epoch": 3.103246407663651, + "grad_norm": 0.2969488501548767, + "learning_rate": 6.413535206285907e-08, + "loss": 0.1745, + "step": 11662 + }, + { + "epoch": 3.103512506652475, + "grad_norm": 0.2851848602294922, + "learning_rate": 6.411958763854318e-08, + "loss": 0.1771, + "step": 11663 + }, + { + "epoch": 3.1037786056412986, + "grad_norm": 0.26257213950157166, + "learning_rate": 6.410382423760959e-08, + "loss": 0.1705, + "step": 11664 + }, + { + "epoch": 3.1040447046301223, + "grad_norm": 0.2890278697013855, + "learning_rate": 6.408806186050785e-08, + "loss": 0.1843, + "step": 11665 + }, + { + "epoch": 3.1043108036189464, + "grad_norm": 0.2901405394077301, + "learning_rate": 6.407230050768756e-08, + "loss": 0.191, + "step": 11666 + }, + { + "epoch": 3.10457690260777, + "grad_norm": 0.30322834849357605, + "learning_rate": 6.405654017959832e-08, + "loss": 0.1828, + "step": 11667 + }, + { + "epoch": 3.1048430015965938, + "grad_norm": 0.28008443117141724, + "learning_rate": 6.404078087668954e-08, + "loss": 0.1773, + "step": 11668 + }, + { + "epoch": 3.105109100585418, + "grad_norm": 0.3929867148399353, + "learning_rate": 6.402502259941079e-08, + "loss": 0.1721, + "step": 11669 + }, + { + "epoch": 3.1053751995742416, + "grad_norm": 0.2849896550178528, + "learning_rate": 6.400926534821147e-08, + "loss": 0.1728, + "step": 11670 + }, + { + "epoch": 3.1056412985630653, + "grad_norm": 0.4261528253555298, + "learning_rate": 6.39935091235411e-08, + "loss": 0.1781, + "step": 11671 + }, + { + "epoch": 3.1059073975518894, + "grad_norm": 0.2805808186531067, + "learning_rate": 6.397775392584896e-08, + "loss": 0.1776, + "step": 11672 + }, + { + "epoch": 3.106173496540713, + "grad_norm": 0.3629963994026184, + "learning_rate": 6.396199975558451e-08, + "loss": 0.2066, + "step": 11673 + }, + { + "epoch": 3.106439595529537, + "grad_norm": 0.2875005006790161, + "learning_rate": 6.394624661319704e-08, + "loss": 0.1685, + "step": 11674 + }, + { + "epoch": 3.106705694518361, + "grad_norm": 0.28355348110198975, + "learning_rate": 6.393049449913592e-08, + "loss": 0.1763, + "step": 11675 + }, + { + "epoch": 3.1069717935071846, + "grad_norm": 0.25540855526924133, + "learning_rate": 6.391474341385033e-08, + "loss": 0.1619, + "step": 11676 + }, + { + "epoch": 3.1072378924960087, + "grad_norm": 0.3728851079940796, + "learning_rate": 6.389899335778964e-08, + "loss": 0.1755, + "step": 11677 + }, + { + "epoch": 3.1075039914848324, + "grad_norm": 0.3419608771800995, + "learning_rate": 6.388324433140298e-08, + "loss": 0.1817, + "step": 11678 + }, + { + "epoch": 3.107770090473656, + "grad_norm": 0.4453471302986145, + "learning_rate": 6.386749633513961e-08, + "loss": 0.1887, + "step": 11679 + }, + { + "epoch": 3.10803618946248, + "grad_norm": 0.37405410408973694, + "learning_rate": 6.385174936944866e-08, + "loss": 0.1937, + "step": 11680 + }, + { + "epoch": 3.108302288451304, + "grad_norm": 0.44319748878479004, + "learning_rate": 6.383600343477931e-08, + "loss": 0.1793, + "step": 11681 + }, + { + "epoch": 3.1085683874401275, + "grad_norm": 0.2900088429450989, + "learning_rate": 6.382025853158059e-08, + "loss": 0.1834, + "step": 11682 + }, + { + "epoch": 3.1088344864289517, + "grad_norm": 0.319386750459671, + "learning_rate": 6.380451466030161e-08, + "loss": 0.181, + "step": 11683 + }, + { + "epoch": 3.1091005854177753, + "grad_norm": 0.28910985589027405, + "learning_rate": 6.378877182139144e-08, + "loss": 0.1757, + "step": 11684 + }, + { + "epoch": 3.1093666844065995, + "grad_norm": 0.3168909251689911, + "learning_rate": 6.377303001529908e-08, + "loss": 0.1631, + "step": 11685 + }, + { + "epoch": 3.109632783395423, + "grad_norm": 0.26433756947517395, + "learning_rate": 6.375728924247354e-08, + "loss": 0.1673, + "step": 11686 + }, + { + "epoch": 3.109898882384247, + "grad_norm": 0.27405065298080444, + "learning_rate": 6.374154950336375e-08, + "loss": 0.1664, + "step": 11687 + }, + { + "epoch": 3.110164981373071, + "grad_norm": 0.2911308705806732, + "learning_rate": 6.372581079841864e-08, + "loss": 0.1834, + "step": 11688 + }, + { + "epoch": 3.1104310803618946, + "grad_norm": 0.4116620123386383, + "learning_rate": 6.371007312808711e-08, + "loss": 0.1975, + "step": 11689 + }, + { + "epoch": 3.1106971793507183, + "grad_norm": 0.2704771161079407, + "learning_rate": 6.369433649281805e-08, + "loss": 0.1819, + "step": 11690 + }, + { + "epoch": 3.1109632783395424, + "grad_norm": 0.2934158444404602, + "learning_rate": 6.367860089306027e-08, + "loss": 0.1641, + "step": 11691 + }, + { + "epoch": 3.111229377328366, + "grad_norm": 0.32223933935165405, + "learning_rate": 6.366286632926265e-08, + "loss": 0.1987, + "step": 11692 + }, + { + "epoch": 3.11149547631719, + "grad_norm": 0.27475616335868835, + "learning_rate": 6.364713280187389e-08, + "loss": 0.1813, + "step": 11693 + }, + { + "epoch": 3.111761575306014, + "grad_norm": 0.3573529124259949, + "learning_rate": 6.363140031134279e-08, + "loss": 0.1722, + "step": 11694 + }, + { + "epoch": 3.1120276742948376, + "grad_norm": 0.2605508863925934, + "learning_rate": 6.361566885811803e-08, + "loss": 0.1627, + "step": 11695 + }, + { + "epoch": 3.1122937732836613, + "grad_norm": 0.2575124502182007, + "learning_rate": 6.359993844264837e-08, + "loss": 0.1549, + "step": 11696 + }, + { + "epoch": 3.1125598722724854, + "grad_norm": 0.2896195650100708, + "learning_rate": 6.358420906538239e-08, + "loss": 0.1799, + "step": 11697 + }, + { + "epoch": 3.112825971261309, + "grad_norm": 0.3916361927986145, + "learning_rate": 6.356848072676881e-08, + "loss": 0.1789, + "step": 11698 + }, + { + "epoch": 3.1130920702501332, + "grad_norm": 0.37382790446281433, + "learning_rate": 6.355275342725617e-08, + "loss": 0.1641, + "step": 11699 + }, + { + "epoch": 3.113358169238957, + "grad_norm": 0.24136541783809662, + "learning_rate": 6.353702716729308e-08, + "loss": 0.1507, + "step": 11700 + }, + { + "epoch": 3.1136242682277806, + "grad_norm": 0.4388786554336548, + "learning_rate": 6.352130194732804e-08, + "loss": 0.202, + "step": 11701 + }, + { + "epoch": 3.1138903672166047, + "grad_norm": 0.27550816535949707, + "learning_rate": 6.35055777678096e-08, + "loss": 0.1715, + "step": 11702 + }, + { + "epoch": 3.1141564662054284, + "grad_norm": 0.37561026215553284, + "learning_rate": 6.348985462918629e-08, + "loss": 0.1782, + "step": 11703 + }, + { + "epoch": 3.114422565194252, + "grad_norm": 0.33926713466644287, + "learning_rate": 6.347413253190648e-08, + "loss": 0.1755, + "step": 11704 + }, + { + "epoch": 3.114688664183076, + "grad_norm": 0.2729485034942627, + "learning_rate": 6.345841147641864e-08, + "loss": 0.1821, + "step": 11705 + }, + { + "epoch": 3.1149547631719, + "grad_norm": 0.24802662432193756, + "learning_rate": 6.344269146317114e-08, + "loss": 0.1575, + "step": 11706 + }, + { + "epoch": 3.1152208621607236, + "grad_norm": 0.46630334854125977, + "learning_rate": 6.342697249261241e-08, + "loss": 0.211, + "step": 11707 + }, + { + "epoch": 3.1154869611495477, + "grad_norm": 0.32730087637901306, + "learning_rate": 6.341125456519072e-08, + "loss": 0.1806, + "step": 11708 + }, + { + "epoch": 3.1157530601383714, + "grad_norm": 0.349911630153656, + "learning_rate": 6.339553768135441e-08, + "loss": 0.1959, + "step": 11709 + }, + { + "epoch": 3.1160191591271955, + "grad_norm": 0.28156012296676636, + "learning_rate": 6.337982184155175e-08, + "loss": 0.1639, + "step": 11710 + }, + { + "epoch": 3.116285258116019, + "grad_norm": 0.3386182188987732, + "learning_rate": 6.3364107046231e-08, + "loss": 0.1907, + "step": 11711 + }, + { + "epoch": 3.116551357104843, + "grad_norm": 0.2587343752384186, + "learning_rate": 6.334839329584033e-08, + "loss": 0.1709, + "step": 11712 + }, + { + "epoch": 3.116817456093667, + "grad_norm": 0.4591147303581238, + "learning_rate": 6.333268059082799e-08, + "loss": 0.1884, + "step": 11713 + }, + { + "epoch": 3.1170835550824907, + "grad_norm": 0.2887817323207855, + "learning_rate": 6.331696893164209e-08, + "loss": 0.1782, + "step": 11714 + }, + { + "epoch": 3.1173496540713144, + "grad_norm": 0.2770932614803314, + "learning_rate": 6.330125831873078e-08, + "loss": 0.1787, + "step": 11715 + }, + { + "epoch": 3.1176157530601385, + "grad_norm": 0.26276126503944397, + "learning_rate": 6.328554875254218e-08, + "loss": 0.1767, + "step": 11716 + }, + { + "epoch": 3.117881852048962, + "grad_norm": 0.2721385061740875, + "learning_rate": 6.326984023352434e-08, + "loss": 0.1701, + "step": 11717 + }, + { + "epoch": 3.118147951037786, + "grad_norm": 0.29422497749328613, + "learning_rate": 6.325413276212527e-08, + "loss": 0.1653, + "step": 11718 + }, + { + "epoch": 3.11841405002661, + "grad_norm": 0.3510163426399231, + "learning_rate": 6.323842633879302e-08, + "loss": 0.2019, + "step": 11719 + }, + { + "epoch": 3.1186801490154337, + "grad_norm": 0.27223241329193115, + "learning_rate": 6.322272096397553e-08, + "loss": 0.1709, + "step": 11720 + }, + { + "epoch": 3.118946248004258, + "grad_norm": 0.3425898849964142, + "learning_rate": 6.320701663812078e-08, + "loss": 0.197, + "step": 11721 + }, + { + "epoch": 3.1192123469930815, + "grad_norm": 0.3734324276447296, + "learning_rate": 6.319131336167674e-08, + "loss": 0.1869, + "step": 11722 + }, + { + "epoch": 3.119478445981905, + "grad_norm": 0.3356291353702545, + "learning_rate": 6.317561113509118e-08, + "loss": 0.1996, + "step": 11723 + }, + { + "epoch": 3.1197445449707293, + "grad_norm": 0.2628432810306549, + "learning_rate": 6.315990995881204e-08, + "loss": 0.1616, + "step": 11724 + }, + { + "epoch": 3.120010643959553, + "grad_norm": 0.2691972255706787, + "learning_rate": 6.314420983328711e-08, + "loss": 0.1796, + "step": 11725 + }, + { + "epoch": 3.1202767429483766, + "grad_norm": 0.2685088515281677, + "learning_rate": 6.312851075896426e-08, + "loss": 0.1776, + "step": 11726 + }, + { + "epoch": 3.1205428419372008, + "grad_norm": 0.27314016222953796, + "learning_rate": 6.311281273629114e-08, + "loss": 0.1833, + "step": 11727 + }, + { + "epoch": 3.1208089409260245, + "grad_norm": 0.28028783202171326, + "learning_rate": 6.309711576571563e-08, + "loss": 0.1778, + "step": 11728 + }, + { + "epoch": 3.121075039914848, + "grad_norm": 0.3682609498500824, + "learning_rate": 6.308141984768533e-08, + "loss": 0.2005, + "step": 11729 + }, + { + "epoch": 3.1213411389036723, + "grad_norm": 0.2831803560256958, + "learning_rate": 6.306572498264797e-08, + "loss": 0.1743, + "step": 11730 + }, + { + "epoch": 3.121607237892496, + "grad_norm": 0.40425583720207214, + "learning_rate": 6.305003117105116e-08, + "loss": 0.189, + "step": 11731 + }, + { + "epoch": 3.12187333688132, + "grad_norm": 0.39949384331703186, + "learning_rate": 6.30343384133426e-08, + "loss": 0.1845, + "step": 11732 + }, + { + "epoch": 3.1221394358701438, + "grad_norm": 0.2715432345867157, + "learning_rate": 6.301864670996978e-08, + "loss": 0.1777, + "step": 11733 + }, + { + "epoch": 3.1224055348589674, + "grad_norm": 0.2681594789028168, + "learning_rate": 6.300295606138033e-08, + "loss": 0.1877, + "step": 11734 + }, + { + "epoch": 3.1226716338477916, + "grad_norm": 0.26425257325172424, + "learning_rate": 6.298726646802172e-08, + "loss": 0.1628, + "step": 11735 + }, + { + "epoch": 3.1229377328366152, + "grad_norm": 0.2965041697025299, + "learning_rate": 6.297157793034154e-08, + "loss": 0.1708, + "step": 11736 + }, + { + "epoch": 3.123203831825439, + "grad_norm": 0.3145304322242737, + "learning_rate": 6.295589044878717e-08, + "loss": 0.1653, + "step": 11737 + }, + { + "epoch": 3.123469930814263, + "grad_norm": 0.3210284113883972, + "learning_rate": 6.294020402380606e-08, + "loss": 0.1718, + "step": 11738 + }, + { + "epoch": 3.1237360298030867, + "grad_norm": 0.27557411789894104, + "learning_rate": 6.292451865584567e-08, + "loss": 0.1563, + "step": 11739 + }, + { + "epoch": 3.1240021287919104, + "grad_norm": 0.3117118775844574, + "learning_rate": 6.290883434535334e-08, + "loss": 0.1711, + "step": 11740 + }, + { + "epoch": 3.1242682277807345, + "grad_norm": 0.35706162452697754, + "learning_rate": 6.289315109277643e-08, + "loss": 0.1745, + "step": 11741 + }, + { + "epoch": 3.1245343267695582, + "grad_norm": 0.2784915864467621, + "learning_rate": 6.287746889856224e-08, + "loss": 0.1643, + "step": 11742 + }, + { + "epoch": 3.124800425758382, + "grad_norm": 0.28935468196868896, + "learning_rate": 6.286178776315811e-08, + "loss": 0.1818, + "step": 11743 + }, + { + "epoch": 3.125066524747206, + "grad_norm": 0.2746674120426178, + "learning_rate": 6.284610768701124e-08, + "loss": 0.1744, + "step": 11744 + }, + { + "epoch": 3.1253326237360297, + "grad_norm": 0.2911396026611328, + "learning_rate": 6.28304286705689e-08, + "loss": 0.17, + "step": 11745 + }, + { + "epoch": 3.125598722724854, + "grad_norm": 0.3839265704154968, + "learning_rate": 6.281475071427822e-08, + "loss": 0.1729, + "step": 11746 + }, + { + "epoch": 3.1258648217136775, + "grad_norm": 0.2874300181865692, + "learning_rate": 6.279907381858646e-08, + "loss": 0.186, + "step": 11747 + }, + { + "epoch": 3.126130920702501, + "grad_norm": 0.272319495677948, + "learning_rate": 6.278339798394068e-08, + "loss": 0.1767, + "step": 11748 + }, + { + "epoch": 3.1263970196913253, + "grad_norm": 0.275907039642334, + "learning_rate": 6.276772321078804e-08, + "loss": 0.175, + "step": 11749 + }, + { + "epoch": 3.126663118680149, + "grad_norm": 0.31525862216949463, + "learning_rate": 6.275204949957557e-08, + "loss": 0.1765, + "step": 11750 + }, + { + "epoch": 3.1269292176689727, + "grad_norm": 0.2965959310531616, + "learning_rate": 6.273637685075038e-08, + "loss": 0.1908, + "step": 11751 + }, + { + "epoch": 3.127195316657797, + "grad_norm": 0.2575252652168274, + "learning_rate": 6.272070526475938e-08, + "loss": 0.1611, + "step": 11752 + }, + { + "epoch": 3.1274614156466205, + "grad_norm": 0.2719392776489258, + "learning_rate": 6.27050347420497e-08, + "loss": 0.174, + "step": 11753 + }, + { + "epoch": 3.127727514635444, + "grad_norm": 0.2577155828475952, + "learning_rate": 6.268936528306814e-08, + "loss": 0.164, + "step": 11754 + }, + { + "epoch": 3.1279936136242683, + "grad_norm": 0.5044887065887451, + "learning_rate": 6.267369688826174e-08, + "loss": 0.188, + "step": 11755 + }, + { + "epoch": 3.128259712613092, + "grad_norm": 0.4423399865627289, + "learning_rate": 6.265802955807734e-08, + "loss": 0.2095, + "step": 11756 + }, + { + "epoch": 3.128525811601916, + "grad_norm": 0.2666386067867279, + "learning_rate": 6.264236329296182e-08, + "loss": 0.1741, + "step": 11757 + }, + { + "epoch": 3.12879191059074, + "grad_norm": 0.35542061924934387, + "learning_rate": 6.262669809336205e-08, + "loss": 0.1518, + "step": 11758 + }, + { + "epoch": 3.1290580095795635, + "grad_norm": 0.26693522930145264, + "learning_rate": 6.261103395972476e-08, + "loss": 0.1727, + "step": 11759 + }, + { + "epoch": 3.1293241085683876, + "grad_norm": 0.3711314797401428, + "learning_rate": 6.259537089249678e-08, + "loss": 0.1878, + "step": 11760 + }, + { + "epoch": 3.1295902075572113, + "grad_norm": 0.24813462793827057, + "learning_rate": 6.257970889212481e-08, + "loss": 0.1687, + "step": 11761 + }, + { + "epoch": 3.129856306546035, + "grad_norm": 0.49592870473861694, + "learning_rate": 6.256404795905561e-08, + "loss": 0.1941, + "step": 11762 + }, + { + "epoch": 3.130122405534859, + "grad_norm": 0.26758015155792236, + "learning_rate": 6.254838809373582e-08, + "loss": 0.1739, + "step": 11763 + }, + { + "epoch": 3.130388504523683, + "grad_norm": 0.2975725829601288, + "learning_rate": 6.253272929661211e-08, + "loss": 0.1704, + "step": 11764 + }, + { + "epoch": 3.1306546035125065, + "grad_norm": 0.27095240354537964, + "learning_rate": 6.251707156813108e-08, + "loss": 0.1745, + "step": 11765 + }, + { + "epoch": 3.1309207025013306, + "grad_norm": 0.3053513765335083, + "learning_rate": 6.250141490873935e-08, + "loss": 0.1918, + "step": 11766 + }, + { + "epoch": 3.1311868014901543, + "grad_norm": 0.3028111457824707, + "learning_rate": 6.248575931888347e-08, + "loss": 0.1907, + "step": 11767 + }, + { + "epoch": 3.131452900478978, + "grad_norm": 0.3511403799057007, + "learning_rate": 6.247010479900999e-08, + "loss": 0.1681, + "step": 11768 + }, + { + "epoch": 3.131718999467802, + "grad_norm": 0.3383048474788666, + "learning_rate": 6.245445134956536e-08, + "loss": 0.168, + "step": 11769 + }, + { + "epoch": 3.1319850984566258, + "grad_norm": 0.30024364590644836, + "learning_rate": 6.243879897099606e-08, + "loss": 0.177, + "step": 11770 + }, + { + "epoch": 3.13225119744545, + "grad_norm": 0.36638298630714417, + "learning_rate": 6.242314766374855e-08, + "loss": 0.194, + "step": 11771 + }, + { + "epoch": 3.1325172964342736, + "grad_norm": 0.42390701174736023, + "learning_rate": 6.240749742826927e-08, + "loss": 0.1909, + "step": 11772 + }, + { + "epoch": 3.1327833954230973, + "grad_norm": 0.2565818428993225, + "learning_rate": 6.239184826500451e-08, + "loss": 0.166, + "step": 11773 + }, + { + "epoch": 3.1330494944119214, + "grad_norm": 0.35377249121665955, + "learning_rate": 6.237620017440068e-08, + "loss": 0.1555, + "step": 11774 + }, + { + "epoch": 3.133315593400745, + "grad_norm": 0.27787888050079346, + "learning_rate": 6.236055315690404e-08, + "loss": 0.1918, + "step": 11775 + }, + { + "epoch": 3.1335816923895687, + "grad_norm": 0.34665000438690186, + "learning_rate": 6.234490721296094e-08, + "loss": 0.1792, + "step": 11776 + }, + { + "epoch": 3.133847791378393, + "grad_norm": 0.2761495113372803, + "learning_rate": 6.232926234301762e-08, + "loss": 0.1728, + "step": 11777 + }, + { + "epoch": 3.1341138903672165, + "grad_norm": 0.2704196274280548, + "learning_rate": 6.231361854752025e-08, + "loss": 0.1684, + "step": 11778 + }, + { + "epoch": 3.1343799893560407, + "grad_norm": 0.45207640528678894, + "learning_rate": 6.229797582691512e-08, + "loss": 0.1854, + "step": 11779 + }, + { + "epoch": 3.1346460883448644, + "grad_norm": 0.25879916548728943, + "learning_rate": 6.228233418164828e-08, + "loss": 0.1751, + "step": 11780 + }, + { + "epoch": 3.134912187333688, + "grad_norm": 0.2813839316368103, + "learning_rate": 6.226669361216594e-08, + "loss": 0.1733, + "step": 11781 + }, + { + "epoch": 3.135178286322512, + "grad_norm": 0.28637176752090454, + "learning_rate": 6.225105411891416e-08, + "loss": 0.1885, + "step": 11782 + }, + { + "epoch": 3.135444385311336, + "grad_norm": 0.3388189375400543, + "learning_rate": 6.223541570233906e-08, + "loss": 0.1807, + "step": 11783 + }, + { + "epoch": 3.1357104843001595, + "grad_norm": 0.33139047026634216, + "learning_rate": 6.221977836288661e-08, + "loss": 0.1845, + "step": 11784 + }, + { + "epoch": 3.1359765832889837, + "grad_norm": 0.29043349623680115, + "learning_rate": 6.220414210100286e-08, + "loss": 0.1811, + "step": 11785 + }, + { + "epoch": 3.1362426822778073, + "grad_norm": 0.2685903012752533, + "learning_rate": 6.218850691713377e-08, + "loss": 0.1832, + "step": 11786 + }, + { + "epoch": 3.136508781266631, + "grad_norm": 0.3271641731262207, + "learning_rate": 6.217287281172537e-08, + "loss": 0.1895, + "step": 11787 + }, + { + "epoch": 3.136774880255455, + "grad_norm": 0.34912946820259094, + "learning_rate": 6.215723978522343e-08, + "loss": 0.1723, + "step": 11788 + }, + { + "epoch": 3.137040979244279, + "grad_norm": 0.3522581160068512, + "learning_rate": 6.214160783807394e-08, + "loss": 0.2006, + "step": 11789 + }, + { + "epoch": 3.1373070782331025, + "grad_norm": 0.34237536787986755, + "learning_rate": 6.212597697072271e-08, + "loss": 0.1731, + "step": 11790 + }, + { + "epoch": 3.1375731772219266, + "grad_norm": 0.3092157244682312, + "learning_rate": 6.211034718361559e-08, + "loss": 0.1864, + "step": 11791 + }, + { + "epoch": 3.1378392762107503, + "grad_norm": 0.2652761936187744, + "learning_rate": 6.209471847719835e-08, + "loss": 0.1736, + "step": 11792 + }, + { + "epoch": 3.1381053751995744, + "grad_norm": 0.375948965549469, + "learning_rate": 6.207909085191677e-08, + "loss": 0.1759, + "step": 11793 + }, + { + "epoch": 3.138371474188398, + "grad_norm": 0.2720886766910553, + "learning_rate": 6.206346430821661e-08, + "loss": 0.1704, + "step": 11794 + }, + { + "epoch": 3.138637573177222, + "grad_norm": 0.275339275598526, + "learning_rate": 6.20478388465435e-08, + "loss": 0.1765, + "step": 11795 + }, + { + "epoch": 3.138903672166046, + "grad_norm": 0.2460118532180786, + "learning_rate": 6.203221446734318e-08, + "loss": 0.1649, + "step": 11796 + }, + { + "epoch": 3.1391697711548696, + "grad_norm": 0.27163103222846985, + "learning_rate": 6.201659117106123e-08, + "loss": 0.1725, + "step": 11797 + }, + { + "epoch": 3.1394358701436933, + "grad_norm": 0.26162540912628174, + "learning_rate": 6.200096895814333e-08, + "loss": 0.1691, + "step": 11798 + }, + { + "epoch": 3.1397019691325174, + "grad_norm": 0.2917788326740265, + "learning_rate": 6.198534782903498e-08, + "loss": 0.1848, + "step": 11799 + }, + { + "epoch": 3.139968068121341, + "grad_norm": 0.35273462533950806, + "learning_rate": 6.196972778418176e-08, + "loss": 0.1996, + "step": 11800 + }, + { + "epoch": 3.140234167110165, + "grad_norm": 0.4871094822883606, + "learning_rate": 6.195410882402917e-08, + "loss": 0.1772, + "step": 11801 + }, + { + "epoch": 3.140500266098989, + "grad_norm": 0.34212514758110046, + "learning_rate": 6.193849094902275e-08, + "loss": 0.1869, + "step": 11802 + }, + { + "epoch": 3.1407663650878126, + "grad_norm": 0.2719171941280365, + "learning_rate": 6.192287415960786e-08, + "loss": 0.1778, + "step": 11803 + }, + { + "epoch": 3.1410324640766367, + "grad_norm": 0.31095126271247864, + "learning_rate": 6.190725845623003e-08, + "loss": 0.1773, + "step": 11804 + }, + { + "epoch": 3.1412985630654604, + "grad_norm": 0.27268511056900024, + "learning_rate": 6.189164383933457e-08, + "loss": 0.1723, + "step": 11805 + }, + { + "epoch": 3.141564662054284, + "grad_norm": 0.33228379487991333, + "learning_rate": 6.187603030936685e-08, + "loss": 0.2, + "step": 11806 + }, + { + "epoch": 3.141830761043108, + "grad_norm": 0.3692718744277954, + "learning_rate": 6.186041786677223e-08, + "loss": 0.1796, + "step": 11807 + }, + { + "epoch": 3.142096860031932, + "grad_norm": 0.28738901019096375, + "learning_rate": 6.184480651199603e-08, + "loss": 0.1705, + "step": 11808 + }, + { + "epoch": 3.1423629590207556, + "grad_norm": 0.36443886160850525, + "learning_rate": 6.182919624548344e-08, + "loss": 0.1943, + "step": 11809 + }, + { + "epoch": 3.1426290580095797, + "grad_norm": 0.282056599855423, + "learning_rate": 6.181358706767976e-08, + "loss": 0.1754, + "step": 11810 + }, + { + "epoch": 3.1428951569984034, + "grad_norm": 0.3490464985370636, + "learning_rate": 6.179797897903016e-08, + "loss": 0.1575, + "step": 11811 + }, + { + "epoch": 3.143161255987227, + "grad_norm": 0.2728011906147003, + "learning_rate": 6.178237197997984e-08, + "loss": 0.1654, + "step": 11812 + }, + { + "epoch": 3.143427354976051, + "grad_norm": 0.2657942771911621, + "learning_rate": 6.176676607097395e-08, + "loss": 0.1484, + "step": 11813 + }, + { + "epoch": 3.143693453964875, + "grad_norm": 0.26059404015541077, + "learning_rate": 6.175116125245756e-08, + "loss": 0.162, + "step": 11814 + }, + { + "epoch": 3.1439595529536986, + "grad_norm": 0.28274571895599365, + "learning_rate": 6.173555752487578e-08, + "loss": 0.1786, + "step": 11815 + }, + { + "epoch": 3.1442256519425227, + "grad_norm": 0.2618149220943451, + "learning_rate": 6.171995488867365e-08, + "loss": 0.1475, + "step": 11816 + }, + { + "epoch": 3.1444917509313464, + "grad_norm": 0.3362545371055603, + "learning_rate": 6.170435334429623e-08, + "loss": 0.1741, + "step": 11817 + }, + { + "epoch": 3.1447578499201705, + "grad_norm": 0.31659919023513794, + "learning_rate": 6.168875289218842e-08, + "loss": 0.1928, + "step": 11818 + }, + { + "epoch": 3.145023948908994, + "grad_norm": 0.3591203987598419, + "learning_rate": 6.16731535327953e-08, + "loss": 0.1782, + "step": 11819 + }, + { + "epoch": 3.145290047897818, + "grad_norm": 0.31530073285102844, + "learning_rate": 6.165755526656166e-08, + "loss": 0.166, + "step": 11820 + }, + { + "epoch": 3.145556146886642, + "grad_norm": 0.2683168649673462, + "learning_rate": 6.164195809393248e-08, + "loss": 0.1921, + "step": 11821 + }, + { + "epoch": 3.1458222458754657, + "grad_norm": 0.29282864928245544, + "learning_rate": 6.16263620153526e-08, + "loss": 0.1889, + "step": 11822 + }, + { + "epoch": 3.1460883448642893, + "grad_norm": 0.27429842948913574, + "learning_rate": 6.161076703126688e-08, + "loss": 0.1639, + "step": 11823 + }, + { + "epoch": 3.1463544438531135, + "grad_norm": 0.2754949629306793, + "learning_rate": 6.159517314212008e-08, + "loss": 0.1662, + "step": 11824 + }, + { + "epoch": 3.146620542841937, + "grad_norm": 0.274797260761261, + "learning_rate": 6.157958034835698e-08, + "loss": 0.1831, + "step": 11825 + }, + { + "epoch": 3.1468866418307613, + "grad_norm": 0.28264304995536804, + "learning_rate": 6.156398865042231e-08, + "loss": 0.1917, + "step": 11826 + }, + { + "epoch": 3.147152740819585, + "grad_norm": 0.3808365762233734, + "learning_rate": 6.154839804876083e-08, + "loss": 0.1903, + "step": 11827 + }, + { + "epoch": 3.1474188398084086, + "grad_norm": 0.3480012118816376, + "learning_rate": 6.15328085438171e-08, + "loss": 0.1732, + "step": 11828 + }, + { + "epoch": 3.1476849387972328, + "grad_norm": 0.327425479888916, + "learning_rate": 6.151722013603589e-08, + "loss": 0.1929, + "step": 11829 + }, + { + "epoch": 3.1479510377860564, + "grad_norm": 0.26215165853500366, + "learning_rate": 6.150163282586179e-08, + "loss": 0.1539, + "step": 11830 + }, + { + "epoch": 3.14821713677488, + "grad_norm": 0.26598238945007324, + "learning_rate": 6.14860466137393e-08, + "loss": 0.1641, + "step": 11831 + }, + { + "epoch": 3.1484832357637043, + "grad_norm": 0.3596375286579132, + "learning_rate": 6.147046150011305e-08, + "loss": 0.1535, + "step": 11832 + }, + { + "epoch": 3.148749334752528, + "grad_norm": 0.3598477244377136, + "learning_rate": 6.145487748542752e-08, + "loss": 0.179, + "step": 11833 + }, + { + "epoch": 3.1490154337413516, + "grad_norm": 0.29264017939567566, + "learning_rate": 6.143929457012726e-08, + "loss": 0.1864, + "step": 11834 + }, + { + "epoch": 3.1492815327301757, + "grad_norm": 0.2756272852420807, + "learning_rate": 6.142371275465661e-08, + "loss": 0.1787, + "step": 11835 + }, + { + "epoch": 3.1495476317189994, + "grad_norm": 0.2745903432369232, + "learning_rate": 6.140813203946012e-08, + "loss": 0.1717, + "step": 11836 + }, + { + "epoch": 3.149813730707823, + "grad_norm": 0.41323646903038025, + "learning_rate": 6.139255242498208e-08, + "loss": 0.187, + "step": 11837 + }, + { + "epoch": 3.1500798296966472, + "grad_norm": 0.24904702603816986, + "learning_rate": 6.137697391166694e-08, + "loss": 0.1587, + "step": 11838 + }, + { + "epoch": 3.150345928685471, + "grad_norm": 0.25851261615753174, + "learning_rate": 6.136139649995896e-08, + "loss": 0.1679, + "step": 11839 + }, + { + "epoch": 3.1506120276742946, + "grad_norm": 0.40504348278045654, + "learning_rate": 6.134582019030249e-08, + "loss": 0.1843, + "step": 11840 + }, + { + "epoch": 3.1508781266631187, + "grad_norm": 0.3370211124420166, + "learning_rate": 6.133024498314174e-08, + "loss": 0.193, + "step": 11841 + }, + { + "epoch": 3.1511442256519424, + "grad_norm": 0.3134459853172302, + "learning_rate": 6.131467087892102e-08, + "loss": 0.1871, + "step": 11842 + }, + { + "epoch": 3.1514103246407665, + "grad_norm": 0.36258193850517273, + "learning_rate": 6.129909787808448e-08, + "loss": 0.1982, + "step": 11843 + }, + { + "epoch": 3.15167642362959, + "grad_norm": 0.35292720794677734, + "learning_rate": 6.128352598107635e-08, + "loss": 0.1816, + "step": 11844 + }, + { + "epoch": 3.151942522618414, + "grad_norm": 0.6492853164672852, + "learning_rate": 6.126795518834069e-08, + "loss": 0.1547, + "step": 11845 + }, + { + "epoch": 3.152208621607238, + "grad_norm": 2.072326183319092, + "learning_rate": 6.125238550032168e-08, + "loss": 0.183, + "step": 11846 + }, + { + "epoch": 3.1524747205960617, + "grad_norm": 0.27150464057922363, + "learning_rate": 6.123681691746335e-08, + "loss": 0.1817, + "step": 11847 + }, + { + "epoch": 3.1527408195848854, + "grad_norm": 0.3066323697566986, + "learning_rate": 6.122124944020977e-08, + "loss": 0.1574, + "step": 11848 + }, + { + "epoch": 3.1530069185737095, + "grad_norm": 0.3181384801864624, + "learning_rate": 6.1205683069005e-08, + "loss": 0.1732, + "step": 11849 + }, + { + "epoch": 3.153273017562533, + "grad_norm": 0.2716754674911499, + "learning_rate": 6.119011780429296e-08, + "loss": 0.1721, + "step": 11850 + }, + { + "epoch": 3.1535391165513573, + "grad_norm": 0.2751457095146179, + "learning_rate": 6.117455364651763e-08, + "loss": 0.1715, + "step": 11851 + }, + { + "epoch": 3.153805215540181, + "grad_norm": 0.26036766171455383, + "learning_rate": 6.11589905961229e-08, + "loss": 0.1636, + "step": 11852 + }, + { + "epoch": 3.1540713145290047, + "grad_norm": 0.2755800485610962, + "learning_rate": 6.114342865355273e-08, + "loss": 0.1712, + "step": 11853 + }, + { + "epoch": 3.154337413517829, + "grad_norm": 0.3536057770252228, + "learning_rate": 6.112786781925087e-08, + "loss": 0.1959, + "step": 11854 + }, + { + "epoch": 3.1546035125066525, + "grad_norm": 0.2790689766407013, + "learning_rate": 6.111230809366131e-08, + "loss": 0.1722, + "step": 11855 + }, + { + "epoch": 3.154869611495476, + "grad_norm": 0.2902163565158844, + "learning_rate": 6.109674947722768e-08, + "loss": 0.1811, + "step": 11856 + }, + { + "epoch": 3.1551357104843003, + "grad_norm": 0.3816221058368683, + "learning_rate": 6.108119197039381e-08, + "loss": 0.1723, + "step": 11857 + }, + { + "epoch": 3.155401809473124, + "grad_norm": 0.2952605187892914, + "learning_rate": 6.106563557360342e-08, + "loss": 0.186, + "step": 11858 + }, + { + "epoch": 3.1556679084619477, + "grad_norm": 0.33536574244499207, + "learning_rate": 6.105008028730028e-08, + "loss": 0.1799, + "step": 11859 + }, + { + "epoch": 3.155934007450772, + "grad_norm": 0.2507496774196625, + "learning_rate": 6.103452611192797e-08, + "loss": 0.1581, + "step": 11860 + }, + { + "epoch": 3.1562001064395955, + "grad_norm": 0.3493976294994354, + "learning_rate": 6.101897304793015e-08, + "loss": 0.1935, + "step": 11861 + }, + { + "epoch": 3.156466205428419, + "grad_norm": 0.3326808512210846, + "learning_rate": 6.100342109575042e-08, + "loss": 0.17, + "step": 11862 + }, + { + "epoch": 3.1567323044172433, + "grad_norm": 0.3779498040676117, + "learning_rate": 6.098787025583241e-08, + "loss": 0.1899, + "step": 11863 + }, + { + "epoch": 3.156998403406067, + "grad_norm": 0.5835388898849487, + "learning_rate": 6.097232052861956e-08, + "loss": 0.1741, + "step": 11864 + }, + { + "epoch": 3.157264502394891, + "grad_norm": 0.26820001006126404, + "learning_rate": 6.095677191455549e-08, + "loss": 0.181, + "step": 11865 + }, + { + "epoch": 3.1575306013837148, + "grad_norm": 0.2751956880092621, + "learning_rate": 6.094122441408357e-08, + "loss": 0.1662, + "step": 11866 + }, + { + "epoch": 3.1577967003725385, + "grad_norm": 0.29316461086273193, + "learning_rate": 6.092567802764732e-08, + "loss": 0.184, + "step": 11867 + }, + { + "epoch": 3.1580627993613626, + "grad_norm": 0.27174636721611023, + "learning_rate": 6.091013275569014e-08, + "loss": 0.1803, + "step": 11868 + }, + { + "epoch": 3.1583288983501863, + "grad_norm": 0.30243566632270813, + "learning_rate": 6.08945885986554e-08, + "loss": 0.18, + "step": 11869 + }, + { + "epoch": 3.15859499733901, + "grad_norm": 0.3551551401615143, + "learning_rate": 6.08790455569865e-08, + "loss": 0.1772, + "step": 11870 + }, + { + "epoch": 3.158861096327834, + "grad_norm": 0.31014540791511536, + "learning_rate": 6.086350363112668e-08, + "loss": 0.1863, + "step": 11871 + }, + { + "epoch": 3.1591271953166578, + "grad_norm": 0.26934340596199036, + "learning_rate": 6.084796282151927e-08, + "loss": 0.1584, + "step": 11872 + }, + { + "epoch": 3.1593932943054814, + "grad_norm": 0.26557061076164246, + "learning_rate": 6.083242312860752e-08, + "loss": 0.168, + "step": 11873 + }, + { + "epoch": 3.1596593932943056, + "grad_norm": 0.3021237850189209, + "learning_rate": 6.081688455283469e-08, + "loss": 0.2051, + "step": 11874 + }, + { + "epoch": 3.1599254922831292, + "grad_norm": 0.26448068022727966, + "learning_rate": 6.080134709464389e-08, + "loss": 0.1624, + "step": 11875 + }, + { + "epoch": 3.1601915912719534, + "grad_norm": 0.27568501234054565, + "learning_rate": 6.078581075447836e-08, + "loss": 0.1832, + "step": 11876 + }, + { + "epoch": 3.160457690260777, + "grad_norm": 0.27622759342193604, + "learning_rate": 6.077027553278116e-08, + "loss": 0.1827, + "step": 11877 + }, + { + "epoch": 3.1607237892496007, + "grad_norm": 0.2685629725456238, + "learning_rate": 6.075474142999546e-08, + "loss": 0.1637, + "step": 11878 + }, + { + "epoch": 3.160989888238425, + "grad_norm": 0.42661193013191223, + "learning_rate": 6.073920844656424e-08, + "loss": 0.1749, + "step": 11879 + }, + { + "epoch": 3.1612559872272485, + "grad_norm": 0.2602249085903168, + "learning_rate": 6.072367658293063e-08, + "loss": 0.1674, + "step": 11880 + }, + { + "epoch": 3.1615220862160722, + "grad_norm": 0.26038050651550293, + "learning_rate": 6.070814583953753e-08, + "loss": 0.1653, + "step": 11881 + }, + { + "epoch": 3.1617881852048964, + "grad_norm": 0.2405632883310318, + "learning_rate": 6.069261621682798e-08, + "loss": 0.1562, + "step": 11882 + }, + { + "epoch": 3.16205428419372, + "grad_norm": 0.41698819398880005, + "learning_rate": 6.067708771524487e-08, + "loss": 0.2035, + "step": 11883 + }, + { + "epoch": 3.1623203831825437, + "grad_norm": 0.38324791193008423, + "learning_rate": 6.066156033523113e-08, + "loss": 0.1682, + "step": 11884 + }, + { + "epoch": 3.162586482171368, + "grad_norm": 0.2754330635070801, + "learning_rate": 6.064603407722966e-08, + "loss": 0.1883, + "step": 11885 + }, + { + "epoch": 3.1628525811601915, + "grad_norm": 0.33249861001968384, + "learning_rate": 6.063050894168324e-08, + "loss": 0.1744, + "step": 11886 + }, + { + "epoch": 3.163118680149015, + "grad_norm": 0.35368868708610535, + "learning_rate": 6.061498492903472e-08, + "loss": 0.1732, + "step": 11887 + }, + { + "epoch": 3.1633847791378393, + "grad_norm": 0.2654012143611908, + "learning_rate": 6.059946203972685e-08, + "loss": 0.1737, + "step": 11888 + }, + { + "epoch": 3.163650878126663, + "grad_norm": 0.3462672531604767, + "learning_rate": 6.058394027420242e-08, + "loss": 0.194, + "step": 11889 + }, + { + "epoch": 3.163916977115487, + "grad_norm": 0.3557705283164978, + "learning_rate": 6.056841963290408e-08, + "loss": 0.1833, + "step": 11890 + }, + { + "epoch": 3.164183076104311, + "grad_norm": 0.31792423129081726, + "learning_rate": 6.055290011627455e-08, + "loss": 0.1853, + "step": 11891 + }, + { + "epoch": 3.1644491750931345, + "grad_norm": 0.34526190161705017, + "learning_rate": 6.053738172475646e-08, + "loss": 0.178, + "step": 11892 + }, + { + "epoch": 3.1647152740819586, + "grad_norm": 0.3613329231739044, + "learning_rate": 6.052186445879246e-08, + "loss": 0.1803, + "step": 11893 + }, + { + "epoch": 3.1649813730707823, + "grad_norm": 0.28126049041748047, + "learning_rate": 6.050634831882507e-08, + "loss": 0.1721, + "step": 11894 + }, + { + "epoch": 3.165247472059606, + "grad_norm": 0.27053797245025635, + "learning_rate": 6.049083330529694e-08, + "loss": 0.1764, + "step": 11895 + }, + { + "epoch": 3.16551357104843, + "grad_norm": 0.310651570558548, + "learning_rate": 6.047531941865048e-08, + "loss": 0.1536, + "step": 11896 + }, + { + "epoch": 3.165779670037254, + "grad_norm": 0.2838292419910431, + "learning_rate": 6.045980665932823e-08, + "loss": 0.1859, + "step": 11897 + }, + { + "epoch": 3.166045769026078, + "grad_norm": 0.2638840079307556, + "learning_rate": 6.044429502777267e-08, + "loss": 0.1652, + "step": 11898 + }, + { + "epoch": 3.1663118680149016, + "grad_norm": 0.276083379983902, + "learning_rate": 6.04287845244262e-08, + "loss": 0.1703, + "step": 11899 + }, + { + "epoch": 3.1665779670037253, + "grad_norm": 0.29969632625579834, + "learning_rate": 6.041327514973118e-08, + "loss": 0.1881, + "step": 11900 + }, + { + "epoch": 3.1668440659925494, + "grad_norm": 0.2720969319343567, + "learning_rate": 6.039776690413002e-08, + "loss": 0.1738, + "step": 11901 + }, + { + "epoch": 3.167110164981373, + "grad_norm": 0.26625901460647583, + "learning_rate": 6.038225978806499e-08, + "loss": 0.1465, + "step": 11902 + }, + { + "epoch": 3.167376263970197, + "grad_norm": 0.2774592638015747, + "learning_rate": 6.036675380197842e-08, + "loss": 0.1718, + "step": 11903 + }, + { + "epoch": 3.167642362959021, + "grad_norm": 0.29199185967445374, + "learning_rate": 6.035124894631263e-08, + "loss": 0.1935, + "step": 11904 + }, + { + "epoch": 3.1679084619478446, + "grad_norm": 0.3856048285961151, + "learning_rate": 6.033574522150971e-08, + "loss": 0.201, + "step": 11905 + }, + { + "epoch": 3.1681745609366683, + "grad_norm": 0.28834083676338196, + "learning_rate": 6.0320242628012e-08, + "loss": 0.1843, + "step": 11906 + }, + { + "epoch": 3.1684406599254924, + "grad_norm": 0.30330485105514526, + "learning_rate": 6.030474116626158e-08, + "loss": 0.1833, + "step": 11907 + }, + { + "epoch": 3.168706758914316, + "grad_norm": 0.3458113670349121, + "learning_rate": 6.02892408367006e-08, + "loss": 0.1708, + "step": 11908 + }, + { + "epoch": 3.1689728579031398, + "grad_norm": 0.2906891703605652, + "learning_rate": 6.027374163977117e-08, + "loss": 0.1753, + "step": 11909 + }, + { + "epoch": 3.169238956891964, + "grad_norm": 0.2812684178352356, + "learning_rate": 6.025824357591539e-08, + "loss": 0.1779, + "step": 11910 + }, + { + "epoch": 3.1695050558807876, + "grad_norm": 0.38023585081100464, + "learning_rate": 6.024274664557522e-08, + "loss": 0.1882, + "step": 11911 + }, + { + "epoch": 3.1697711548696117, + "grad_norm": 0.251190185546875, + "learning_rate": 6.022725084919273e-08, + "loss": 0.1522, + "step": 11912 + }, + { + "epoch": 3.1700372538584354, + "grad_norm": 0.3387731909751892, + "learning_rate": 6.021175618720984e-08, + "loss": 0.182, + "step": 11913 + }, + { + "epoch": 3.170303352847259, + "grad_norm": 0.3180868327617645, + "learning_rate": 6.019626266006856e-08, + "loss": 0.1832, + "step": 11914 + }, + { + "epoch": 3.170569451836083, + "grad_norm": 0.26614701747894287, + "learning_rate": 6.018077026821071e-08, + "loss": 0.1902, + "step": 11915 + }, + { + "epoch": 3.170835550824907, + "grad_norm": 0.570016086101532, + "learning_rate": 6.016527901207824e-08, + "loss": 0.1692, + "step": 11916 + }, + { + "epoch": 3.1711016498137305, + "grad_norm": 0.29296180605888367, + "learning_rate": 6.014978889211293e-08, + "loss": 0.1723, + "step": 11917 + }, + { + "epoch": 3.1713677488025547, + "grad_norm": 0.28071051836013794, + "learning_rate": 6.013429990875665e-08, + "loss": 0.1829, + "step": 11918 + }, + { + "epoch": 3.1716338477913784, + "grad_norm": 0.2726999521255493, + "learning_rate": 6.011881206245113e-08, + "loss": 0.1661, + "step": 11919 + }, + { + "epoch": 3.171899946780202, + "grad_norm": 0.28222736716270447, + "learning_rate": 6.010332535363815e-08, + "loss": 0.1737, + "step": 11920 + }, + { + "epoch": 3.172166045769026, + "grad_norm": 0.27098461985588074, + "learning_rate": 6.00878397827594e-08, + "loss": 0.175, + "step": 11921 + }, + { + "epoch": 3.17243214475785, + "grad_norm": 0.2649068236351013, + "learning_rate": 6.007235535025654e-08, + "loss": 0.1582, + "step": 11922 + }, + { + "epoch": 3.172698243746674, + "grad_norm": 0.3439512848854065, + "learning_rate": 6.005687205657126e-08, + "loss": 0.1857, + "step": 11923 + }, + { + "epoch": 3.1729643427354977, + "grad_norm": 0.3835671544075012, + "learning_rate": 6.004138990214514e-08, + "loss": 0.1799, + "step": 11924 + }, + { + "epoch": 3.1732304417243213, + "grad_norm": 0.39118650555610657, + "learning_rate": 6.002590888741983e-08, + "loss": 0.1906, + "step": 11925 + }, + { + "epoch": 3.1734965407131455, + "grad_norm": 0.3017942011356354, + "learning_rate": 6.001042901283678e-08, + "loss": 0.1603, + "step": 11926 + }, + { + "epoch": 3.173762639701969, + "grad_norm": 0.35917001962661743, + "learning_rate": 5.999495027883759e-08, + "loss": 0.1939, + "step": 11927 + }, + { + "epoch": 3.174028738690793, + "grad_norm": 0.3210327923297882, + "learning_rate": 5.997947268586368e-08, + "loss": 0.1716, + "step": 11928 + }, + { + "epoch": 3.174294837679617, + "grad_norm": 0.2770915627479553, + "learning_rate": 5.996399623435658e-08, + "loss": 0.1741, + "step": 11929 + }, + { + "epoch": 3.1745609366684406, + "grad_norm": 0.3628910481929779, + "learning_rate": 5.99485209247576e-08, + "loss": 0.1947, + "step": 11930 + }, + { + "epoch": 3.1748270356572643, + "grad_norm": 0.2688465416431427, + "learning_rate": 5.993304675750825e-08, + "loss": 0.1621, + "step": 11931 + }, + { + "epoch": 3.1750931346460884, + "grad_norm": 0.27930665016174316, + "learning_rate": 5.99175737330498e-08, + "loss": 0.1635, + "step": 11932 + }, + { + "epoch": 3.175359233634912, + "grad_norm": 0.47622328996658325, + "learning_rate": 5.99021018518236e-08, + "loss": 0.1785, + "step": 11933 + }, + { + "epoch": 3.175625332623736, + "grad_norm": 0.33191365003585815, + "learning_rate": 5.988663111427092e-08, + "loss": 0.1829, + "step": 11934 + }, + { + "epoch": 3.17589143161256, + "grad_norm": 0.35490578413009644, + "learning_rate": 5.987116152083309e-08, + "loss": 0.1676, + "step": 11935 + }, + { + "epoch": 3.1761575306013836, + "grad_norm": 0.3226417601108551, + "learning_rate": 5.985569307195123e-08, + "loss": 0.2022, + "step": 11936 + }, + { + "epoch": 3.1764236295902077, + "grad_norm": 0.2860112488269806, + "learning_rate": 5.98402257680666e-08, + "loss": 0.1785, + "step": 11937 + }, + { + "epoch": 3.1766897285790314, + "grad_norm": 0.2875100374221802, + "learning_rate": 5.982475960962033e-08, + "loss": 0.1847, + "step": 11938 + }, + { + "epoch": 3.176955827567855, + "grad_norm": 0.28881633281707764, + "learning_rate": 5.980929459705355e-08, + "loss": 0.1701, + "step": 11939 + }, + { + "epoch": 3.1772219265566792, + "grad_norm": 0.2802940607070923, + "learning_rate": 5.979383073080739e-08, + "loss": 0.1809, + "step": 11940 + }, + { + "epoch": 3.177488025545503, + "grad_norm": 0.25457876920700073, + "learning_rate": 5.977836801132285e-08, + "loss": 0.1643, + "step": 11941 + }, + { + "epoch": 3.1777541245343266, + "grad_norm": 0.25361230969429016, + "learning_rate": 5.9762906439041e-08, + "loss": 0.1742, + "step": 11942 + }, + { + "epoch": 3.1780202235231507, + "grad_norm": 0.38504382967948914, + "learning_rate": 5.974744601440282e-08, + "loss": 0.18, + "step": 11943 + }, + { + "epoch": 3.1782863225119744, + "grad_norm": 0.29778531193733215, + "learning_rate": 5.973198673784927e-08, + "loss": 0.1862, + "step": 11944 + }, + { + "epoch": 3.1785524215007985, + "grad_norm": 0.2912069857120514, + "learning_rate": 5.971652860982129e-08, + "loss": 0.1812, + "step": 11945 + }, + { + "epoch": 3.178818520489622, + "grad_norm": 0.27768680453300476, + "learning_rate": 5.970107163075982e-08, + "loss": 0.179, + "step": 11946 + }, + { + "epoch": 3.179084619478446, + "grad_norm": 0.28923651576042175, + "learning_rate": 5.968561580110562e-08, + "loss": 0.1808, + "step": 11947 + }, + { + "epoch": 3.17935071846727, + "grad_norm": 0.268280565738678, + "learning_rate": 5.96701611212996e-08, + "loss": 0.162, + "step": 11948 + }, + { + "epoch": 3.1796168174560937, + "grad_norm": 0.2836625277996063, + "learning_rate": 5.965470759178253e-08, + "loss": 0.1857, + "step": 11949 + }, + { + "epoch": 3.1798829164449174, + "grad_norm": 0.2827036678791046, + "learning_rate": 5.963925521299522e-08, + "loss": 0.1754, + "step": 11950 + }, + { + "epoch": 3.1801490154337415, + "grad_norm": 0.29567715525627136, + "learning_rate": 5.962380398537831e-08, + "loss": 0.1589, + "step": 11951 + }, + { + "epoch": 3.180415114422565, + "grad_norm": 0.2977464199066162, + "learning_rate": 5.960835390937261e-08, + "loss": 0.1795, + "step": 11952 + }, + { + "epoch": 3.180681213411389, + "grad_norm": 0.34296318888664246, + "learning_rate": 5.9592904985418714e-08, + "loss": 0.1703, + "step": 11953 + }, + { + "epoch": 3.180947312400213, + "grad_norm": 0.36856022477149963, + "learning_rate": 5.957745721395731e-08, + "loss": 0.1692, + "step": 11954 + }, + { + "epoch": 3.1812134113890367, + "grad_norm": 0.3832537531852722, + "learning_rate": 5.956201059542889e-08, + "loss": 0.1815, + "step": 11955 + }, + { + "epoch": 3.1814795103778604, + "grad_norm": 0.27379220724105835, + "learning_rate": 5.9546565130274184e-08, + "loss": 0.1776, + "step": 11956 + }, + { + "epoch": 3.1817456093666845, + "grad_norm": 0.31119421124458313, + "learning_rate": 5.95311208189336e-08, + "loss": 0.1648, + "step": 11957 + }, + { + "epoch": 3.182011708355508, + "grad_norm": 0.28941088914871216, + "learning_rate": 5.9515677661847684e-08, + "loss": 0.1684, + "step": 11958 + }, + { + "epoch": 3.1822778073443323, + "grad_norm": 0.3636801540851593, + "learning_rate": 5.950023565945694e-08, + "loss": 0.1698, + "step": 11959 + }, + { + "epoch": 3.182543906333156, + "grad_norm": 0.2652329206466675, + "learning_rate": 5.948479481220174e-08, + "loss": 0.1728, + "step": 11960 + }, + { + "epoch": 3.1828100053219797, + "grad_norm": 0.2582966983318329, + "learning_rate": 5.9469355120522576e-08, + "loss": 0.167, + "step": 11961 + }, + { + "epoch": 3.183076104310804, + "grad_norm": 0.30457982420921326, + "learning_rate": 5.945391658485972e-08, + "loss": 0.1702, + "step": 11962 + }, + { + "epoch": 3.1833422032996275, + "grad_norm": 0.24992243945598602, + "learning_rate": 5.943847920565357e-08, + "loss": 0.1713, + "step": 11963 + }, + { + "epoch": 3.183608302288451, + "grad_norm": 0.2741711437702179, + "learning_rate": 5.9423042983344407e-08, + "loss": 0.1588, + "step": 11964 + }, + { + "epoch": 3.1838744012772753, + "grad_norm": 0.26241305470466614, + "learning_rate": 5.940760791837256e-08, + "loss": 0.1695, + "step": 11965 + }, + { + "epoch": 3.184140500266099, + "grad_norm": 0.3896002769470215, + "learning_rate": 5.939217401117818e-08, + "loss": 0.176, + "step": 11966 + }, + { + "epoch": 3.1844065992549226, + "grad_norm": 0.27144014835357666, + "learning_rate": 5.9376741262201514e-08, + "loss": 0.1876, + "step": 11967 + }, + { + "epoch": 3.1846726982437468, + "grad_norm": 0.34928637742996216, + "learning_rate": 5.936130967188274e-08, + "loss": 0.1863, + "step": 11968 + }, + { + "epoch": 3.1849387972325705, + "grad_norm": 0.5001983046531677, + "learning_rate": 5.934587924066201e-08, + "loss": 0.1656, + "step": 11969 + }, + { + "epoch": 3.1852048962213946, + "grad_norm": 0.3499554395675659, + "learning_rate": 5.9330449968979404e-08, + "loss": 0.1756, + "step": 11970 + }, + { + "epoch": 3.1854709952102183, + "grad_norm": 0.3885536789894104, + "learning_rate": 5.931502185727504e-08, + "loss": 0.1921, + "step": 11971 + }, + { + "epoch": 3.185737094199042, + "grad_norm": 0.3609967529773712, + "learning_rate": 5.9299594905988893e-08, + "loss": 0.1818, + "step": 11972 + }, + { + "epoch": 3.186003193187866, + "grad_norm": 0.28766992688179016, + "learning_rate": 5.9284169115561024e-08, + "loss": 0.1903, + "step": 11973 + }, + { + "epoch": 3.1862692921766897, + "grad_norm": 0.26685017347335815, + "learning_rate": 5.926874448643138e-08, + "loss": 0.1771, + "step": 11974 + }, + { + "epoch": 3.1865353911655134, + "grad_norm": 0.2912537455558777, + "learning_rate": 5.925332101903994e-08, + "loss": 0.1862, + "step": 11975 + }, + { + "epoch": 3.1868014901543376, + "grad_norm": 0.2815379202365875, + "learning_rate": 5.923789871382654e-08, + "loss": 0.1694, + "step": 11976 + }, + { + "epoch": 3.1870675891431612, + "grad_norm": 0.34788277745246887, + "learning_rate": 5.9222477571231114e-08, + "loss": 0.1992, + "step": 11977 + }, + { + "epoch": 3.187333688131985, + "grad_norm": 0.34535568952560425, + "learning_rate": 5.920705759169351e-08, + "loss": 0.2048, + "step": 11978 + }, + { + "epoch": 3.187599787120809, + "grad_norm": 0.26352280378341675, + "learning_rate": 5.9191638775653496e-08, + "loss": 0.164, + "step": 11979 + }, + { + "epoch": 3.1878658861096327, + "grad_norm": 0.28702908754348755, + "learning_rate": 5.917622112355092e-08, + "loss": 0.1651, + "step": 11980 + }, + { + "epoch": 3.1881319850984564, + "grad_norm": 0.33622515201568604, + "learning_rate": 5.91608046358254e-08, + "loss": 0.1809, + "step": 11981 + }, + { + "epoch": 3.1883980840872805, + "grad_norm": 0.2898399829864502, + "learning_rate": 5.9145389312916794e-08, + "loss": 0.1708, + "step": 11982 + }, + { + "epoch": 3.188664183076104, + "grad_norm": 0.2940463125705719, + "learning_rate": 5.9129975155264656e-08, + "loss": 0.1814, + "step": 11983 + }, + { + "epoch": 3.1889302820649283, + "grad_norm": 0.517297625541687, + "learning_rate": 5.91145621633087e-08, + "loss": 0.1845, + "step": 11984 + }, + { + "epoch": 3.189196381053752, + "grad_norm": 0.5578708052635193, + "learning_rate": 5.90991503374885e-08, + "loss": 0.1948, + "step": 11985 + }, + { + "epoch": 3.1894624800425757, + "grad_norm": 0.2653059959411621, + "learning_rate": 5.9083739678243694e-08, + "loss": 0.1557, + "step": 11986 + }, + { + "epoch": 3.1897285790314, + "grad_norm": 0.25151151418685913, + "learning_rate": 5.906833018601371e-08, + "loss": 0.1711, + "step": 11987 + }, + { + "epoch": 3.1899946780202235, + "grad_norm": 0.29830506443977356, + "learning_rate": 5.9052921861238184e-08, + "loss": 0.1701, + "step": 11988 + }, + { + "epoch": 3.190260777009047, + "grad_norm": 0.25313836336135864, + "learning_rate": 5.9037514704356494e-08, + "loss": 0.1632, + "step": 11989 + }, + { + "epoch": 3.1905268759978713, + "grad_norm": 0.3865346908569336, + "learning_rate": 5.902210871580816e-08, + "loss": 0.1971, + "step": 11990 + }, + { + "epoch": 3.190792974986695, + "grad_norm": 0.3515937030315399, + "learning_rate": 5.900670389603253e-08, + "loss": 0.1886, + "step": 11991 + }, + { + "epoch": 3.1910590739755187, + "grad_norm": 0.2679867446422577, + "learning_rate": 5.899130024546901e-08, + "loss": 0.1682, + "step": 11992 + }, + { + "epoch": 3.191325172964343, + "grad_norm": 0.34065479040145874, + "learning_rate": 5.8975897764556934e-08, + "loss": 0.1722, + "step": 11993 + }, + { + "epoch": 3.1915912719531665, + "grad_norm": 0.30871421098709106, + "learning_rate": 5.896049645373562e-08, + "loss": 0.1535, + "step": 11994 + }, + { + "epoch": 3.1918573709419906, + "grad_norm": 0.29427453875541687, + "learning_rate": 5.894509631344435e-08, + "loss": 0.1906, + "step": 11995 + }, + { + "epoch": 3.1921234699308143, + "grad_norm": 0.4640393853187561, + "learning_rate": 5.892969734412234e-08, + "loss": 0.2138, + "step": 11996 + }, + { + "epoch": 3.192389568919638, + "grad_norm": 0.26527586579322815, + "learning_rate": 5.891429954620888e-08, + "loss": 0.1772, + "step": 11997 + }, + { + "epoch": 3.192655667908462, + "grad_norm": 0.3744160532951355, + "learning_rate": 5.889890292014303e-08, + "loss": 0.1809, + "step": 11998 + }, + { + "epoch": 3.192921766897286, + "grad_norm": 0.4246637523174286, + "learning_rate": 5.8883507466364024e-08, + "loss": 0.1949, + "step": 11999 + }, + { + "epoch": 3.1931878658861095, + "grad_norm": 0.29714322090148926, + "learning_rate": 5.886811318531091e-08, + "loss": 0.1853, + "step": 12000 + }, + { + "epoch": 3.1934539648749336, + "grad_norm": 0.28420138359069824, + "learning_rate": 5.8852720077422836e-08, + "loss": 0.1767, + "step": 12001 + }, + { + "epoch": 3.1937200638637573, + "grad_norm": 0.24383562803268433, + "learning_rate": 5.883732814313875e-08, + "loss": 0.1704, + "step": 12002 + }, + { + "epoch": 3.193986162852581, + "grad_norm": 0.2677529454231262, + "learning_rate": 5.8821937382897735e-08, + "loss": 0.1698, + "step": 12003 + }, + { + "epoch": 3.194252261841405, + "grad_norm": 0.39828842878341675, + "learning_rate": 5.880654779713874e-08, + "loss": 0.1946, + "step": 12004 + }, + { + "epoch": 3.1945183608302288, + "grad_norm": 0.26583895087242126, + "learning_rate": 5.8791159386300726e-08, + "loss": 0.1848, + "step": 12005 + }, + { + "epoch": 3.1947844598190525, + "grad_norm": 0.4101714491844177, + "learning_rate": 5.8775772150822544e-08, + "loss": 0.195, + "step": 12006 + }, + { + "epoch": 3.1950505588078766, + "grad_norm": 0.284790575504303, + "learning_rate": 5.8760386091143176e-08, + "loss": 0.1854, + "step": 12007 + }, + { + "epoch": 3.1953166577967003, + "grad_norm": 0.27508991956710815, + "learning_rate": 5.874500120770134e-08, + "loss": 0.1751, + "step": 12008 + }, + { + "epoch": 3.1955827567855244, + "grad_norm": 0.41857147216796875, + "learning_rate": 5.8729617500935944e-08, + "loss": 0.1962, + "step": 12009 + }, + { + "epoch": 3.195848855774348, + "grad_norm": 0.2987561523914337, + "learning_rate": 5.871423497128569e-08, + "loss": 0.1807, + "step": 12010 + }, + { + "epoch": 3.1961149547631718, + "grad_norm": 0.261999249458313, + "learning_rate": 5.8698853619189394e-08, + "loss": 0.1757, + "step": 12011 + }, + { + "epoch": 3.196381053751996, + "grad_norm": 0.252748042345047, + "learning_rate": 5.868347344508568e-08, + "loss": 0.1648, + "step": 12012 + }, + { + "epoch": 3.1966471527408196, + "grad_norm": 0.28660669922828674, + "learning_rate": 5.8668094449413265e-08, + "loss": 0.191, + "step": 12013 + }, + { + "epoch": 3.1969132517296432, + "grad_norm": 0.41577044129371643, + "learning_rate": 5.8652716632610796e-08, + "loss": 0.2123, + "step": 12014 + }, + { + "epoch": 3.1971793507184674, + "grad_norm": 0.3547002971172333, + "learning_rate": 5.863733999511686e-08, + "loss": 0.1864, + "step": 12015 + }, + { + "epoch": 3.197445449707291, + "grad_norm": 0.28020694851875305, + "learning_rate": 5.8621964537370085e-08, + "loss": 0.1775, + "step": 12016 + }, + { + "epoch": 3.197711548696115, + "grad_norm": 0.3525298237800598, + "learning_rate": 5.860659025980892e-08, + "loss": 0.1912, + "step": 12017 + }, + { + "epoch": 3.197977647684939, + "grad_norm": 0.40609806776046753, + "learning_rate": 5.859121716287194e-08, + "loss": 0.1874, + "step": 12018 + }, + { + "epoch": 3.1982437466737625, + "grad_norm": 0.26157912611961365, + "learning_rate": 5.857584524699757e-08, + "loss": 0.1629, + "step": 12019 + }, + { + "epoch": 3.1985098456625867, + "grad_norm": 0.2931583523750305, + "learning_rate": 5.856047451262429e-08, + "loss": 0.184, + "step": 12020 + }, + { + "epoch": 3.1987759446514104, + "grad_norm": 0.28126394748687744, + "learning_rate": 5.8545104960190473e-08, + "loss": 0.1768, + "step": 12021 + }, + { + "epoch": 3.199042043640234, + "grad_norm": 0.42986103892326355, + "learning_rate": 5.852973659013453e-08, + "loss": 0.207, + "step": 12022 + }, + { + "epoch": 3.199308142629058, + "grad_norm": 0.2919469177722931, + "learning_rate": 5.8514369402894734e-08, + "loss": 0.1659, + "step": 12023 + }, + { + "epoch": 3.199574241617882, + "grad_norm": 0.26449957489967346, + "learning_rate": 5.849900339890945e-08, + "loss": 0.1731, + "step": 12024 + }, + { + "epoch": 3.1998403406067055, + "grad_norm": 0.26058733463287354, + "learning_rate": 5.8483638578616915e-08, + "loss": 0.1643, + "step": 12025 + }, + { + "epoch": 3.2001064395955297, + "grad_norm": 0.27072152495384216, + "learning_rate": 5.8468274942455384e-08, + "loss": 0.1658, + "step": 12026 + }, + { + "epoch": 3.2003725385843533, + "grad_norm": 0.30244532227516174, + "learning_rate": 5.8452912490863037e-08, + "loss": 0.1785, + "step": 12027 + }, + { + "epoch": 3.200638637573177, + "grad_norm": 0.3349272906780243, + "learning_rate": 5.843755122427806e-08, + "loss": 0.176, + "step": 12028 + }, + { + "epoch": 3.200904736562001, + "grad_norm": 0.28154730796813965, + "learning_rate": 5.842219114313855e-08, + "loss": 0.1542, + "step": 12029 + }, + { + "epoch": 3.201170835550825, + "grad_norm": 0.27325916290283203, + "learning_rate": 5.840683224788265e-08, + "loss": 0.1662, + "step": 12030 + }, + { + "epoch": 3.201436934539649, + "grad_norm": 0.27200034260749817, + "learning_rate": 5.839147453894846e-08, + "loss": 0.1903, + "step": 12031 + }, + { + "epoch": 3.2017030335284726, + "grad_norm": 0.2734995186328888, + "learning_rate": 5.8376118016773936e-08, + "loss": 0.1861, + "step": 12032 + }, + { + "epoch": 3.2019691325172963, + "grad_norm": 0.3931134045124054, + "learning_rate": 5.836076268179711e-08, + "loss": 0.2079, + "step": 12033 + }, + { + "epoch": 3.2022352315061204, + "grad_norm": 0.25417637825012207, + "learning_rate": 5.8345408534455934e-08, + "loss": 0.1637, + "step": 12034 + }, + { + "epoch": 3.202501330494944, + "grad_norm": 0.2828201651573181, + "learning_rate": 5.8330055575188385e-08, + "loss": 0.1514, + "step": 12035 + }, + { + "epoch": 3.202767429483768, + "grad_norm": 0.3717331290245056, + "learning_rate": 5.8314703804432296e-08, + "loss": 0.1933, + "step": 12036 + }, + { + "epoch": 3.203033528472592, + "grad_norm": 0.2728697955608368, + "learning_rate": 5.8299353222625605e-08, + "loss": 0.1602, + "step": 12037 + }, + { + "epoch": 3.2032996274614156, + "grad_norm": 0.2831442058086395, + "learning_rate": 5.828400383020606e-08, + "loss": 0.1862, + "step": 12038 + }, + { + "epoch": 3.2035657264502393, + "grad_norm": 0.6519262790679932, + "learning_rate": 5.826865562761154e-08, + "loss": 0.1861, + "step": 12039 + }, + { + "epoch": 3.2038318254390634, + "grad_norm": 0.3814117908477783, + "learning_rate": 5.825330861527973e-08, + "loss": 0.1711, + "step": 12040 + }, + { + "epoch": 3.204097924427887, + "grad_norm": 0.27905887365341187, + "learning_rate": 5.8237962793648374e-08, + "loss": 0.1807, + "step": 12041 + }, + { + "epoch": 3.2043640234167112, + "grad_norm": 0.29629823565483093, + "learning_rate": 5.822261816315519e-08, + "loss": 0.189, + "step": 12042 + }, + { + "epoch": 3.204630122405535, + "grad_norm": 0.3916611969470978, + "learning_rate": 5.8207274724237874e-08, + "loss": 0.1929, + "step": 12043 + }, + { + "epoch": 3.2048962213943586, + "grad_norm": 0.25367558002471924, + "learning_rate": 5.819193247733398e-08, + "loss": 0.1642, + "step": 12044 + }, + { + "epoch": 3.2051623203831827, + "grad_norm": 0.3603900969028473, + "learning_rate": 5.817659142288116e-08, + "loss": 0.1665, + "step": 12045 + }, + { + "epoch": 3.2054284193720064, + "grad_norm": 0.4571562707424164, + "learning_rate": 5.8161251561316907e-08, + "loss": 0.1793, + "step": 12046 + }, + { + "epoch": 3.20569451836083, + "grad_norm": 0.2797011733055115, + "learning_rate": 5.8145912893078765e-08, + "loss": 0.17, + "step": 12047 + }, + { + "epoch": 3.205960617349654, + "grad_norm": 0.28821396827697754, + "learning_rate": 5.8130575418604245e-08, + "loss": 0.1809, + "step": 12048 + }, + { + "epoch": 3.206226716338478, + "grad_norm": 0.4036629796028137, + "learning_rate": 5.8115239138330805e-08, + "loss": 0.208, + "step": 12049 + }, + { + "epoch": 3.2064928153273016, + "grad_norm": 0.26510629057884216, + "learning_rate": 5.809990405269589e-08, + "loss": 0.1561, + "step": 12050 + }, + { + "epoch": 3.2067589143161257, + "grad_norm": 0.28780654072761536, + "learning_rate": 5.808457016213681e-08, + "loss": 0.1761, + "step": 12051 + }, + { + "epoch": 3.2070250133049494, + "grad_norm": 0.30571579933166504, + "learning_rate": 5.8069237467091026e-08, + "loss": 0.1842, + "step": 12052 + }, + { + "epoch": 3.207291112293773, + "grad_norm": 0.28506138920783997, + "learning_rate": 5.8053905967995755e-08, + "loss": 0.1654, + "step": 12053 + }, + { + "epoch": 3.207557211282597, + "grad_norm": 0.4073519706726074, + "learning_rate": 5.8038575665288345e-08, + "loss": 0.1874, + "step": 12054 + }, + { + "epoch": 3.207823310271421, + "grad_norm": 0.38089513778686523, + "learning_rate": 5.8023246559406004e-08, + "loss": 0.1709, + "step": 12055 + }, + { + "epoch": 3.208089409260245, + "grad_norm": 0.2681274116039276, + "learning_rate": 5.800791865078596e-08, + "loss": 0.1825, + "step": 12056 + }, + { + "epoch": 3.2083555082490687, + "grad_norm": 0.2760154902935028, + "learning_rate": 5.799259193986541e-08, + "loss": 0.1828, + "step": 12057 + }, + { + "epoch": 3.2086216072378924, + "grad_norm": 0.2667584717273712, + "learning_rate": 5.797726642708155e-08, + "loss": 0.1676, + "step": 12058 + }, + { + "epoch": 3.2088877062267165, + "grad_norm": 0.3671555519104004, + "learning_rate": 5.7961942112871396e-08, + "loss": 0.2023, + "step": 12059 + }, + { + "epoch": 3.20915380521554, + "grad_norm": 0.5322686433792114, + "learning_rate": 5.7946618997672125e-08, + "loss": 0.1751, + "step": 12060 + }, + { + "epoch": 3.209419904204364, + "grad_norm": 0.4113117754459381, + "learning_rate": 5.79312970819207e-08, + "loss": 0.1793, + "step": 12061 + }, + { + "epoch": 3.209686003193188, + "grad_norm": 0.32827672362327576, + "learning_rate": 5.791597636605416e-08, + "loss": 0.1832, + "step": 12062 + }, + { + "epoch": 3.2099521021820117, + "grad_norm": 0.34732192754745483, + "learning_rate": 5.79006568505095e-08, + "loss": 0.1751, + "step": 12063 + }, + { + "epoch": 3.210218201170836, + "grad_norm": 0.3025844097137451, + "learning_rate": 5.7885338535723696e-08, + "loss": 0.1892, + "step": 12064 + }, + { + "epoch": 3.2104843001596595, + "grad_norm": 0.2692311704158783, + "learning_rate": 5.787002142213358e-08, + "loss": 0.1681, + "step": 12065 + }, + { + "epoch": 3.210750399148483, + "grad_norm": 0.2695189118385315, + "learning_rate": 5.7854705510176096e-08, + "loss": 0.1925, + "step": 12066 + }, + { + "epoch": 3.2110164981373073, + "grad_norm": 0.28052952885627747, + "learning_rate": 5.7839390800288016e-08, + "loss": 0.1645, + "step": 12067 + }, + { + "epoch": 3.211282597126131, + "grad_norm": 0.28359031677246094, + "learning_rate": 5.7824077292906194e-08, + "loss": 0.1893, + "step": 12068 + }, + { + "epoch": 3.2115486961149546, + "grad_norm": 0.2650441825389862, + "learning_rate": 5.780876498846743e-08, + "loss": 0.1766, + "step": 12069 + }, + { + "epoch": 3.2118147951037788, + "grad_norm": 0.29341891407966614, + "learning_rate": 5.779345388740833e-08, + "loss": 0.1763, + "step": 12070 + }, + { + "epoch": 3.2120808940926024, + "grad_norm": 0.2630641460418701, + "learning_rate": 5.7778143990165785e-08, + "loss": 0.1721, + "step": 12071 + }, + { + "epoch": 3.212346993081426, + "grad_norm": 0.3335302174091339, + "learning_rate": 5.776283529717634e-08, + "loss": 0.1813, + "step": 12072 + }, + { + "epoch": 3.2126130920702503, + "grad_norm": 0.29598015546798706, + "learning_rate": 5.7747527808876694e-08, + "loss": 0.1769, + "step": 12073 + }, + { + "epoch": 3.212879191059074, + "grad_norm": 0.2699013948440552, + "learning_rate": 5.7732221525703373e-08, + "loss": 0.1732, + "step": 12074 + }, + { + "epoch": 3.2131452900478976, + "grad_norm": 0.28070372343063354, + "learning_rate": 5.7716916448093035e-08, + "loss": 0.1767, + "step": 12075 + }, + { + "epoch": 3.2134113890367217, + "grad_norm": 0.32514166831970215, + "learning_rate": 5.770161257648212e-08, + "loss": 0.1782, + "step": 12076 + }, + { + "epoch": 3.2136774880255454, + "grad_norm": 0.5391966104507446, + "learning_rate": 5.7686309911307164e-08, + "loss": 0.1966, + "step": 12077 + }, + { + "epoch": 3.2139435870143696, + "grad_norm": 0.26536035537719727, + "learning_rate": 5.767100845300463e-08, + "loss": 0.1635, + "step": 12078 + }, + { + "epoch": 3.2142096860031932, + "grad_norm": 0.3630008101463318, + "learning_rate": 5.7655708202011e-08, + "loss": 0.1644, + "step": 12079 + }, + { + "epoch": 3.214475784992017, + "grad_norm": 0.26583132147789, + "learning_rate": 5.764040915876258e-08, + "loss": 0.1716, + "step": 12080 + }, + { + "epoch": 3.214741883980841, + "grad_norm": 0.27126526832580566, + "learning_rate": 5.762511132369582e-08, + "loss": 0.1751, + "step": 12081 + }, + { + "epoch": 3.2150079829696647, + "grad_norm": 0.2837965488433838, + "learning_rate": 5.760981469724694e-08, + "loss": 0.1898, + "step": 12082 + }, + { + "epoch": 3.2152740819584884, + "grad_norm": 0.32176196575164795, + "learning_rate": 5.7594519279852284e-08, + "loss": 0.1833, + "step": 12083 + }, + { + "epoch": 3.2155401809473125, + "grad_norm": 0.3732297420501709, + "learning_rate": 5.757922507194812e-08, + "loss": 0.1865, + "step": 12084 + }, + { + "epoch": 3.215806279936136, + "grad_norm": 0.2814352214336395, + "learning_rate": 5.7563932073970656e-08, + "loss": 0.1624, + "step": 12085 + }, + { + "epoch": 3.21607237892496, + "grad_norm": 0.3968043327331543, + "learning_rate": 5.754864028635613e-08, + "loss": 0.175, + "step": 12086 + }, + { + "epoch": 3.216338477913784, + "grad_norm": 0.3620111644268036, + "learning_rate": 5.75333497095406e-08, + "loss": 0.1793, + "step": 12087 + }, + { + "epoch": 3.2166045769026077, + "grad_norm": 0.3967008590698242, + "learning_rate": 5.751806034396027e-08, + "loss": 0.189, + "step": 12088 + }, + { + "epoch": 3.216870675891432, + "grad_norm": 0.359412282705307, + "learning_rate": 5.7502772190051154e-08, + "loss": 0.1842, + "step": 12089 + }, + { + "epoch": 3.2171367748802555, + "grad_norm": 0.3475116491317749, + "learning_rate": 5.748748524824938e-08, + "loss": 0.1912, + "step": 12090 + }, + { + "epoch": 3.217402873869079, + "grad_norm": 0.4287604093551636, + "learning_rate": 5.747219951899086e-08, + "loss": 0.1867, + "step": 12091 + }, + { + "epoch": 3.2176689728579033, + "grad_norm": 0.30761176347732544, + "learning_rate": 5.7456915002711636e-08, + "loss": 0.192, + "step": 12092 + }, + { + "epoch": 3.217935071846727, + "grad_norm": 0.350363165140152, + "learning_rate": 5.744163169984765e-08, + "loss": 0.1784, + "step": 12093 + }, + { + "epoch": 3.2182011708355507, + "grad_norm": 0.2846067249774933, + "learning_rate": 5.7426349610834855e-08, + "loss": 0.197, + "step": 12094 + }, + { + "epoch": 3.218467269824375, + "grad_norm": 0.2740904688835144, + "learning_rate": 5.741106873610904e-08, + "loss": 0.1772, + "step": 12095 + }, + { + "epoch": 3.2187333688131985, + "grad_norm": 0.25501546263694763, + "learning_rate": 5.739578907610612e-08, + "loss": 0.1711, + "step": 12096 + }, + { + "epoch": 3.218999467802022, + "grad_norm": 0.46550923585891724, + "learning_rate": 5.738051063126184e-08, + "loss": 0.2067, + "step": 12097 + }, + { + "epoch": 3.2192655667908463, + "grad_norm": 0.2688685953617096, + "learning_rate": 5.7365233402012004e-08, + "loss": 0.1723, + "step": 12098 + }, + { + "epoch": 3.21953166577967, + "grad_norm": 0.2669370174407959, + "learning_rate": 5.7349957388792335e-08, + "loss": 0.1556, + "step": 12099 + }, + { + "epoch": 3.2197977647684937, + "grad_norm": 0.29725170135498047, + "learning_rate": 5.7334682592038617e-08, + "loss": 0.191, + "step": 12100 + }, + { + "epoch": 3.220063863757318, + "grad_norm": 0.2500232458114624, + "learning_rate": 5.7319409012186395e-08, + "loss": 0.1603, + "step": 12101 + }, + { + "epoch": 3.2203299627461415, + "grad_norm": 0.3728543221950531, + "learning_rate": 5.73041366496714e-08, + "loss": 0.1736, + "step": 12102 + }, + { + "epoch": 3.2205960617349656, + "grad_norm": 0.2912701964378357, + "learning_rate": 5.728886550492915e-08, + "loss": 0.1776, + "step": 12103 + }, + { + "epoch": 3.2208621607237893, + "grad_norm": 0.37397855520248413, + "learning_rate": 5.727359557839525e-08, + "loss": 0.193, + "step": 12104 + }, + { + "epoch": 3.221128259712613, + "grad_norm": 0.40045803785324097, + "learning_rate": 5.725832687050527e-08, + "loss": 0.1744, + "step": 12105 + }, + { + "epoch": 3.221394358701437, + "grad_norm": 0.27592647075653076, + "learning_rate": 5.7243059381694626e-08, + "loss": 0.1754, + "step": 12106 + }, + { + "epoch": 3.2216604576902608, + "grad_norm": 0.26790183782577515, + "learning_rate": 5.72277931123988e-08, + "loss": 0.167, + "step": 12107 + }, + { + "epoch": 3.2219265566790845, + "grad_norm": 0.25760284066200256, + "learning_rate": 5.721252806305326e-08, + "loss": 0.1627, + "step": 12108 + }, + { + "epoch": 3.2221926556679086, + "grad_norm": 0.3279418349266052, + "learning_rate": 5.719726423409338e-08, + "loss": 0.1816, + "step": 12109 + }, + { + "epoch": 3.2224587546567323, + "grad_norm": 0.253671795129776, + "learning_rate": 5.718200162595448e-08, + "loss": 0.1699, + "step": 12110 + }, + { + "epoch": 3.222724853645556, + "grad_norm": 0.28974297642707825, + "learning_rate": 5.7166740239071934e-08, + "loss": 0.1793, + "step": 12111 + }, + { + "epoch": 3.22299095263438, + "grad_norm": 0.2924748659133911, + "learning_rate": 5.7151480073880954e-08, + "loss": 0.1697, + "step": 12112 + }, + { + "epoch": 3.2232570516232038, + "grad_norm": 0.26780998706817627, + "learning_rate": 5.7136221130816844e-08, + "loss": 0.1716, + "step": 12113 + }, + { + "epoch": 3.223523150612028, + "grad_norm": 0.2602057158946991, + "learning_rate": 5.712096341031481e-08, + "loss": 0.16, + "step": 12114 + }, + { + "epoch": 3.2237892496008516, + "grad_norm": 0.26809030771255493, + "learning_rate": 5.710570691281007e-08, + "loss": 0.1629, + "step": 12115 + }, + { + "epoch": 3.2240553485896752, + "grad_norm": 0.3602738082408905, + "learning_rate": 5.709045163873769e-08, + "loss": 0.17, + "step": 12116 + }, + { + "epoch": 3.2243214475784994, + "grad_norm": 0.409970223903656, + "learning_rate": 5.707519758853288e-08, + "loss": 0.1665, + "step": 12117 + }, + { + "epoch": 3.224587546567323, + "grad_norm": 0.34652188420295715, + "learning_rate": 5.7059944762630605e-08, + "loss": 0.181, + "step": 12118 + }, + { + "epoch": 3.2248536455561467, + "grad_norm": 0.2753649055957794, + "learning_rate": 5.704469316146603e-08, + "loss": 0.1882, + "step": 12119 + }, + { + "epoch": 3.225119744544971, + "grad_norm": 0.33025678992271423, + "learning_rate": 5.7029442785474036e-08, + "loss": 0.1704, + "step": 12120 + }, + { + "epoch": 3.2253858435337945, + "grad_norm": 0.25195610523223877, + "learning_rate": 5.7014193635089656e-08, + "loss": 0.1615, + "step": 12121 + }, + { + "epoch": 3.225651942522618, + "grad_norm": 0.2891600430011749, + "learning_rate": 5.699894571074781e-08, + "loss": 0.1907, + "step": 12122 + }, + { + "epoch": 3.2259180415114423, + "grad_norm": 0.31243985891342163, + "learning_rate": 5.6983699012883446e-08, + "loss": 0.188, + "step": 12123 + }, + { + "epoch": 3.226184140500266, + "grad_norm": 0.2699930667877197, + "learning_rate": 5.696845354193143e-08, + "loss": 0.1798, + "step": 12124 + }, + { + "epoch": 3.2264502394890897, + "grad_norm": 0.2973981201648712, + "learning_rate": 5.695320929832651e-08, + "loss": 0.1831, + "step": 12125 + }, + { + "epoch": 3.226716338477914, + "grad_norm": 0.38145527243614197, + "learning_rate": 5.693796628250358e-08, + "loss": 0.1833, + "step": 12126 + }, + { + "epoch": 3.2269824374667375, + "grad_norm": 0.3775535225868225, + "learning_rate": 5.692272449489732e-08, + "loss": 0.1936, + "step": 12127 + }, + { + "epoch": 3.2272485364555616, + "grad_norm": 0.3663948178291321, + "learning_rate": 5.690748393594249e-08, + "loss": 0.1834, + "step": 12128 + }, + { + "epoch": 3.2275146354443853, + "grad_norm": 0.2669130265712738, + "learning_rate": 5.68922446060738e-08, + "loss": 0.1658, + "step": 12129 + }, + { + "epoch": 3.227780734433209, + "grad_norm": 0.26355910301208496, + "learning_rate": 5.687700650572592e-08, + "loss": 0.1637, + "step": 12130 + }, + { + "epoch": 3.228046833422033, + "grad_norm": 0.387480229139328, + "learning_rate": 5.686176963533341e-08, + "loss": 0.2095, + "step": 12131 + }, + { + "epoch": 3.228312932410857, + "grad_norm": 0.2822800874710083, + "learning_rate": 5.684653399533094e-08, + "loss": 0.1866, + "step": 12132 + }, + { + "epoch": 3.2285790313996805, + "grad_norm": 0.31087198853492737, + "learning_rate": 5.683129958615296e-08, + "loss": 0.1861, + "step": 12133 + }, + { + "epoch": 3.2288451303885046, + "grad_norm": 0.2589489817619324, + "learning_rate": 5.6816066408234046e-08, + "loss": 0.182, + "step": 12134 + }, + { + "epoch": 3.2291112293773283, + "grad_norm": 0.3611598610877991, + "learning_rate": 5.680083446200866e-08, + "loss": 0.1735, + "step": 12135 + }, + { + "epoch": 3.2293773283661524, + "grad_norm": 0.34869763255119324, + "learning_rate": 5.6785603747911304e-08, + "loss": 0.1903, + "step": 12136 + }, + { + "epoch": 3.229643427354976, + "grad_norm": 0.40219759941101074, + "learning_rate": 5.677037426637632e-08, + "loss": 0.2006, + "step": 12137 + }, + { + "epoch": 3.2299095263438, + "grad_norm": 0.30880123376846313, + "learning_rate": 5.675514601783813e-08, + "loss": 0.1815, + "step": 12138 + }, + { + "epoch": 3.230175625332624, + "grad_norm": 0.39356788992881775, + "learning_rate": 5.673991900273103e-08, + "loss": 0.155, + "step": 12139 + }, + { + "epoch": 3.2304417243214476, + "grad_norm": 0.2629331052303314, + "learning_rate": 5.6724693221489327e-08, + "loss": 0.171, + "step": 12140 + }, + { + "epoch": 3.2307078233102713, + "grad_norm": 0.26207777857780457, + "learning_rate": 5.6709468674547365e-08, + "loss": 0.1677, + "step": 12141 + }, + { + "epoch": 3.2309739222990954, + "grad_norm": 0.37912753224372864, + "learning_rate": 5.669424536233929e-08, + "loss": 0.1884, + "step": 12142 + }, + { + "epoch": 3.231240021287919, + "grad_norm": 0.28365838527679443, + "learning_rate": 5.667902328529934e-08, + "loss": 0.1683, + "step": 12143 + }, + { + "epoch": 3.2315061202767428, + "grad_norm": 0.27148544788360596, + "learning_rate": 5.6663802443861676e-08, + "loss": 0.1795, + "step": 12144 + }, + { + "epoch": 3.231772219265567, + "grad_norm": 0.2513904273509979, + "learning_rate": 5.6648582838460457e-08, + "loss": 0.1588, + "step": 12145 + }, + { + "epoch": 3.2320383182543906, + "grad_norm": 0.29296162724494934, + "learning_rate": 5.663336446952973e-08, + "loss": 0.1838, + "step": 12146 + }, + { + "epoch": 3.2323044172432143, + "grad_norm": 0.2766943573951721, + "learning_rate": 5.661814733750361e-08, + "loss": 0.1578, + "step": 12147 + }, + { + "epoch": 3.2325705162320384, + "grad_norm": 0.3312455117702484, + "learning_rate": 5.660293144281603e-08, + "loss": 0.1747, + "step": 12148 + }, + { + "epoch": 3.232836615220862, + "grad_norm": 0.41881561279296875, + "learning_rate": 5.658771678590104e-08, + "loss": 0.195, + "step": 12149 + }, + { + "epoch": 3.233102714209686, + "grad_norm": 0.28431224822998047, + "learning_rate": 5.657250336719259e-08, + "loss": 0.1809, + "step": 12150 + }, + { + "epoch": 3.23336881319851, + "grad_norm": 0.41102659702301025, + "learning_rate": 5.6557291187124624e-08, + "loss": 0.2215, + "step": 12151 + }, + { + "epoch": 3.2336349121873336, + "grad_norm": 0.3288455307483673, + "learning_rate": 5.654208024613096e-08, + "loss": 0.1736, + "step": 12152 + }, + { + "epoch": 3.2339010111761577, + "grad_norm": 0.29819366335868835, + "learning_rate": 5.652687054464551e-08, + "loss": 0.1624, + "step": 12153 + }, + { + "epoch": 3.2341671101649814, + "grad_norm": 0.27725911140441895, + "learning_rate": 5.651166208310203e-08, + "loss": 0.1622, + "step": 12154 + }, + { + "epoch": 3.234433209153805, + "grad_norm": 0.2749694883823395, + "learning_rate": 5.649645486193435e-08, + "loss": 0.1557, + "step": 12155 + }, + { + "epoch": 3.234699308142629, + "grad_norm": 0.2773599922657013, + "learning_rate": 5.6481248881576136e-08, + "loss": 0.175, + "step": 12156 + }, + { + "epoch": 3.234965407131453, + "grad_norm": 0.26839491724967957, + "learning_rate": 5.6466044142461146e-08, + "loss": 0.1486, + "step": 12157 + }, + { + "epoch": 3.2352315061202765, + "grad_norm": 0.32497698068618774, + "learning_rate": 5.6450840645023036e-08, + "loss": 0.187, + "step": 12158 + }, + { + "epoch": 3.2354976051091007, + "grad_norm": 0.2898588180541992, + "learning_rate": 5.643563838969545e-08, + "loss": 0.1851, + "step": 12159 + }, + { + "epoch": 3.2357637040979244, + "grad_norm": 0.26597246527671814, + "learning_rate": 5.642043737691202e-08, + "loss": 0.1704, + "step": 12160 + }, + { + "epoch": 3.2360298030867485, + "grad_norm": 0.2972193658351898, + "learning_rate": 5.6405237607106247e-08, + "loss": 0.19, + "step": 12161 + }, + { + "epoch": 3.236295902075572, + "grad_norm": 0.5122739672660828, + "learning_rate": 5.639003908071172e-08, + "loss": 0.1766, + "step": 12162 + }, + { + "epoch": 3.236562001064396, + "grad_norm": 0.32992568612098694, + "learning_rate": 5.637484179816185e-08, + "loss": 0.1894, + "step": 12163 + }, + { + "epoch": 3.23682810005322, + "grad_norm": 0.3413207232952118, + "learning_rate": 5.635964575989015e-08, + "loss": 0.1682, + "step": 12164 + }, + { + "epoch": 3.2370941990420437, + "grad_norm": 0.27973154187202454, + "learning_rate": 5.634445096633002e-08, + "loss": 0.1787, + "step": 12165 + }, + { + "epoch": 3.2373602980308673, + "grad_norm": 0.2662612199783325, + "learning_rate": 5.632925741791492e-08, + "loss": 0.1727, + "step": 12166 + }, + { + "epoch": 3.2376263970196915, + "grad_norm": 0.30647003650665283, + "learning_rate": 5.63140651150781e-08, + "loss": 0.1969, + "step": 12167 + }, + { + "epoch": 3.237892496008515, + "grad_norm": 0.46083372831344604, + "learning_rate": 5.629887405825296e-08, + "loss": 0.1913, + "step": 12168 + }, + { + "epoch": 3.238158594997339, + "grad_norm": 0.35882678627967834, + "learning_rate": 5.6283684247872686e-08, + "loss": 0.175, + "step": 12169 + }, + { + "epoch": 3.238424693986163, + "grad_norm": 0.28776296973228455, + "learning_rate": 5.6268495684370606e-08, + "loss": 0.1673, + "step": 12170 + }, + { + "epoch": 3.2386907929749866, + "grad_norm": 0.29541441798210144, + "learning_rate": 5.625330836817983e-08, + "loss": 0.1796, + "step": 12171 + }, + { + "epoch": 3.2389568919638103, + "grad_norm": 0.32916197180747986, + "learning_rate": 5.623812229973367e-08, + "loss": 0.1746, + "step": 12172 + }, + { + "epoch": 3.2392229909526344, + "grad_norm": 0.2906399667263031, + "learning_rate": 5.622293747946516e-08, + "loss": 0.176, + "step": 12173 + }, + { + "epoch": 3.239489089941458, + "grad_norm": 0.37830162048339844, + "learning_rate": 5.6207753907807455e-08, + "loss": 0.1744, + "step": 12174 + }, + { + "epoch": 3.2397551889302822, + "grad_norm": 0.27390462160110474, + "learning_rate": 5.6192571585193574e-08, + "loss": 0.1867, + "step": 12175 + }, + { + "epoch": 3.240021287919106, + "grad_norm": 0.3001086115837097, + "learning_rate": 5.617739051205661e-08, + "loss": 0.1901, + "step": 12176 + }, + { + "epoch": 3.2402873869079296, + "grad_norm": 0.365885466337204, + "learning_rate": 5.616221068882947e-08, + "loss": 0.1736, + "step": 12177 + }, + { + "epoch": 3.2405534858967537, + "grad_norm": 0.3953106105327606, + "learning_rate": 5.614703211594518e-08, + "loss": 0.1886, + "step": 12178 + }, + { + "epoch": 3.2408195848855774, + "grad_norm": 0.2817712128162384, + "learning_rate": 5.6131854793836634e-08, + "loss": 0.1846, + "step": 12179 + }, + { + "epoch": 3.241085683874401, + "grad_norm": 0.2736819088459015, + "learning_rate": 5.611667872293674e-08, + "loss": 0.1723, + "step": 12180 + }, + { + "epoch": 3.2413517828632252, + "grad_norm": 0.26593995094299316, + "learning_rate": 5.610150390367838e-08, + "loss": 0.1693, + "step": 12181 + }, + { + "epoch": 3.241617881852049, + "grad_norm": 0.370404452085495, + "learning_rate": 5.60863303364943e-08, + "loss": 0.1792, + "step": 12182 + }, + { + "epoch": 3.241883980840873, + "grad_norm": 0.27447646856307983, + "learning_rate": 5.607115802181734e-08, + "loss": 0.1768, + "step": 12183 + }, + { + "epoch": 3.2421500798296967, + "grad_norm": 0.27325326204299927, + "learning_rate": 5.6055986960080206e-08, + "loss": 0.1825, + "step": 12184 + }, + { + "epoch": 3.2424161788185204, + "grad_norm": 0.3397376239299774, + "learning_rate": 5.60408171517156e-08, + "loss": 0.1654, + "step": 12185 + }, + { + "epoch": 3.2426822778073445, + "grad_norm": 0.27804267406463623, + "learning_rate": 5.602564859715624e-08, + "loss": 0.1786, + "step": 12186 + }, + { + "epoch": 3.242948376796168, + "grad_norm": 0.36954641342163086, + "learning_rate": 5.601048129683476e-08, + "loss": 0.1968, + "step": 12187 + }, + { + "epoch": 3.243214475784992, + "grad_norm": 0.2778235077857971, + "learning_rate": 5.599531525118373e-08, + "loss": 0.1668, + "step": 12188 + }, + { + "epoch": 3.243480574773816, + "grad_norm": 0.265949547290802, + "learning_rate": 5.5980150460635744e-08, + "loss": 0.1811, + "step": 12189 + }, + { + "epoch": 3.2437466737626397, + "grad_norm": 0.3025849461555481, + "learning_rate": 5.596498692562329e-08, + "loss": 0.1837, + "step": 12190 + }, + { + "epoch": 3.2440127727514634, + "grad_norm": 0.2768423557281494, + "learning_rate": 5.594982464657894e-08, + "loss": 0.1752, + "step": 12191 + }, + { + "epoch": 3.2442788717402875, + "grad_norm": 0.2748223543167114, + "learning_rate": 5.593466362393505e-08, + "loss": 0.1679, + "step": 12192 + }, + { + "epoch": 3.244544970729111, + "grad_norm": 0.2620415687561035, + "learning_rate": 5.591950385812412e-08, + "loss": 0.1509, + "step": 12193 + }, + { + "epoch": 3.244811069717935, + "grad_norm": 0.3872402310371399, + "learning_rate": 5.5904345349578494e-08, + "loss": 0.1793, + "step": 12194 + }, + { + "epoch": 3.245077168706759, + "grad_norm": 0.3441896438598633, + "learning_rate": 5.5889188098730556e-08, + "loss": 0.1874, + "step": 12195 + }, + { + "epoch": 3.2453432676955827, + "grad_norm": 0.262836217880249, + "learning_rate": 5.587403210601265e-08, + "loss": 0.1671, + "step": 12196 + }, + { + "epoch": 3.245609366684407, + "grad_norm": 0.2592092752456665, + "learning_rate": 5.5858877371856974e-08, + "loss": 0.1688, + "step": 12197 + }, + { + "epoch": 3.2458754656732305, + "grad_norm": 0.26496055722236633, + "learning_rate": 5.584372389669585e-08, + "loss": 0.1668, + "step": 12198 + }, + { + "epoch": 3.246141564662054, + "grad_norm": 0.3308366537094116, + "learning_rate": 5.582857168096142e-08, + "loss": 0.1675, + "step": 12199 + }, + { + "epoch": 3.2464076636508783, + "grad_norm": 0.2887897491455078, + "learning_rate": 5.581342072508588e-08, + "loss": 0.179, + "step": 12200 + }, + { + "epoch": 3.246673762639702, + "grad_norm": 0.3812142014503479, + "learning_rate": 5.579827102950139e-08, + "loss": 0.1975, + "step": 12201 + }, + { + "epoch": 3.2469398616285257, + "grad_norm": 0.2616729140281677, + "learning_rate": 5.578312259464004e-08, + "loss": 0.1679, + "step": 12202 + }, + { + "epoch": 3.24720596061735, + "grad_norm": 0.3835742175579071, + "learning_rate": 5.576797542093388e-08, + "loss": 0.1863, + "step": 12203 + }, + { + "epoch": 3.2474720596061735, + "grad_norm": 0.2561110258102417, + "learning_rate": 5.575282950881496e-08, + "loss": 0.1571, + "step": 12204 + }, + { + "epoch": 3.247738158594997, + "grad_norm": 0.43797165155410767, + "learning_rate": 5.573768485871523e-08, + "loss": 0.1853, + "step": 12205 + }, + { + "epoch": 3.2480042575838213, + "grad_norm": 0.44307076930999756, + "learning_rate": 5.57225414710667e-08, + "loss": 0.1863, + "step": 12206 + }, + { + "epoch": 3.248270356572645, + "grad_norm": 0.32886162400245667, + "learning_rate": 5.570739934630124e-08, + "loss": 0.1924, + "step": 12207 + }, + { + "epoch": 3.248536455561469, + "grad_norm": 0.28882890939712524, + "learning_rate": 5.5692258484850754e-08, + "loss": 0.1732, + "step": 12208 + }, + { + "epoch": 3.2488025545502928, + "grad_norm": 0.28707683086395264, + "learning_rate": 5.567711888714708e-08, + "loss": 0.1736, + "step": 12209 + }, + { + "epoch": 3.2490686535391164, + "grad_norm": 0.28362029790878296, + "learning_rate": 5.5661980553622103e-08, + "loss": 0.1774, + "step": 12210 + }, + { + "epoch": 3.2493347525279406, + "grad_norm": 0.2651776671409607, + "learning_rate": 5.5646843484707485e-08, + "loss": 0.1738, + "step": 12211 + }, + { + "epoch": 3.2496008515167643, + "grad_norm": 0.4271720051765442, + "learning_rate": 5.563170768083507e-08, + "loss": 0.1911, + "step": 12212 + }, + { + "epoch": 3.249866950505588, + "grad_norm": 0.27576228976249695, + "learning_rate": 5.5616573142436465e-08, + "loss": 0.1793, + "step": 12213 + }, + { + "epoch": 3.250133049494412, + "grad_norm": 0.3454083204269409, + "learning_rate": 5.560143986994339e-08, + "loss": 0.1707, + "step": 12214 + }, + { + "epoch": 3.2503991484832357, + "grad_norm": 0.2880803942680359, + "learning_rate": 5.5586307863787465e-08, + "loss": 0.1742, + "step": 12215 + }, + { + "epoch": 3.2506652474720594, + "grad_norm": 0.35345280170440674, + "learning_rate": 5.557117712440029e-08, + "loss": 0.202, + "step": 12216 + }, + { + "epoch": 3.2509313464608836, + "grad_norm": 0.28534796833992004, + "learning_rate": 5.5556047652213486e-08, + "loss": 0.1867, + "step": 12217 + }, + { + "epoch": 3.2511974454497072, + "grad_norm": 0.402561217546463, + "learning_rate": 5.5540919447658466e-08, + "loss": 0.2039, + "step": 12218 + }, + { + "epoch": 3.251463544438531, + "grad_norm": 0.262922465801239, + "learning_rate": 5.5525792511166805e-08, + "loss": 0.168, + "step": 12219 + }, + { + "epoch": 3.251729643427355, + "grad_norm": 0.28547921776771545, + "learning_rate": 5.5510666843169894e-08, + "loss": 0.1851, + "step": 12220 + }, + { + "epoch": 3.2519957424161787, + "grad_norm": 0.4137563407421112, + "learning_rate": 5.54955424440992e-08, + "loss": 0.17, + "step": 12221 + }, + { + "epoch": 3.252261841405003, + "grad_norm": 0.2796858251094818, + "learning_rate": 5.548041931438598e-08, + "loss": 0.1781, + "step": 12222 + }, + { + "epoch": 3.2525279403938265, + "grad_norm": 0.4503832757472992, + "learning_rate": 5.546529745446177e-08, + "loss": 0.1833, + "step": 12223 + }, + { + "epoch": 3.25279403938265, + "grad_norm": 0.3639169931411743, + "learning_rate": 5.545017686475773e-08, + "loss": 0.1631, + "step": 12224 + }, + { + "epoch": 3.2530601383714743, + "grad_norm": 0.30816957354545593, + "learning_rate": 5.543505754570521e-08, + "loss": 0.1905, + "step": 12225 + }, + { + "epoch": 3.253326237360298, + "grad_norm": 0.2977733314037323, + "learning_rate": 5.5419939497735366e-08, + "loss": 0.1744, + "step": 12226 + }, + { + "epoch": 3.2535923363491217, + "grad_norm": 0.3376561403274536, + "learning_rate": 5.5404822721279486e-08, + "loss": 0.1857, + "step": 12227 + }, + { + "epoch": 3.253858435337946, + "grad_norm": 0.26951542496681213, + "learning_rate": 5.538970721676863e-08, + "loss": 0.1587, + "step": 12228 + }, + { + "epoch": 3.2541245343267695, + "grad_norm": 0.3487564027309418, + "learning_rate": 5.5374592984634006e-08, + "loss": 0.1682, + "step": 12229 + }, + { + "epoch": 3.2543906333155936, + "grad_norm": 0.4550474286079407, + "learning_rate": 5.535948002530665e-08, + "loss": 0.2036, + "step": 12230 + }, + { + "epoch": 3.2546567323044173, + "grad_norm": 0.2855183780193329, + "learning_rate": 5.534436833921764e-08, + "loss": 0.1627, + "step": 12231 + }, + { + "epoch": 3.254922831293241, + "grad_norm": 0.284066766500473, + "learning_rate": 5.532925792679803e-08, + "loss": 0.184, + "step": 12232 + }, + { + "epoch": 3.255188930282065, + "grad_norm": 0.296865314245224, + "learning_rate": 5.531414878847873e-08, + "loss": 0.1653, + "step": 12233 + }, + { + "epoch": 3.255455029270889, + "grad_norm": 0.43716931343078613, + "learning_rate": 5.5299040924690745e-08, + "loss": 0.1804, + "step": 12234 + }, + { + "epoch": 3.2557211282597125, + "grad_norm": 0.2975432872772217, + "learning_rate": 5.5283934335864915e-08, + "loss": 0.1888, + "step": 12235 + }, + { + "epoch": 3.2559872272485366, + "grad_norm": 0.3271636962890625, + "learning_rate": 5.526882902243213e-08, + "loss": 0.1795, + "step": 12236 + }, + { + "epoch": 3.2562533262373603, + "grad_norm": 0.26031142473220825, + "learning_rate": 5.525372498482325e-08, + "loss": 0.1716, + "step": 12237 + }, + { + "epoch": 3.256519425226184, + "grad_norm": 0.34192442893981934, + "learning_rate": 5.52386222234691e-08, + "loss": 0.1797, + "step": 12238 + }, + { + "epoch": 3.256785524215008, + "grad_norm": 0.4130774140357971, + "learning_rate": 5.5223520738800366e-08, + "loss": 0.1828, + "step": 12239 + }, + { + "epoch": 3.257051623203832, + "grad_norm": 0.45956748723983765, + "learning_rate": 5.520842053124784e-08, + "loss": 0.2009, + "step": 12240 + }, + { + "epoch": 3.2573177221926555, + "grad_norm": 0.26391881704330444, + "learning_rate": 5.519332160124215e-08, + "loss": 0.167, + "step": 12241 + }, + { + "epoch": 3.2575838211814796, + "grad_norm": 0.2891414761543274, + "learning_rate": 5.5178223949214033e-08, + "loss": 0.1856, + "step": 12242 + }, + { + "epoch": 3.2578499201703033, + "grad_norm": 0.29292964935302734, + "learning_rate": 5.516312757559399e-08, + "loss": 0.1646, + "step": 12243 + }, + { + "epoch": 3.258116019159127, + "grad_norm": 0.31613433361053467, + "learning_rate": 5.5148032480812676e-08, + "loss": 0.1775, + "step": 12244 + }, + { + "epoch": 3.258382118147951, + "grad_norm": 0.2796704173088074, + "learning_rate": 5.51329386653006e-08, + "loss": 0.1882, + "step": 12245 + }, + { + "epoch": 3.2586482171367748, + "grad_norm": 0.28205540776252747, + "learning_rate": 5.511784612948836e-08, + "loss": 0.198, + "step": 12246 + }, + { + "epoch": 3.258914316125599, + "grad_norm": 0.294421523809433, + "learning_rate": 5.510275487380628e-08, + "loss": 0.1692, + "step": 12247 + }, + { + "epoch": 3.2591804151144226, + "grad_norm": 0.2782621383666992, + "learning_rate": 5.5087664898684926e-08, + "loss": 0.1833, + "step": 12248 + }, + { + "epoch": 3.2594465141032463, + "grad_norm": 0.2594435214996338, + "learning_rate": 5.507257620455459e-08, + "loss": 0.1737, + "step": 12249 + }, + { + "epoch": 3.2597126130920704, + "grad_norm": 0.2681294083595276, + "learning_rate": 5.5057488791845665e-08, + "loss": 0.1751, + "step": 12250 + }, + { + "epoch": 3.259978712080894, + "grad_norm": 0.32832759618759155, + "learning_rate": 5.5042402660988506e-08, + "loss": 0.1724, + "step": 12251 + }, + { + "epoch": 3.2602448110697178, + "grad_norm": 0.3361770212650299, + "learning_rate": 5.5027317812413366e-08, + "loss": 0.1876, + "step": 12252 + }, + { + "epoch": 3.260510910058542, + "grad_norm": 0.7259566187858582, + "learning_rate": 5.501223424655057e-08, + "loss": 0.1888, + "step": 12253 + }, + { + "epoch": 3.2607770090473656, + "grad_norm": 0.27473321557044983, + "learning_rate": 5.499715196383021e-08, + "loss": 0.1769, + "step": 12254 + }, + { + "epoch": 3.2610431080361897, + "grad_norm": 0.33360955119132996, + "learning_rate": 5.498207096468259e-08, + "loss": 0.1794, + "step": 12255 + }, + { + "epoch": 3.2613092070250134, + "grad_norm": 0.3296770453453064, + "learning_rate": 5.496699124953773e-08, + "loss": 0.1684, + "step": 12256 + }, + { + "epoch": 3.261575306013837, + "grad_norm": 0.26050838828086853, + "learning_rate": 5.495191281882583e-08, + "loss": 0.1653, + "step": 12257 + }, + { + "epoch": 3.261841405002661, + "grad_norm": 0.29281771183013916, + "learning_rate": 5.493683567297689e-08, + "loss": 0.1654, + "step": 12258 + }, + { + "epoch": 3.262107503991485, + "grad_norm": 0.26721230149269104, + "learning_rate": 5.4921759812420964e-08, + "loss": 0.1707, + "step": 12259 + }, + { + "epoch": 3.2623736029803085, + "grad_norm": 0.312137246131897, + "learning_rate": 5.4906685237588055e-08, + "loss": 0.1633, + "step": 12260 + }, + { + "epoch": 3.2626397019691327, + "grad_norm": 0.36791306734085083, + "learning_rate": 5.4891611948908145e-08, + "loss": 0.185, + "step": 12261 + }, + { + "epoch": 3.2629058009579563, + "grad_norm": 0.34376025199890137, + "learning_rate": 5.48765399468111e-08, + "loss": 0.1767, + "step": 12262 + }, + { + "epoch": 3.26317189994678, + "grad_norm": 0.3116283118724823, + "learning_rate": 5.486146923172688e-08, + "loss": 0.1794, + "step": 12263 + }, + { + "epoch": 3.263437998935604, + "grad_norm": 0.295680969953537, + "learning_rate": 5.484639980408522e-08, + "loss": 0.1906, + "step": 12264 + }, + { + "epoch": 3.263704097924428, + "grad_norm": 0.29226598143577576, + "learning_rate": 5.4831331664315995e-08, + "loss": 0.1775, + "step": 12265 + }, + { + "epoch": 3.2639701969132515, + "grad_norm": 0.26483801007270813, + "learning_rate": 5.4816264812849e-08, + "loss": 0.1716, + "step": 12266 + }, + { + "epoch": 3.2642362959020756, + "grad_norm": 0.2514621615409851, + "learning_rate": 5.480119925011397e-08, + "loss": 0.1673, + "step": 12267 + }, + { + "epoch": 3.2645023948908993, + "grad_norm": 0.36085209250450134, + "learning_rate": 5.4786134976540554e-08, + "loss": 0.1884, + "step": 12268 + }, + { + "epoch": 3.264768493879723, + "grad_norm": 0.33434611558914185, + "learning_rate": 5.4771071992558436e-08, + "loss": 0.191, + "step": 12269 + }, + { + "epoch": 3.265034592868547, + "grad_norm": 0.35239917039871216, + "learning_rate": 5.475601029859731e-08, + "loss": 0.1661, + "step": 12270 + }, + { + "epoch": 3.265300691857371, + "grad_norm": 0.3077377378940582, + "learning_rate": 5.474094989508666e-08, + "loss": 0.1703, + "step": 12271 + }, + { + "epoch": 3.265566790846195, + "grad_norm": 0.2630208432674408, + "learning_rate": 5.472589078245614e-08, + "loss": 0.1751, + "step": 12272 + }, + { + "epoch": 3.2658328898350186, + "grad_norm": 0.3527478873729706, + "learning_rate": 5.471083296113513e-08, + "loss": 0.1657, + "step": 12273 + }, + { + "epoch": 3.2660989888238423, + "grad_norm": 0.3834361732006073, + "learning_rate": 5.469577643155328e-08, + "loss": 0.1857, + "step": 12274 + }, + { + "epoch": 3.2663650878126664, + "grad_norm": 0.3663342595100403, + "learning_rate": 5.468072119413991e-08, + "loss": 0.1737, + "step": 12275 + }, + { + "epoch": 3.26663118680149, + "grad_norm": 0.4026341140270233, + "learning_rate": 5.4665667249324496e-08, + "loss": 0.1708, + "step": 12276 + }, + { + "epoch": 3.2668972857903142, + "grad_norm": 0.3051842153072357, + "learning_rate": 5.465061459753635e-08, + "loss": 0.1695, + "step": 12277 + }, + { + "epoch": 3.267163384779138, + "grad_norm": 0.3499433696269989, + "learning_rate": 5.4635563239204854e-08, + "loss": 0.1651, + "step": 12278 + }, + { + "epoch": 3.2674294837679616, + "grad_norm": 0.2731682062149048, + "learning_rate": 5.462051317475924e-08, + "loss": 0.1683, + "step": 12279 + }, + { + "epoch": 3.2676955827567857, + "grad_norm": 0.27208811044692993, + "learning_rate": 5.460546440462881e-08, + "loss": 0.175, + "step": 12280 + }, + { + "epoch": 3.2679616817456094, + "grad_norm": 0.352323442697525, + "learning_rate": 5.459041692924279e-08, + "loss": 0.1744, + "step": 12281 + }, + { + "epoch": 3.268227780734433, + "grad_norm": 0.2692272365093231, + "learning_rate": 5.457537074903038e-08, + "loss": 0.174, + "step": 12282 + }, + { + "epoch": 3.2684938797232572, + "grad_norm": 0.2561049461364746, + "learning_rate": 5.4560325864420675e-08, + "loss": 0.1746, + "step": 12283 + }, + { + "epoch": 3.268759978712081, + "grad_norm": 0.2763020992279053, + "learning_rate": 5.454528227584285e-08, + "loss": 0.169, + "step": 12284 + }, + { + "epoch": 3.2690260777009046, + "grad_norm": 0.28289595246315, + "learning_rate": 5.453023998372591e-08, + "loss": 0.1808, + "step": 12285 + }, + { + "epoch": 3.2692921766897287, + "grad_norm": 0.28134453296661377, + "learning_rate": 5.451519898849892e-08, + "loss": 0.1704, + "step": 12286 + }, + { + "epoch": 3.2695582756785524, + "grad_norm": 0.6001635789871216, + "learning_rate": 5.4500159290590885e-08, + "loss": 0.1632, + "step": 12287 + }, + { + "epoch": 3.269824374667376, + "grad_norm": 0.3001920282840729, + "learning_rate": 5.448512089043076e-08, + "loss": 0.1914, + "step": 12288 + }, + { + "epoch": 3.2700904736562, + "grad_norm": 0.2929721176624298, + "learning_rate": 5.447008378844753e-08, + "loss": 0.1747, + "step": 12289 + }, + { + "epoch": 3.270356572645024, + "grad_norm": 0.29536664485931396, + "learning_rate": 5.445504798506998e-08, + "loss": 0.1783, + "step": 12290 + }, + { + "epoch": 3.2706226716338476, + "grad_norm": 0.3485582172870636, + "learning_rate": 5.444001348072707e-08, + "loss": 0.1732, + "step": 12291 + }, + { + "epoch": 3.2708887706226717, + "grad_norm": 0.2843112051486969, + "learning_rate": 5.4424980275847495e-08, + "loss": 0.1811, + "step": 12292 + }, + { + "epoch": 3.2711548696114954, + "grad_norm": 0.3481845557689667, + "learning_rate": 5.4409948370860146e-08, + "loss": 0.195, + "step": 12293 + }, + { + "epoch": 3.2714209686003195, + "grad_norm": 0.2699359655380249, + "learning_rate": 5.439491776619367e-08, + "loss": 0.1666, + "step": 12294 + }, + { + "epoch": 3.271687067589143, + "grad_norm": 0.26507607102394104, + "learning_rate": 5.437988846227681e-08, + "loss": 0.1733, + "step": 12295 + }, + { + "epoch": 3.271953166577967, + "grad_norm": 0.269160658121109, + "learning_rate": 5.4364860459538256e-08, + "loss": 0.1785, + "step": 12296 + }, + { + "epoch": 3.272219265566791, + "grad_norm": 0.2644832730293274, + "learning_rate": 5.4349833758406636e-08, + "loss": 0.1789, + "step": 12297 + }, + { + "epoch": 3.2724853645556147, + "grad_norm": 0.2651659846305847, + "learning_rate": 5.433480835931049e-08, + "loss": 0.1613, + "step": 12298 + }, + { + "epoch": 3.2727514635444384, + "grad_norm": 0.33256277441978455, + "learning_rate": 5.4319784262678446e-08, + "loss": 0.1904, + "step": 12299 + }, + { + "epoch": 3.2730175625332625, + "grad_norm": 0.30112719535827637, + "learning_rate": 5.430476146893894e-08, + "loss": 0.1693, + "step": 12300 + }, + { + "epoch": 3.273283661522086, + "grad_norm": 0.2930763065814972, + "learning_rate": 5.42897399785205e-08, + "loss": 0.179, + "step": 12301 + }, + { + "epoch": 3.2735497605109103, + "grad_norm": 0.44418036937713623, + "learning_rate": 5.4274719791851564e-08, + "loss": 0.2084, + "step": 12302 + }, + { + "epoch": 3.273815859499734, + "grad_norm": 0.33770036697387695, + "learning_rate": 5.4259700909360564e-08, + "loss": 0.1672, + "step": 12303 + }, + { + "epoch": 3.2740819584885577, + "grad_norm": 0.2822202444076538, + "learning_rate": 5.4244683331475804e-08, + "loss": 0.1776, + "step": 12304 + }, + { + "epoch": 3.274348057477382, + "grad_norm": 0.32127103209495544, + "learning_rate": 5.422966705862567e-08, + "loss": 0.1789, + "step": 12305 + }, + { + "epoch": 3.2746141564662055, + "grad_norm": 0.2565620541572571, + "learning_rate": 5.421465209123848e-08, + "loss": 0.1699, + "step": 12306 + }, + { + "epoch": 3.274880255455029, + "grad_norm": 0.30692413449287415, + "learning_rate": 5.4199638429742403e-08, + "loss": 0.1995, + "step": 12307 + }, + { + "epoch": 3.2751463544438533, + "grad_norm": 0.3336438238620758, + "learning_rate": 5.418462607456575e-08, + "loss": 0.1831, + "step": 12308 + }, + { + "epoch": 3.275412453432677, + "grad_norm": 0.2839397192001343, + "learning_rate": 5.416961502613662e-08, + "loss": 0.1623, + "step": 12309 + }, + { + "epoch": 3.2756785524215006, + "grad_norm": 0.38614290952682495, + "learning_rate": 5.41546052848832e-08, + "loss": 0.1797, + "step": 12310 + }, + { + "epoch": 3.2759446514103248, + "grad_norm": 0.4095298647880554, + "learning_rate": 5.413959685123361e-08, + "loss": 0.1932, + "step": 12311 + }, + { + "epoch": 3.2762107503991484, + "grad_norm": 0.2853960394859314, + "learning_rate": 5.4124589725615944e-08, + "loss": 0.1937, + "step": 12312 + }, + { + "epoch": 3.276476849387972, + "grad_norm": 0.29530254006385803, + "learning_rate": 5.4109583908458164e-08, + "loss": 0.2022, + "step": 12313 + }, + { + "epoch": 3.2767429483767962, + "grad_norm": 0.44075775146484375, + "learning_rate": 5.409457940018834e-08, + "loss": 0.1915, + "step": 12314 + }, + { + "epoch": 3.27700904736562, + "grad_norm": 0.29890695214271545, + "learning_rate": 5.4079576201234364e-08, + "loss": 0.1859, + "step": 12315 + }, + { + "epoch": 3.2772751463544436, + "grad_norm": 0.3236764073371887, + "learning_rate": 5.406457431202418e-08, + "loss": 0.1822, + "step": 12316 + }, + { + "epoch": 3.2775412453432677, + "grad_norm": 0.42694348096847534, + "learning_rate": 5.404957373298569e-08, + "loss": 0.1942, + "step": 12317 + }, + { + "epoch": 3.2778073443320914, + "grad_norm": 0.3497564196586609, + "learning_rate": 5.403457446454678e-08, + "loss": 0.1876, + "step": 12318 + }, + { + "epoch": 3.2780734433209155, + "grad_norm": 0.35944435000419617, + "learning_rate": 5.4019576507135156e-08, + "loss": 0.1738, + "step": 12319 + }, + { + "epoch": 3.2783395423097392, + "grad_norm": 0.32614579796791077, + "learning_rate": 5.40045798611787e-08, + "loss": 0.1675, + "step": 12320 + }, + { + "epoch": 3.278605641298563, + "grad_norm": 0.2904215157032013, + "learning_rate": 5.3989584527105045e-08, + "loss": 0.1724, + "step": 12321 + }, + { + "epoch": 3.278871740287387, + "grad_norm": 0.33574384450912476, + "learning_rate": 5.3974590505341974e-08, + "loss": 0.1764, + "step": 12322 + }, + { + "epoch": 3.2791378392762107, + "grad_norm": 0.33882543444633484, + "learning_rate": 5.395959779631707e-08, + "loss": 0.195, + "step": 12323 + }, + { + "epoch": 3.2794039382650344, + "grad_norm": 0.31561994552612305, + "learning_rate": 5.394460640045797e-08, + "loss": 0.1732, + "step": 12324 + }, + { + "epoch": 3.2796700372538585, + "grad_norm": 0.36800315976142883, + "learning_rate": 5.392961631819235e-08, + "loss": 0.1755, + "step": 12325 + }, + { + "epoch": 3.279936136242682, + "grad_norm": 0.3000093996524811, + "learning_rate": 5.391462754994767e-08, + "loss": 0.1648, + "step": 12326 + }, + { + "epoch": 3.2802022352315063, + "grad_norm": 0.36822667717933655, + "learning_rate": 5.389964009615148e-08, + "loss": 0.1789, + "step": 12327 + }, + { + "epoch": 3.28046833422033, + "grad_norm": 0.2855810821056366, + "learning_rate": 5.388465395723121e-08, + "loss": 0.1764, + "step": 12328 + }, + { + "epoch": 3.2807344332091537, + "grad_norm": 0.2697422206401825, + "learning_rate": 5.386966913361435e-08, + "loss": 0.1634, + "step": 12329 + }, + { + "epoch": 3.281000532197978, + "grad_norm": 0.30241942405700684, + "learning_rate": 5.385468562572822e-08, + "loss": 0.1813, + "step": 12330 + }, + { + "epoch": 3.2812666311868015, + "grad_norm": 0.2796487510204315, + "learning_rate": 5.383970343400024e-08, + "loss": 0.1829, + "step": 12331 + }, + { + "epoch": 3.281532730175625, + "grad_norm": 0.31943777203559875, + "learning_rate": 5.3824722558857725e-08, + "loss": 0.1866, + "step": 12332 + }, + { + "epoch": 3.2817988291644493, + "grad_norm": 0.32562750577926636, + "learning_rate": 5.380974300072799e-08, + "loss": 0.1967, + "step": 12333 + }, + { + "epoch": 3.282064928153273, + "grad_norm": 0.3432185649871826, + "learning_rate": 5.3794764760038214e-08, + "loss": 0.1854, + "step": 12334 + }, + { + "epoch": 3.2823310271420967, + "grad_norm": 0.2785980999469757, + "learning_rate": 5.377978783721566e-08, + "loss": 0.167, + "step": 12335 + }, + { + "epoch": 3.282597126130921, + "grad_norm": 0.2638835906982422, + "learning_rate": 5.376481223268746e-08, + "loss": 0.1611, + "step": 12336 + }, + { + "epoch": 3.2828632251197445, + "grad_norm": 0.249586284160614, + "learning_rate": 5.3749837946880773e-08, + "loss": 0.1587, + "step": 12337 + }, + { + "epoch": 3.283129324108568, + "grad_norm": 0.29886144399642944, + "learning_rate": 5.3734864980222685e-08, + "loss": 0.1931, + "step": 12338 + }, + { + "epoch": 3.2833954230973923, + "grad_norm": 0.2883365750312805, + "learning_rate": 5.37198933331403e-08, + "loss": 0.1762, + "step": 12339 + }, + { + "epoch": 3.283661522086216, + "grad_norm": 0.3807225525379181, + "learning_rate": 5.3704923006060556e-08, + "loss": 0.1854, + "step": 12340 + }, + { + "epoch": 3.28392762107504, + "grad_norm": 0.3582896888256073, + "learning_rate": 5.3689953999410496e-08, + "loss": 0.1696, + "step": 12341 + }, + { + "epoch": 3.284193720063864, + "grad_norm": 0.27915412187576294, + "learning_rate": 5.367498631361709e-08, + "loss": 0.1773, + "step": 12342 + }, + { + "epoch": 3.2844598190526875, + "grad_norm": 0.3173505663871765, + "learning_rate": 5.366001994910718e-08, + "loss": 0.1868, + "step": 12343 + }, + { + "epoch": 3.2847259180415116, + "grad_norm": 0.28783661127090454, + "learning_rate": 5.3645054906307696e-08, + "loss": 0.1841, + "step": 12344 + }, + { + "epoch": 3.2849920170303353, + "grad_norm": 0.2790614664554596, + "learning_rate": 5.363009118564541e-08, + "loss": 0.1867, + "step": 12345 + }, + { + "epoch": 3.285258116019159, + "grad_norm": 0.26471206545829773, + "learning_rate": 5.361512878754715e-08, + "loss": 0.1635, + "step": 12346 + }, + { + "epoch": 3.285524215007983, + "grad_norm": 0.25475022196769714, + "learning_rate": 5.360016771243967e-08, + "loss": 0.1663, + "step": 12347 + }, + { + "epoch": 3.2857903139968068, + "grad_norm": 0.2833635210990906, + "learning_rate": 5.3585207960749744e-08, + "loss": 0.1936, + "step": 12348 + }, + { + "epoch": 3.286056412985631, + "grad_norm": 0.26395395398139954, + "learning_rate": 5.357024953290397e-08, + "loss": 0.1807, + "step": 12349 + }, + { + "epoch": 3.2863225119744546, + "grad_norm": 0.3453965485095978, + "learning_rate": 5.355529242932906e-08, + "loss": 0.1874, + "step": 12350 + }, + { + "epoch": 3.2865886109632783, + "grad_norm": 0.2569204866886139, + "learning_rate": 5.354033665045156e-08, + "loss": 0.1571, + "step": 12351 + }, + { + "epoch": 3.2868547099521024, + "grad_norm": 0.34104666113853455, + "learning_rate": 5.3525382196698064e-08, + "loss": 0.1851, + "step": 12352 + }, + { + "epoch": 3.287120808940926, + "grad_norm": 0.2841075360774994, + "learning_rate": 5.35104290684951e-08, + "loss": 0.1788, + "step": 12353 + }, + { + "epoch": 3.2873869079297497, + "grad_norm": 0.36823850870132446, + "learning_rate": 5.349547726626922e-08, + "loss": 0.189, + "step": 12354 + }, + { + "epoch": 3.287653006918574, + "grad_norm": 0.28802773356437683, + "learning_rate": 5.348052679044679e-08, + "loss": 0.1712, + "step": 12355 + }, + { + "epoch": 3.2879191059073976, + "grad_norm": 0.35482412576675415, + "learning_rate": 5.34655776414543e-08, + "loss": 0.1683, + "step": 12356 + }, + { + "epoch": 3.2881852048962212, + "grad_norm": 0.27798858284950256, + "learning_rate": 5.3450629819718065e-08, + "loss": 0.1662, + "step": 12357 + }, + { + "epoch": 3.2884513038850454, + "grad_norm": 0.26427415013313293, + "learning_rate": 5.3435683325664506e-08, + "loss": 0.1735, + "step": 12358 + }, + { + "epoch": 3.288717402873869, + "grad_norm": 0.2573038637638092, + "learning_rate": 5.3420738159719836e-08, + "loss": 0.1591, + "step": 12359 + }, + { + "epoch": 3.2889835018626927, + "grad_norm": 0.35340189933776855, + "learning_rate": 5.340579432231036e-08, + "loss": 0.1801, + "step": 12360 + }, + { + "epoch": 3.289249600851517, + "grad_norm": 0.297355979681015, + "learning_rate": 5.339085181386232e-08, + "loss": 0.1698, + "step": 12361 + }, + { + "epoch": 3.2895156998403405, + "grad_norm": 0.2655862867832184, + "learning_rate": 5.33759106348019e-08, + "loss": 0.1772, + "step": 12362 + }, + { + "epoch": 3.289781798829164, + "grad_norm": 0.35836049914360046, + "learning_rate": 5.336097078555529e-08, + "loss": 0.1837, + "step": 12363 + }, + { + "epoch": 3.2900478978179883, + "grad_norm": 0.24989573657512665, + "learning_rate": 5.3346032266548534e-08, + "loss": 0.1601, + "step": 12364 + }, + { + "epoch": 3.290313996806812, + "grad_norm": 0.26763176918029785, + "learning_rate": 5.3331095078207786e-08, + "loss": 0.1621, + "step": 12365 + }, + { + "epoch": 3.290580095795636, + "grad_norm": 0.3063808083534241, + "learning_rate": 5.3316159220959e-08, + "loss": 0.201, + "step": 12366 + }, + { + "epoch": 3.29084619478446, + "grad_norm": 0.328764945268631, + "learning_rate": 5.330122469522822e-08, + "loss": 0.1892, + "step": 12367 + }, + { + "epoch": 3.2911122937732835, + "grad_norm": 0.3457759916782379, + "learning_rate": 5.3286291501441395e-08, + "loss": 0.1708, + "step": 12368 + }, + { + "epoch": 3.2913783927621076, + "grad_norm": 0.31921613216400146, + "learning_rate": 5.3271359640024514e-08, + "loss": 0.1896, + "step": 12369 + }, + { + "epoch": 3.2916444917509313, + "grad_norm": 0.29698261618614197, + "learning_rate": 5.325642911140338e-08, + "loss": 0.1724, + "step": 12370 + }, + { + "epoch": 3.291910590739755, + "grad_norm": 0.27451714873313904, + "learning_rate": 5.3241499916003906e-08, + "loss": 0.183, + "step": 12371 + }, + { + "epoch": 3.292176689728579, + "grad_norm": 0.2642975151538849, + "learning_rate": 5.3226572054251825e-08, + "loss": 0.1597, + "step": 12372 + }, + { + "epoch": 3.292442788717403, + "grad_norm": 0.3509422838687897, + "learning_rate": 5.3211645526573e-08, + "loss": 0.1672, + "step": 12373 + }, + { + "epoch": 3.292708887706227, + "grad_norm": 0.2708192467689514, + "learning_rate": 5.3196720333393094e-08, + "loss": 0.1731, + "step": 12374 + }, + { + "epoch": 3.2929749866950506, + "grad_norm": 0.29260045289993286, + "learning_rate": 5.318179647513783e-08, + "loss": 0.1704, + "step": 12375 + }, + { + "epoch": 3.2932410856838743, + "grad_norm": 0.2752620577812195, + "learning_rate": 5.316687395223287e-08, + "loss": 0.1759, + "step": 12376 + }, + { + "epoch": 3.2935071846726984, + "grad_norm": 0.33334556221961975, + "learning_rate": 5.315195276510387e-08, + "loss": 0.1782, + "step": 12377 + }, + { + "epoch": 3.293773283661522, + "grad_norm": 0.259194552898407, + "learning_rate": 5.313703291417634e-08, + "loss": 0.1652, + "step": 12378 + }, + { + "epoch": 3.294039382650346, + "grad_norm": 0.3598499596118927, + "learning_rate": 5.3122114399875864e-08, + "loss": 0.1723, + "step": 12379 + }, + { + "epoch": 3.29430548163917, + "grad_norm": 0.2831892669200897, + "learning_rate": 5.310719722262797e-08, + "loss": 0.1962, + "step": 12380 + }, + { + "epoch": 3.2945715806279936, + "grad_norm": 0.4081547260284424, + "learning_rate": 5.309228138285808e-08, + "loss": 0.1907, + "step": 12381 + }, + { + "epoch": 3.2948376796168173, + "grad_norm": 0.2917519807815552, + "learning_rate": 5.3077366880991647e-08, + "loss": 0.17, + "step": 12382 + }, + { + "epoch": 3.2951037786056414, + "grad_norm": 0.34650686383247375, + "learning_rate": 5.3062453717454057e-08, + "loss": 0.1748, + "step": 12383 + }, + { + "epoch": 3.295369877594465, + "grad_norm": 0.37451285123825073, + "learning_rate": 5.30475418926707e-08, + "loss": 0.1873, + "step": 12384 + }, + { + "epoch": 3.2956359765832888, + "grad_norm": 0.44676896929740906, + "learning_rate": 5.3032631407066817e-08, + "loss": 0.1895, + "step": 12385 + }, + { + "epoch": 3.295902075572113, + "grad_norm": 0.2760350704193115, + "learning_rate": 5.3017722261067786e-08, + "loss": 0.1682, + "step": 12386 + }, + { + "epoch": 3.2961681745609366, + "grad_norm": 0.34271737933158875, + "learning_rate": 5.300281445509874e-08, + "loss": 0.18, + "step": 12387 + }, + { + "epoch": 3.2964342735497603, + "grad_norm": 0.27762937545776367, + "learning_rate": 5.2987907989584924e-08, + "loss": 0.1733, + "step": 12388 + }, + { + "epoch": 3.2967003725385844, + "grad_norm": 0.2758611738681793, + "learning_rate": 5.297300286495152e-08, + "loss": 0.1782, + "step": 12389 + }, + { + "epoch": 3.296966471527408, + "grad_norm": 0.48225098848342896, + "learning_rate": 5.295809908162367e-08, + "loss": 0.1991, + "step": 12390 + }, + { + "epoch": 3.297232570516232, + "grad_norm": 0.2526378929615021, + "learning_rate": 5.2943196640026376e-08, + "loss": 0.1553, + "step": 12391 + }, + { + "epoch": 3.297498669505056, + "grad_norm": 0.28423064947128296, + "learning_rate": 5.29282955405848e-08, + "loss": 0.17, + "step": 12392 + }, + { + "epoch": 3.2977647684938796, + "grad_norm": 0.4880373477935791, + "learning_rate": 5.291339578372382e-08, + "loss": 0.1769, + "step": 12393 + }, + { + "epoch": 3.2980308674827037, + "grad_norm": 0.29893526434898376, + "learning_rate": 5.289849736986854e-08, + "loss": 0.1865, + "step": 12394 + }, + { + "epoch": 3.2982969664715274, + "grad_norm": 0.4825393855571747, + "learning_rate": 5.2883600299443786e-08, + "loss": 0.1763, + "step": 12395 + }, + { + "epoch": 3.2985630654603515, + "grad_norm": 0.4237307608127594, + "learning_rate": 5.28687045728745e-08, + "loss": 0.1992, + "step": 12396 + }, + { + "epoch": 3.298829164449175, + "grad_norm": 0.3836026191711426, + "learning_rate": 5.285381019058553e-08, + "loss": 0.1815, + "step": 12397 + }, + { + "epoch": 3.299095263437999, + "grad_norm": 0.2698113024234772, + "learning_rate": 5.28389171530017e-08, + "loss": 0.1657, + "step": 12398 + }, + { + "epoch": 3.299361362426823, + "grad_norm": 0.28218477964401245, + "learning_rate": 5.282402546054783e-08, + "loss": 0.1537, + "step": 12399 + }, + { + "epoch": 3.2996274614156467, + "grad_norm": 0.28012436628341675, + "learning_rate": 5.2809135113648574e-08, + "loss": 0.1757, + "step": 12400 + }, + { + "epoch": 3.2998935604044703, + "grad_norm": 0.26459991931915283, + "learning_rate": 5.2794246112728725e-08, + "loss": 0.1789, + "step": 12401 + }, + { + "epoch": 3.3001596593932945, + "grad_norm": 0.43441253900527954, + "learning_rate": 5.277935845821286e-08, + "loss": 0.1869, + "step": 12402 + }, + { + "epoch": 3.300425758382118, + "grad_norm": 0.2758955955505371, + "learning_rate": 5.276447215052565e-08, + "loss": 0.1787, + "step": 12403 + }, + { + "epoch": 3.300691857370942, + "grad_norm": 0.2913987934589386, + "learning_rate": 5.274958719009168e-08, + "loss": 0.1995, + "step": 12404 + }, + { + "epoch": 3.300957956359766, + "grad_norm": 0.3755806088447571, + "learning_rate": 5.273470357733555e-08, + "loss": 0.1704, + "step": 12405 + }, + { + "epoch": 3.3012240553485896, + "grad_norm": 0.27466076612472534, + "learning_rate": 5.271982131268168e-08, + "loss": 0.1649, + "step": 12406 + }, + { + "epoch": 3.3014901543374133, + "grad_norm": 0.2818080186843872, + "learning_rate": 5.270494039655462e-08, + "loss": 0.1666, + "step": 12407 + }, + { + "epoch": 3.3017562533262375, + "grad_norm": 0.36715754866600037, + "learning_rate": 5.2690060829378726e-08, + "loss": 0.1945, + "step": 12408 + }, + { + "epoch": 3.302022352315061, + "grad_norm": 0.2997303009033203, + "learning_rate": 5.267518261157847e-08, + "loss": 0.1717, + "step": 12409 + }, + { + "epoch": 3.302288451303885, + "grad_norm": 0.3488905429840088, + "learning_rate": 5.266030574357815e-08, + "loss": 0.1758, + "step": 12410 + }, + { + "epoch": 3.302554550292709, + "grad_norm": 0.26902344822883606, + "learning_rate": 5.26454302258021e-08, + "loss": 0.1988, + "step": 12411 + }, + { + "epoch": 3.3028206492815326, + "grad_norm": 0.2962630093097687, + "learning_rate": 5.2630556058674615e-08, + "loss": 0.1781, + "step": 12412 + }, + { + "epoch": 3.3030867482703568, + "grad_norm": 0.27038678526878357, + "learning_rate": 5.2615683242619956e-08, + "loss": 0.1632, + "step": 12413 + }, + { + "epoch": 3.3033528472591804, + "grad_norm": 0.2931004464626312, + "learning_rate": 5.260081177806227e-08, + "loss": 0.1829, + "step": 12414 + }, + { + "epoch": 3.303618946248004, + "grad_norm": 0.3619477152824402, + "learning_rate": 5.258594166542576e-08, + "loss": 0.1835, + "step": 12415 + }, + { + "epoch": 3.3038850452368282, + "grad_norm": 0.2819141745567322, + "learning_rate": 5.257107290513456e-08, + "loss": 0.1837, + "step": 12416 + }, + { + "epoch": 3.304151144225652, + "grad_norm": 0.3458420932292938, + "learning_rate": 5.2556205497612726e-08, + "loss": 0.1851, + "step": 12417 + }, + { + "epoch": 3.3044172432144756, + "grad_norm": 0.40907853841781616, + "learning_rate": 5.2541339443284316e-08, + "loss": 0.1855, + "step": 12418 + }, + { + "epoch": 3.3046833422032997, + "grad_norm": 0.6771871447563171, + "learning_rate": 5.252647474257333e-08, + "loss": 0.1844, + "step": 12419 + }, + { + "epoch": 3.3049494411921234, + "grad_norm": 0.28871187567710876, + "learning_rate": 5.251161139590381e-08, + "loss": 0.1763, + "step": 12420 + }, + { + "epoch": 3.3052155401809475, + "grad_norm": 0.3434561789035797, + "learning_rate": 5.2496749403699595e-08, + "loss": 0.2026, + "step": 12421 + }, + { + "epoch": 3.3054816391697712, + "grad_norm": 0.2953728139400482, + "learning_rate": 5.2481888766384654e-08, + "loss": 0.1757, + "step": 12422 + }, + { + "epoch": 3.305747738158595, + "grad_norm": 0.2603994309902191, + "learning_rate": 5.246702948438276e-08, + "loss": 0.1702, + "step": 12423 + }, + { + "epoch": 3.306013837147419, + "grad_norm": 0.30886462330818176, + "learning_rate": 5.245217155811783e-08, + "loss": 0.1956, + "step": 12424 + }, + { + "epoch": 3.3062799361362427, + "grad_norm": 0.40646854043006897, + "learning_rate": 5.2437314988013504e-08, + "loss": 0.173, + "step": 12425 + }, + { + "epoch": 3.3065460351250664, + "grad_norm": 0.3347559869289398, + "learning_rate": 5.242245977449369e-08, + "loss": 0.2039, + "step": 12426 + }, + { + "epoch": 3.3068121341138905, + "grad_norm": 0.28787726163864136, + "learning_rate": 5.2407605917981954e-08, + "loss": 0.1757, + "step": 12427 + }, + { + "epoch": 3.307078233102714, + "grad_norm": 0.27483999729156494, + "learning_rate": 5.239275341890206e-08, + "loss": 0.1557, + "step": 12428 + }, + { + "epoch": 3.307344332091538, + "grad_norm": 0.26920396089553833, + "learning_rate": 5.237790227767753e-08, + "loss": 0.1571, + "step": 12429 + }, + { + "epoch": 3.307610431080362, + "grad_norm": 0.29754069447517395, + "learning_rate": 5.236305249473205e-08, + "loss": 0.1574, + "step": 12430 + }, + { + "epoch": 3.3078765300691857, + "grad_norm": 0.45640677213668823, + "learning_rate": 5.2348204070489075e-08, + "loss": 0.2051, + "step": 12431 + }, + { + "epoch": 3.3081426290580094, + "grad_norm": 0.4360087811946869, + "learning_rate": 5.233335700537215e-08, + "loss": 0.2016, + "step": 12432 + }, + { + "epoch": 3.3084087280468335, + "grad_norm": 0.34751784801483154, + "learning_rate": 5.2318511299804744e-08, + "loss": 0.1774, + "step": 12433 + }, + { + "epoch": 3.308674827035657, + "grad_norm": 0.2762938141822815, + "learning_rate": 5.230366695421029e-08, + "loss": 0.1728, + "step": 12434 + }, + { + "epoch": 3.308940926024481, + "grad_norm": 0.39402368664741516, + "learning_rate": 5.228882396901222e-08, + "loss": 0.2052, + "step": 12435 + }, + { + "epoch": 3.309207025013305, + "grad_norm": 0.2804535925388336, + "learning_rate": 5.227398234463381e-08, + "loss": 0.1802, + "step": 12436 + }, + { + "epoch": 3.3094731240021287, + "grad_norm": 0.2792995572090149, + "learning_rate": 5.2259142081498444e-08, + "loss": 0.1765, + "step": 12437 + }, + { + "epoch": 3.309739222990953, + "grad_norm": 0.29518693685531616, + "learning_rate": 5.224430318002932e-08, + "loss": 0.1746, + "step": 12438 + }, + { + "epoch": 3.3100053219797765, + "grad_norm": 0.2733859419822693, + "learning_rate": 5.222946564064971e-08, + "loss": 0.173, + "step": 12439 + }, + { + "epoch": 3.3102714209686, + "grad_norm": 0.27910441160202026, + "learning_rate": 5.221462946378281e-08, + "loss": 0.1817, + "step": 12440 + }, + { + "epoch": 3.3105375199574243, + "grad_norm": 0.42113980650901794, + "learning_rate": 5.219979464985183e-08, + "loss": 0.1861, + "step": 12441 + }, + { + "epoch": 3.310803618946248, + "grad_norm": 0.27442365884780884, + "learning_rate": 5.21849611992798e-08, + "loss": 0.1681, + "step": 12442 + }, + { + "epoch": 3.3110697179350717, + "grad_norm": 0.25993236899375916, + "learning_rate": 5.217012911248988e-08, + "loss": 0.182, + "step": 12443 + }, + { + "epoch": 3.311335816923896, + "grad_norm": 0.28104549646377563, + "learning_rate": 5.215529838990504e-08, + "loss": 0.1847, + "step": 12444 + }, + { + "epoch": 3.3116019159127195, + "grad_norm": 0.3003043234348297, + "learning_rate": 5.214046903194833e-08, + "loss": 0.1762, + "step": 12445 + }, + { + "epoch": 3.3118680149015436, + "grad_norm": 0.25351300835609436, + "learning_rate": 5.212564103904269e-08, + "loss": 0.1645, + "step": 12446 + }, + { + "epoch": 3.3121341138903673, + "grad_norm": 0.3208017945289612, + "learning_rate": 5.211081441161104e-08, + "loss": 0.1854, + "step": 12447 + }, + { + "epoch": 3.312400212879191, + "grad_norm": 0.3769542872905731, + "learning_rate": 5.209598915007627e-08, + "loss": 0.1885, + "step": 12448 + }, + { + "epoch": 3.312666311868015, + "grad_norm": 0.2927875518798828, + "learning_rate": 5.208116525486128e-08, + "loss": 0.1675, + "step": 12449 + }, + { + "epoch": 3.3129324108568388, + "grad_norm": 0.27724847197532654, + "learning_rate": 5.20663427263888e-08, + "loss": 0.178, + "step": 12450 + }, + { + "epoch": 3.3131985098456624, + "grad_norm": 0.2889168858528137, + "learning_rate": 5.205152156508164e-08, + "loss": 0.1932, + "step": 12451 + }, + { + "epoch": 3.3134646088344866, + "grad_norm": 0.2928692400455475, + "learning_rate": 5.2036701771362536e-08, + "loss": 0.186, + "step": 12452 + }, + { + "epoch": 3.3137307078233103, + "grad_norm": 0.2807301878929138, + "learning_rate": 5.202188334565414e-08, + "loss": 0.1716, + "step": 12453 + }, + { + "epoch": 3.313996806812134, + "grad_norm": 0.2834559977054596, + "learning_rate": 5.200706628837912e-08, + "loss": 0.1758, + "step": 12454 + }, + { + "epoch": 3.314262905800958, + "grad_norm": 0.2619899809360504, + "learning_rate": 5.1992250599960094e-08, + "loss": 0.17, + "step": 12455 + }, + { + "epoch": 3.3145290047897817, + "grad_norm": 0.3120448589324951, + "learning_rate": 5.197743628081969e-08, + "loss": 0.1881, + "step": 12456 + }, + { + "epoch": 3.3147951037786054, + "grad_norm": 0.29382604360580444, + "learning_rate": 5.196262333138034e-08, + "loss": 0.1661, + "step": 12457 + }, + { + "epoch": 3.3150612027674295, + "grad_norm": 0.28048640489578247, + "learning_rate": 5.194781175206463e-08, + "loss": 0.1757, + "step": 12458 + }, + { + "epoch": 3.3153273017562532, + "grad_norm": 0.27332374453544617, + "learning_rate": 5.193300154329494e-08, + "loss": 0.1738, + "step": 12459 + }, + { + "epoch": 3.3155934007450774, + "grad_norm": 0.26351794600486755, + "learning_rate": 5.191819270549376e-08, + "loss": 0.1718, + "step": 12460 + }, + { + "epoch": 3.315859499733901, + "grad_norm": 0.28325989842414856, + "learning_rate": 5.190338523908337e-08, + "loss": 0.1852, + "step": 12461 + }, + { + "epoch": 3.3161255987227247, + "grad_norm": 0.35740774869918823, + "learning_rate": 5.188857914448621e-08, + "loss": 0.1855, + "step": 12462 + }, + { + "epoch": 3.316391697711549, + "grad_norm": 0.2722872793674469, + "learning_rate": 5.1873774422124507e-08, + "loss": 0.1729, + "step": 12463 + }, + { + "epoch": 3.3166577967003725, + "grad_norm": 0.39617183804512024, + "learning_rate": 5.185897107242061e-08, + "loss": 0.1871, + "step": 12464 + }, + { + "epoch": 3.316923895689196, + "grad_norm": 0.27876004576683044, + "learning_rate": 5.1844169095796655e-08, + "loss": 0.1671, + "step": 12465 + }, + { + "epoch": 3.3171899946780203, + "grad_norm": 0.27359214425086975, + "learning_rate": 5.182936849267487e-08, + "loss": 0.1624, + "step": 12466 + }, + { + "epoch": 3.317456093666844, + "grad_norm": 0.3540359437465668, + "learning_rate": 5.1814569263477357e-08, + "loss": 0.1721, + "step": 12467 + }, + { + "epoch": 3.317722192655668, + "grad_norm": 0.4136073887348175, + "learning_rate": 5.179977140862625e-08, + "loss": 0.1858, + "step": 12468 + }, + { + "epoch": 3.317988291644492, + "grad_norm": 0.3552986681461334, + "learning_rate": 5.17849749285436e-08, + "loss": 0.1798, + "step": 12469 + }, + { + "epoch": 3.3182543906333155, + "grad_norm": 0.3201591670513153, + "learning_rate": 5.177017982365145e-08, + "loss": 0.1569, + "step": 12470 + }, + { + "epoch": 3.3185204896221396, + "grad_norm": 0.270000696182251, + "learning_rate": 5.1755386094371825e-08, + "loss": 0.1719, + "step": 12471 + }, + { + "epoch": 3.3187865886109633, + "grad_norm": 0.27454936504364014, + "learning_rate": 5.174059374112657e-08, + "loss": 0.192, + "step": 12472 + }, + { + "epoch": 3.319052687599787, + "grad_norm": 0.4366722106933594, + "learning_rate": 5.172580276433769e-08, + "loss": 0.1928, + "step": 12473 + }, + { + "epoch": 3.319318786588611, + "grad_norm": 0.26356571912765503, + "learning_rate": 5.171101316442699e-08, + "loss": 0.1712, + "step": 12474 + }, + { + "epoch": 3.319584885577435, + "grad_norm": 0.29326581954956055, + "learning_rate": 5.1696224941816345e-08, + "loss": 0.1626, + "step": 12475 + }, + { + "epoch": 3.3198509845662585, + "grad_norm": 0.2741190493106842, + "learning_rate": 5.1681438096927455e-08, + "loss": 0.1567, + "step": 12476 + }, + { + "epoch": 3.3201170835550826, + "grad_norm": 0.37297332286834717, + "learning_rate": 5.166665263018222e-08, + "loss": 0.2107, + "step": 12477 + }, + { + "epoch": 3.3203831825439063, + "grad_norm": 0.3039363622665405, + "learning_rate": 5.165186854200221e-08, + "loss": 0.1766, + "step": 12478 + }, + { + "epoch": 3.32064928153273, + "grad_norm": 0.38334888219833374, + "learning_rate": 5.163708583280921e-08, + "loss": 0.1908, + "step": 12479 + }, + { + "epoch": 3.320915380521554, + "grad_norm": 0.3102414309978485, + "learning_rate": 5.162230450302476e-08, + "loss": 0.1646, + "step": 12480 + }, + { + "epoch": 3.321181479510378, + "grad_norm": 0.27398136258125305, + "learning_rate": 5.1607524553070535e-08, + "loss": 0.172, + "step": 12481 + }, + { + "epoch": 3.3214475784992015, + "grad_norm": 0.32100680470466614, + "learning_rate": 5.1592745983368014e-08, + "loss": 0.1827, + "step": 12482 + }, + { + "epoch": 3.3217136774880256, + "grad_norm": 0.27438268065452576, + "learning_rate": 5.157796879433873e-08, + "loss": 0.1689, + "step": 12483 + }, + { + "epoch": 3.3219797764768493, + "grad_norm": 0.3228605389595032, + "learning_rate": 5.156319298640419e-08, + "loss": 0.1759, + "step": 12484 + }, + { + "epoch": 3.3222458754656734, + "grad_norm": 0.2752191424369812, + "learning_rate": 5.154841855998585e-08, + "loss": 0.181, + "step": 12485 + }, + { + "epoch": 3.322511974454497, + "grad_norm": 0.3640809655189514, + "learning_rate": 5.153364551550503e-08, + "loss": 0.1939, + "step": 12486 + }, + { + "epoch": 3.3227780734433208, + "grad_norm": 0.43638625741004944, + "learning_rate": 5.151887385338313e-08, + "loss": 0.1921, + "step": 12487 + }, + { + "epoch": 3.323044172432145, + "grad_norm": 0.423450767993927, + "learning_rate": 5.15041035740415e-08, + "loss": 0.1825, + "step": 12488 + }, + { + "epoch": 3.3233102714209686, + "grad_norm": 0.29353687167167664, + "learning_rate": 5.148933467790135e-08, + "loss": 0.1626, + "step": 12489 + }, + { + "epoch": 3.3235763704097923, + "grad_norm": 0.3266110420227051, + "learning_rate": 5.147456716538395e-08, + "loss": 0.1972, + "step": 12490 + }, + { + "epoch": 3.3238424693986164, + "grad_norm": 0.27963730692863464, + "learning_rate": 5.14598010369105e-08, + "loss": 0.1774, + "step": 12491 + }, + { + "epoch": 3.32410856838744, + "grad_norm": 0.27801987528800964, + "learning_rate": 5.144503629290221e-08, + "loss": 0.1682, + "step": 12492 + }, + { + "epoch": 3.324374667376264, + "grad_norm": 0.3512701690196991, + "learning_rate": 5.143027293378012e-08, + "loss": 0.1888, + "step": 12493 + }, + { + "epoch": 3.324640766365088, + "grad_norm": 0.2870713174343109, + "learning_rate": 5.1415510959965367e-08, + "loss": 0.1946, + "step": 12494 + }, + { + "epoch": 3.3249068653539116, + "grad_norm": 0.278573215007782, + "learning_rate": 5.140075037187894e-08, + "loss": 0.2061, + "step": 12495 + }, + { + "epoch": 3.3251729643427357, + "grad_norm": 0.285287469625473, + "learning_rate": 5.138599116994191e-08, + "loss": 0.1805, + "step": 12496 + }, + { + "epoch": 3.3254390633315594, + "grad_norm": 0.26283013820648193, + "learning_rate": 5.137123335457516e-08, + "loss": 0.1539, + "step": 12497 + }, + { + "epoch": 3.325705162320383, + "grad_norm": 0.4244054853916168, + "learning_rate": 5.1356476926199644e-08, + "loss": 0.2004, + "step": 12498 + }, + { + "epoch": 3.325971261309207, + "grad_norm": 0.39995425939559937, + "learning_rate": 5.134172188523627e-08, + "loss": 0.1905, + "step": 12499 + }, + { + "epoch": 3.326237360298031, + "grad_norm": 0.36486631631851196, + "learning_rate": 5.132696823210589e-08, + "loss": 0.1806, + "step": 12500 + }, + { + "epoch": 3.3265034592868545, + "grad_norm": 0.2684774100780487, + "learning_rate": 5.1312215967229254e-08, + "loss": 0.1775, + "step": 12501 + }, + { + "epoch": 3.3267695582756787, + "grad_norm": 0.26602551341056824, + "learning_rate": 5.1297465091027195e-08, + "loss": 0.1656, + "step": 12502 + }, + { + "epoch": 3.3270356572645023, + "grad_norm": 0.2960500419139862, + "learning_rate": 5.1282715603920365e-08, + "loss": 0.1669, + "step": 12503 + }, + { + "epoch": 3.327301756253326, + "grad_norm": 0.28964725136756897, + "learning_rate": 5.126796750632948e-08, + "loss": 0.1853, + "step": 12504 + }, + { + "epoch": 3.32756785524215, + "grad_norm": 0.2868959605693817, + "learning_rate": 5.125322079867519e-08, + "loss": 0.1683, + "step": 12505 + }, + { + "epoch": 3.327833954230974, + "grad_norm": 0.34214141964912415, + "learning_rate": 5.1238475481378096e-08, + "loss": 0.171, + "step": 12506 + }, + { + "epoch": 3.3281000532197975, + "grad_norm": 0.33680230379104614, + "learning_rate": 5.122373155485882e-08, + "loss": 0.1676, + "step": 12507 + }, + { + "epoch": 3.3283661522086216, + "grad_norm": 0.3737117052078247, + "learning_rate": 5.1208989019537786e-08, + "loss": 0.1749, + "step": 12508 + }, + { + "epoch": 3.3286322511974453, + "grad_norm": 0.45443105697631836, + "learning_rate": 5.1194247875835586e-08, + "loss": 0.1902, + "step": 12509 + }, + { + "epoch": 3.3288983501862695, + "grad_norm": 0.26115915179252625, + "learning_rate": 5.117950812417256e-08, + "loss": 0.1642, + "step": 12510 + }, + { + "epoch": 3.329164449175093, + "grad_norm": 0.290690153837204, + "learning_rate": 5.1164769764969217e-08, + "loss": 0.2024, + "step": 12511 + }, + { + "epoch": 3.329430548163917, + "grad_norm": 0.4076809287071228, + "learning_rate": 5.115003279864585e-08, + "loss": 0.1728, + "step": 12512 + }, + { + "epoch": 3.329696647152741, + "grad_norm": 0.3395390808582306, + "learning_rate": 5.1135297225622796e-08, + "loss": 0.1804, + "step": 12513 + }, + { + "epoch": 3.3299627461415646, + "grad_norm": 0.29974061250686646, + "learning_rate": 5.1120563046320375e-08, + "loss": 0.1908, + "step": 12514 + }, + { + "epoch": 3.3302288451303887, + "grad_norm": 0.26839494705200195, + "learning_rate": 5.1105830261158866e-08, + "loss": 0.1729, + "step": 12515 + }, + { + "epoch": 3.3304949441192124, + "grad_norm": 0.32600706815719604, + "learning_rate": 5.109109887055838e-08, + "loss": 0.1632, + "step": 12516 + }, + { + "epoch": 3.330761043108036, + "grad_norm": 0.2598027288913727, + "learning_rate": 5.10763688749392e-08, + "loss": 0.1669, + "step": 12517 + }, + { + "epoch": 3.3310271420968602, + "grad_norm": 0.32500597834587097, + "learning_rate": 5.1061640274721354e-08, + "loss": 0.1694, + "step": 12518 + }, + { + "epoch": 3.331293241085684, + "grad_norm": 0.3399195671081543, + "learning_rate": 5.104691307032496e-08, + "loss": 0.185, + "step": 12519 + }, + { + "epoch": 3.3315593400745076, + "grad_norm": 0.37131503224372864, + "learning_rate": 5.103218726217009e-08, + "loss": 0.2073, + "step": 12520 + }, + { + "epoch": 3.3318254390633317, + "grad_norm": 0.3010699450969696, + "learning_rate": 5.10174628506768e-08, + "loss": 0.1644, + "step": 12521 + }, + { + "epoch": 3.3320915380521554, + "grad_norm": 0.29981449246406555, + "learning_rate": 5.100273983626495e-08, + "loss": 0.2032, + "step": 12522 + }, + { + "epoch": 3.332357637040979, + "grad_norm": 0.2991047501564026, + "learning_rate": 5.098801821935458e-08, + "loss": 0.1697, + "step": 12523 + }, + { + "epoch": 3.332623736029803, + "grad_norm": 0.45836690068244934, + "learning_rate": 5.097329800036547e-08, + "loss": 0.1965, + "step": 12524 + }, + { + "epoch": 3.332889835018627, + "grad_norm": 0.3527619242668152, + "learning_rate": 5.0958579179717556e-08, + "loss": 0.1774, + "step": 12525 + }, + { + "epoch": 3.3331559340074506, + "grad_norm": 0.2626502811908722, + "learning_rate": 5.094386175783063e-08, + "loss": 0.1805, + "step": 12526 + }, + { + "epoch": 3.3334220329962747, + "grad_norm": 0.2975959777832031, + "learning_rate": 5.092914573512439e-08, + "loss": 0.1809, + "step": 12527 + }, + { + "epoch": 3.3336881319850984, + "grad_norm": 0.2783142924308777, + "learning_rate": 5.091443111201871e-08, + "loss": 0.1862, + "step": 12528 + }, + { + "epoch": 3.333954230973922, + "grad_norm": 0.2855806350708008, + "learning_rate": 5.089971788893316e-08, + "loss": 0.1873, + "step": 12529 + }, + { + "epoch": 3.334220329962746, + "grad_norm": 0.31475573778152466, + "learning_rate": 5.088500606628747e-08, + "loss": 0.1798, + "step": 12530 + }, + { + "epoch": 3.33448642895157, + "grad_norm": 0.3528982102870941, + "learning_rate": 5.0870295644501185e-08, + "loss": 0.1848, + "step": 12531 + }, + { + "epoch": 3.334752527940394, + "grad_norm": 0.3060515522956848, + "learning_rate": 5.0855586623993915e-08, + "loss": 0.1822, + "step": 12532 + }, + { + "epoch": 3.3350186269292177, + "grad_norm": 0.3595610558986664, + "learning_rate": 5.084087900518517e-08, + "loss": 0.1879, + "step": 12533 + }, + { + "epoch": 3.3352847259180414, + "grad_norm": 0.4411676526069641, + "learning_rate": 5.082617278849444e-08, + "loss": 0.1862, + "step": 12534 + }, + { + "epoch": 3.3355508249068655, + "grad_norm": 0.3694998025894165, + "learning_rate": 5.0811467974341196e-08, + "loss": 0.1803, + "step": 12535 + }, + { + "epoch": 3.335816923895689, + "grad_norm": 0.27690187096595764, + "learning_rate": 5.079676456314488e-08, + "loss": 0.1708, + "step": 12536 + }, + { + "epoch": 3.336083022884513, + "grad_norm": 0.31432268023490906, + "learning_rate": 5.078206255532478e-08, + "loss": 0.1722, + "step": 12537 + }, + { + "epoch": 3.336349121873337, + "grad_norm": 0.33912137150764465, + "learning_rate": 5.07673619513003e-08, + "loss": 0.1767, + "step": 12538 + }, + { + "epoch": 3.3366152208621607, + "grad_norm": 0.3047918677330017, + "learning_rate": 5.0752662751490684e-08, + "loss": 0.1874, + "step": 12539 + }, + { + "epoch": 3.336881319850985, + "grad_norm": 0.27062228322029114, + "learning_rate": 5.0737964956315215e-08, + "loss": 0.1691, + "step": 12540 + }, + { + "epoch": 3.3371474188398085, + "grad_norm": 0.3046836853027344, + "learning_rate": 5.0723268566193036e-08, + "loss": 0.1632, + "step": 12541 + }, + { + "epoch": 3.337413517828632, + "grad_norm": 0.29058265686035156, + "learning_rate": 5.0708573581543414e-08, + "loss": 0.1815, + "step": 12542 + }, + { + "epoch": 3.3376796168174563, + "grad_norm": 0.27622726559638977, + "learning_rate": 5.0693880002785447e-08, + "loss": 0.1774, + "step": 12543 + }, + { + "epoch": 3.33794571580628, + "grad_norm": 0.28873997926712036, + "learning_rate": 5.067918783033821e-08, + "loss": 0.1881, + "step": 12544 + }, + { + "epoch": 3.3382118147951036, + "grad_norm": 0.42929673194885254, + "learning_rate": 5.066449706462079e-08, + "loss": 0.2008, + "step": 12545 + }, + { + "epoch": 3.3384779137839278, + "grad_norm": 0.2610701024532318, + "learning_rate": 5.064980770605213e-08, + "loss": 0.1603, + "step": 12546 + }, + { + "epoch": 3.3387440127727515, + "grad_norm": 0.3052784502506256, + "learning_rate": 5.063511975505127e-08, + "loss": 0.1875, + "step": 12547 + }, + { + "epoch": 3.339010111761575, + "grad_norm": 0.27332305908203125, + "learning_rate": 5.0620433212037084e-08, + "loss": 0.1624, + "step": 12548 + }, + { + "epoch": 3.3392762107503993, + "grad_norm": 0.35336557030677795, + "learning_rate": 5.0605748077428475e-08, + "loss": 0.1758, + "step": 12549 + }, + { + "epoch": 3.339542309739223, + "grad_norm": 0.773794949054718, + "learning_rate": 5.0591064351644305e-08, + "loss": 0.1767, + "step": 12550 + }, + { + "epoch": 3.3398084087280466, + "grad_norm": 0.41832467913627625, + "learning_rate": 5.0576382035103426e-08, + "loss": 0.1863, + "step": 12551 + }, + { + "epoch": 3.3400745077168708, + "grad_norm": 0.31309598684310913, + "learning_rate": 5.056170112822452e-08, + "loss": 0.1643, + "step": 12552 + }, + { + "epoch": 3.3403406067056944, + "grad_norm": 0.35287246108055115, + "learning_rate": 5.0547021631426414e-08, + "loss": 0.1836, + "step": 12553 + }, + { + "epoch": 3.340606705694518, + "grad_norm": 0.2886407971382141, + "learning_rate": 5.05323435451277e-08, + "loss": 0.1706, + "step": 12554 + }, + { + "epoch": 3.3408728046833422, + "grad_norm": 0.2844550907611847, + "learning_rate": 5.051766686974706e-08, + "loss": 0.1513, + "step": 12555 + }, + { + "epoch": 3.341138903672166, + "grad_norm": 0.34952202439308167, + "learning_rate": 5.050299160570313e-08, + "loss": 0.1947, + "step": 12556 + }, + { + "epoch": 3.34140500266099, + "grad_norm": 0.3117370009422302, + "learning_rate": 5.048831775341449e-08, + "loss": 0.1852, + "step": 12557 + }, + { + "epoch": 3.3416711016498137, + "grad_norm": 0.32329899072647095, + "learning_rate": 5.047364531329961e-08, + "loss": 0.173, + "step": 12558 + }, + { + "epoch": 3.3419372006386374, + "grad_norm": 0.28270238637924194, + "learning_rate": 5.045897428577703e-08, + "loss": 0.167, + "step": 12559 + }, + { + "epoch": 3.3422032996274615, + "grad_norm": 0.38682031631469727, + "learning_rate": 5.044430467126515e-08, + "loss": 0.1881, + "step": 12560 + }, + { + "epoch": 3.3424693986162852, + "grad_norm": 0.31628304719924927, + "learning_rate": 5.042963647018239e-08, + "loss": 0.1767, + "step": 12561 + }, + { + "epoch": 3.342735497605109, + "grad_norm": 0.4937950372695923, + "learning_rate": 5.0414969682947186e-08, + "loss": 0.2055, + "step": 12562 + }, + { + "epoch": 3.343001596593933, + "grad_norm": 0.26667284965515137, + "learning_rate": 5.040030430997777e-08, + "loss": 0.1723, + "step": 12563 + }, + { + "epoch": 3.3432676955827567, + "grad_norm": 0.2807544767856598, + "learning_rate": 5.038564035169246e-08, + "loss": 0.1775, + "step": 12564 + }, + { + "epoch": 3.343533794571581, + "grad_norm": 0.26525232195854187, + "learning_rate": 5.0370977808509496e-08, + "loss": 0.1735, + "step": 12565 + }, + { + "epoch": 3.3437998935604045, + "grad_norm": 0.3449978828430176, + "learning_rate": 5.035631668084715e-08, + "loss": 0.1765, + "step": 12566 + }, + { + "epoch": 3.344065992549228, + "grad_norm": 0.25881829857826233, + "learning_rate": 5.0341656969123493e-08, + "loss": 0.1747, + "step": 12567 + }, + { + "epoch": 3.3443320915380523, + "grad_norm": 0.26948097348213196, + "learning_rate": 5.032699867375672e-08, + "loss": 0.1582, + "step": 12568 + }, + { + "epoch": 3.344598190526876, + "grad_norm": 0.27848634123802185, + "learning_rate": 5.031234179516486e-08, + "loss": 0.1728, + "step": 12569 + }, + { + "epoch": 3.3448642895156997, + "grad_norm": 0.2645917236804962, + "learning_rate": 5.0297686333765965e-08, + "loss": 0.1662, + "step": 12570 + }, + { + "epoch": 3.345130388504524, + "grad_norm": 0.25171715021133423, + "learning_rate": 5.0283032289978075e-08, + "loss": 0.1662, + "step": 12571 + }, + { + "epoch": 3.3453964874933475, + "grad_norm": 0.3998943269252777, + "learning_rate": 5.026837966421915e-08, + "loss": 0.1637, + "step": 12572 + }, + { + "epoch": 3.345662586482171, + "grad_norm": 0.26549890637397766, + "learning_rate": 5.0253728456907075e-08, + "loss": 0.1682, + "step": 12573 + }, + { + "epoch": 3.3459286854709953, + "grad_norm": 0.2911273241043091, + "learning_rate": 5.02390786684598e-08, + "loss": 0.1719, + "step": 12574 + }, + { + "epoch": 3.346194784459819, + "grad_norm": 0.2748877704143524, + "learning_rate": 5.022443029929506e-08, + "loss": 0.1778, + "step": 12575 + }, + { + "epoch": 3.3464608834486427, + "grad_norm": 0.3729454278945923, + "learning_rate": 5.020978334983077e-08, + "loss": 0.184, + "step": 12576 + }, + { + "epoch": 3.346726982437467, + "grad_norm": 0.3287717401981354, + "learning_rate": 5.0195137820484587e-08, + "loss": 0.1796, + "step": 12577 + }, + { + "epoch": 3.3469930814262905, + "grad_norm": 0.31033968925476074, + "learning_rate": 5.01804937116743e-08, + "loss": 0.1814, + "step": 12578 + }, + { + "epoch": 3.3472591804151146, + "grad_norm": 0.3126065731048584, + "learning_rate": 5.016585102381755e-08, + "loss": 0.1806, + "step": 12579 + }, + { + "epoch": 3.3475252794039383, + "grad_norm": 0.31797945499420166, + "learning_rate": 5.0151209757332e-08, + "loss": 0.1944, + "step": 12580 + }, + { + "epoch": 3.347791378392762, + "grad_norm": 0.2662513554096222, + "learning_rate": 5.01365699126353e-08, + "loss": 0.1595, + "step": 12581 + }, + { + "epoch": 3.348057477381586, + "grad_norm": 0.3900655508041382, + "learning_rate": 5.01219314901449e-08, + "loss": 0.1774, + "step": 12582 + }, + { + "epoch": 3.34832357637041, + "grad_norm": 0.39684683084487915, + "learning_rate": 5.010729449027842e-08, + "loss": 0.1702, + "step": 12583 + }, + { + "epoch": 3.3485896753592335, + "grad_norm": 0.280505895614624, + "learning_rate": 5.0092658913453235e-08, + "loss": 0.1711, + "step": 12584 + }, + { + "epoch": 3.3488557743480576, + "grad_norm": 0.3299263119697571, + "learning_rate": 5.007802476008684e-08, + "loss": 0.1907, + "step": 12585 + }, + { + "epoch": 3.3491218733368813, + "grad_norm": 0.2820180356502533, + "learning_rate": 5.006339203059663e-08, + "loss": 0.1749, + "step": 12586 + }, + { + "epoch": 3.3493879723257054, + "grad_norm": 0.2638777792453766, + "learning_rate": 5.0048760725399985e-08, + "loss": 0.173, + "step": 12587 + }, + { + "epoch": 3.349654071314529, + "grad_norm": 0.29333433508872986, + "learning_rate": 5.0034130844914165e-08, + "loss": 0.1753, + "step": 12588 + }, + { + "epoch": 3.3499201703033528, + "grad_norm": 0.3259660005569458, + "learning_rate": 5.00195023895565e-08, + "loss": 0.1957, + "step": 12589 + }, + { + "epoch": 3.350186269292177, + "grad_norm": 0.3066575527191162, + "learning_rate": 5.000487535974416e-08, + "loss": 0.1725, + "step": 12590 + }, + { + "epoch": 3.3504523682810006, + "grad_norm": 0.302761048078537, + "learning_rate": 4.9990249755894356e-08, + "loss": 0.1857, + "step": 12591 + }, + { + "epoch": 3.3507184672698243, + "grad_norm": 0.44697630405426025, + "learning_rate": 4.997562557842426e-08, + "loss": 0.2013, + "step": 12592 + }, + { + "epoch": 3.3509845662586484, + "grad_norm": 0.2854468822479248, + "learning_rate": 4.996100282775102e-08, + "loss": 0.1694, + "step": 12593 + }, + { + "epoch": 3.351250665247472, + "grad_norm": 0.2660176157951355, + "learning_rate": 4.994638150429162e-08, + "loss": 0.1643, + "step": 12594 + }, + { + "epoch": 3.3515167642362957, + "grad_norm": 0.29111388325691223, + "learning_rate": 4.993176160846317e-08, + "loss": 0.1742, + "step": 12595 + }, + { + "epoch": 3.35178286322512, + "grad_norm": 0.34272533655166626, + "learning_rate": 4.9917143140682596e-08, + "loss": 0.1583, + "step": 12596 + }, + { + "epoch": 3.3520489622139436, + "grad_norm": 0.29189109802246094, + "learning_rate": 4.990252610136686e-08, + "loss": 0.1774, + "step": 12597 + }, + { + "epoch": 3.3523150612027672, + "grad_norm": 0.33002427220344543, + "learning_rate": 4.9887910490932925e-08, + "loss": 0.1565, + "step": 12598 + }, + { + "epoch": 3.3525811601915914, + "grad_norm": 0.43065720796585083, + "learning_rate": 4.9873296309797577e-08, + "loss": 0.202, + "step": 12599 + }, + { + "epoch": 3.352847259180415, + "grad_norm": 0.35984793305397034, + "learning_rate": 4.9858683558377676e-08, + "loss": 0.1692, + "step": 12600 + }, + { + "epoch": 3.3531133581692387, + "grad_norm": 0.26690173149108887, + "learning_rate": 4.984407223709002e-08, + "loss": 0.1671, + "step": 12601 + }, + { + "epoch": 3.353379457158063, + "grad_norm": 0.33484190702438354, + "learning_rate": 4.982946234635138e-08, + "loss": 0.1836, + "step": 12602 + }, + { + "epoch": 3.3536455561468865, + "grad_norm": 0.3443395793437958, + "learning_rate": 4.981485388657839e-08, + "loss": 0.1812, + "step": 12603 + }, + { + "epoch": 3.3539116551357107, + "grad_norm": 0.26331910490989685, + "learning_rate": 4.980024685818778e-08, + "loss": 0.1691, + "step": 12604 + }, + { + "epoch": 3.3541777541245343, + "grad_norm": 0.3245111405849457, + "learning_rate": 4.9785641261596115e-08, + "loss": 0.1888, + "step": 12605 + }, + { + "epoch": 3.354443853113358, + "grad_norm": 0.31827205419540405, + "learning_rate": 4.977103709721999e-08, + "loss": 0.1726, + "step": 12606 + }, + { + "epoch": 3.354709952102182, + "grad_norm": 0.3247320353984833, + "learning_rate": 4.9756434365475955e-08, + "loss": 0.1871, + "step": 12607 + }, + { + "epoch": 3.354976051091006, + "grad_norm": 0.2659866213798523, + "learning_rate": 4.974183306678056e-08, + "loss": 0.1717, + "step": 12608 + }, + { + "epoch": 3.3552421500798295, + "grad_norm": 0.2676429748535156, + "learning_rate": 4.972723320155017e-08, + "loss": 0.1559, + "step": 12609 + }, + { + "epoch": 3.3555082490686536, + "grad_norm": 0.2915846109390259, + "learning_rate": 4.971263477020129e-08, + "loss": 0.186, + "step": 12610 + }, + { + "epoch": 3.3557743480574773, + "grad_norm": 0.38581299781799316, + "learning_rate": 4.9698037773150215e-08, + "loss": 0.2048, + "step": 12611 + }, + { + "epoch": 3.3560404470463014, + "grad_norm": 0.2922471761703491, + "learning_rate": 4.9683442210813356e-08, + "loss": 0.1772, + "step": 12612 + }, + { + "epoch": 3.356306546035125, + "grad_norm": 0.2692398130893707, + "learning_rate": 4.966884808360694e-08, + "loss": 0.1709, + "step": 12613 + }, + { + "epoch": 3.356572645023949, + "grad_norm": 0.2795031666755676, + "learning_rate": 4.965425539194725e-08, + "loss": 0.1758, + "step": 12614 + }, + { + "epoch": 3.356838744012773, + "grad_norm": 0.34345969557762146, + "learning_rate": 4.9639664136250516e-08, + "loss": 0.1795, + "step": 12615 + }, + { + "epoch": 3.3571048430015966, + "grad_norm": 0.4068634510040283, + "learning_rate": 4.962507431693288e-08, + "loss": 0.1931, + "step": 12616 + }, + { + "epoch": 3.3573709419904203, + "grad_norm": 0.40392231941223145, + "learning_rate": 4.961048593441054e-08, + "loss": 0.1795, + "step": 12617 + }, + { + "epoch": 3.3576370409792444, + "grad_norm": 0.2971474230289459, + "learning_rate": 4.959589898909951e-08, + "loss": 0.1764, + "step": 12618 + }, + { + "epoch": 3.357903139968068, + "grad_norm": 0.28254735469818115, + "learning_rate": 4.958131348141589e-08, + "loss": 0.1741, + "step": 12619 + }, + { + "epoch": 3.358169238956892, + "grad_norm": 0.25984060764312744, + "learning_rate": 4.9566729411775624e-08, + "loss": 0.178, + "step": 12620 + }, + { + "epoch": 3.358435337945716, + "grad_norm": 0.27435025572776794, + "learning_rate": 4.9552146780594726e-08, + "loss": 0.1719, + "step": 12621 + }, + { + "epoch": 3.3587014369345396, + "grad_norm": 0.31843048334121704, + "learning_rate": 4.953756558828912e-08, + "loss": 0.1876, + "step": 12622 + }, + { + "epoch": 3.3589675359233633, + "grad_norm": 0.3523067235946655, + "learning_rate": 4.9522985835274725e-08, + "loss": 0.1817, + "step": 12623 + }, + { + "epoch": 3.3592336349121874, + "grad_norm": 0.2763971984386444, + "learning_rate": 4.95084075219673e-08, + "loss": 0.1774, + "step": 12624 + }, + { + "epoch": 3.359499733901011, + "grad_norm": 0.3836674392223358, + "learning_rate": 4.949383064878274e-08, + "loss": 0.2026, + "step": 12625 + }, + { + "epoch": 3.3597658328898348, + "grad_norm": 0.43256112933158875, + "learning_rate": 4.947925521613673e-08, + "loss": 0.1989, + "step": 12626 + }, + { + "epoch": 3.360031931878659, + "grad_norm": 0.29855799674987793, + "learning_rate": 4.946468122444506e-08, + "loss": 0.1898, + "step": 12627 + }, + { + "epoch": 3.3602980308674826, + "grad_norm": 0.3324216902256012, + "learning_rate": 4.945010867412334e-08, + "loss": 0.1868, + "step": 12628 + }, + { + "epoch": 3.3605641298563067, + "grad_norm": 0.3472404181957245, + "learning_rate": 4.943553756558724e-08, + "loss": 0.1939, + "step": 12629 + }, + { + "epoch": 3.3608302288451304, + "grad_norm": 0.404715359210968, + "learning_rate": 4.9420967899252365e-08, + "loss": 0.19, + "step": 12630 + }, + { + "epoch": 3.361096327833954, + "grad_norm": 0.2640696167945862, + "learning_rate": 4.9406399675534296e-08, + "loss": 0.1745, + "step": 12631 + }, + { + "epoch": 3.361362426822778, + "grad_norm": 0.38618865609169006, + "learning_rate": 4.939183289484848e-08, + "loss": 0.1831, + "step": 12632 + }, + { + "epoch": 3.361628525811602, + "grad_norm": 0.2863845229148865, + "learning_rate": 4.9377267557610433e-08, + "loss": 0.1795, + "step": 12633 + }, + { + "epoch": 3.361894624800426, + "grad_norm": 0.2702878713607788, + "learning_rate": 4.9362703664235625e-08, + "loss": 0.173, + "step": 12634 + }, + { + "epoch": 3.3621607237892497, + "grad_norm": 0.27424871921539307, + "learning_rate": 4.934814121513938e-08, + "loss": 0.1843, + "step": 12635 + }, + { + "epoch": 3.3624268227780734, + "grad_norm": 0.5021799206733704, + "learning_rate": 4.933358021073707e-08, + "loss": 0.1806, + "step": 12636 + }, + { + "epoch": 3.3626929217668975, + "grad_norm": 0.3327428698539734, + "learning_rate": 4.931902065144401e-08, + "loss": 0.1891, + "step": 12637 + }, + { + "epoch": 3.362959020755721, + "grad_norm": 0.275770366191864, + "learning_rate": 4.930446253767551e-08, + "loss": 0.1774, + "step": 12638 + }, + { + "epoch": 3.363225119744545, + "grad_norm": 0.3384608030319214, + "learning_rate": 4.9289905869846714e-08, + "loss": 0.1784, + "step": 12639 + }, + { + "epoch": 3.363491218733369, + "grad_norm": 0.2730993628501892, + "learning_rate": 4.927535064837289e-08, + "loss": 0.19, + "step": 12640 + }, + { + "epoch": 3.3637573177221927, + "grad_norm": 0.27581432461738586, + "learning_rate": 4.926079687366911e-08, + "loss": 0.1681, + "step": 12641 + }, + { + "epoch": 3.3640234167110163, + "grad_norm": 0.2818411886692047, + "learning_rate": 4.924624454615051e-08, + "loss": 0.1857, + "step": 12642 + }, + { + "epoch": 3.3642895156998405, + "grad_norm": 0.2949777841567993, + "learning_rate": 4.923169366623214e-08, + "loss": 0.1832, + "step": 12643 + }, + { + "epoch": 3.364555614688664, + "grad_norm": 0.3298301100730896, + "learning_rate": 4.921714423432908e-08, + "loss": 0.1606, + "step": 12644 + }, + { + "epoch": 3.364821713677488, + "grad_norm": 0.3750016987323761, + "learning_rate": 4.920259625085622e-08, + "loss": 0.202, + "step": 12645 + }, + { + "epoch": 3.365087812666312, + "grad_norm": 0.27930158376693726, + "learning_rate": 4.918804971622858e-08, + "loss": 0.1694, + "step": 12646 + }, + { + "epoch": 3.3653539116551356, + "grad_norm": 0.273288756608963, + "learning_rate": 4.917350463086098e-08, + "loss": 0.164, + "step": 12647 + }, + { + "epoch": 3.3656200106439593, + "grad_norm": 0.3604280352592468, + "learning_rate": 4.915896099516835e-08, + "loss": 0.1878, + "step": 12648 + }, + { + "epoch": 3.3658861096327835, + "grad_norm": 0.29606306552886963, + "learning_rate": 4.914441880956544e-08, + "loss": 0.1828, + "step": 12649 + }, + { + "epoch": 3.366152208621607, + "grad_norm": 0.27688610553741455, + "learning_rate": 4.912987807446703e-08, + "loss": 0.1744, + "step": 12650 + }, + { + "epoch": 3.3664183076104313, + "grad_norm": 0.40063899755477905, + "learning_rate": 4.9115338790287876e-08, + "loss": 0.1642, + "step": 12651 + }, + { + "epoch": 3.366684406599255, + "grad_norm": 0.34992650151252747, + "learning_rate": 4.9100800957442666e-08, + "loss": 0.1852, + "step": 12652 + }, + { + "epoch": 3.3669505055880786, + "grad_norm": 0.2924349308013916, + "learning_rate": 4.9086264576346084e-08, + "loss": 0.1713, + "step": 12653 + }, + { + "epoch": 3.3672166045769027, + "grad_norm": 0.27757900953292847, + "learning_rate": 4.9071729647412653e-08, + "loss": 0.1763, + "step": 12654 + }, + { + "epoch": 3.3674827035657264, + "grad_norm": 0.4954482913017273, + "learning_rate": 4.9057196171057037e-08, + "loss": 0.1772, + "step": 12655 + }, + { + "epoch": 3.36774880255455, + "grad_norm": 0.26408207416534424, + "learning_rate": 4.9042664147693644e-08, + "loss": 0.1492, + "step": 12656 + }, + { + "epoch": 3.3680149015433742, + "grad_norm": 0.33908092975616455, + "learning_rate": 4.902813357773704e-08, + "loss": 0.1818, + "step": 12657 + }, + { + "epoch": 3.368281000532198, + "grad_norm": 0.2674720585346222, + "learning_rate": 4.901360446160164e-08, + "loss": 0.1733, + "step": 12658 + }, + { + "epoch": 3.368547099521022, + "grad_norm": 0.3332258462905884, + "learning_rate": 4.899907679970189e-08, + "loss": 0.1715, + "step": 12659 + }, + { + "epoch": 3.3688131985098457, + "grad_norm": 0.32765576243400574, + "learning_rate": 4.898455059245207e-08, + "loss": 0.1732, + "step": 12660 + }, + { + "epoch": 3.3690792974986694, + "grad_norm": 0.30358320474624634, + "learning_rate": 4.897002584026657e-08, + "loss": 0.1808, + "step": 12661 + }, + { + "epoch": 3.3693453964874935, + "grad_norm": 0.3243454098701477, + "learning_rate": 4.895550254355959e-08, + "loss": 0.1924, + "step": 12662 + }, + { + "epoch": 3.369611495476317, + "grad_norm": 0.43355560302734375, + "learning_rate": 4.894098070274545e-08, + "loss": 0.1903, + "step": 12663 + }, + { + "epoch": 3.369877594465141, + "grad_norm": 0.40096724033355713, + "learning_rate": 4.892646031823826e-08, + "loss": 0.1929, + "step": 12664 + }, + { + "epoch": 3.370143693453965, + "grad_norm": 0.3174104392528534, + "learning_rate": 4.8911941390452204e-08, + "loss": 0.1748, + "step": 12665 + }, + { + "epoch": 3.3704097924427887, + "grad_norm": 0.2936214804649353, + "learning_rate": 4.889742391980139e-08, + "loss": 0.1931, + "step": 12666 + }, + { + "epoch": 3.3706758914316124, + "grad_norm": 0.2951182723045349, + "learning_rate": 4.888290790669994e-08, + "loss": 0.1951, + "step": 12667 + }, + { + "epoch": 3.3709419904204365, + "grad_norm": 0.33935773372650146, + "learning_rate": 4.8868393351561787e-08, + "loss": 0.1725, + "step": 12668 + }, + { + "epoch": 3.37120808940926, + "grad_norm": 0.3506588935852051, + "learning_rate": 4.885388025480101e-08, + "loss": 0.1827, + "step": 12669 + }, + { + "epoch": 3.371474188398084, + "grad_norm": 0.26027175784111023, + "learning_rate": 4.8839368616831444e-08, + "loss": 0.1715, + "step": 12670 + }, + { + "epoch": 3.371740287386908, + "grad_norm": 0.35657209157943726, + "learning_rate": 4.882485843806706e-08, + "loss": 0.1651, + "step": 12671 + }, + { + "epoch": 3.3720063863757317, + "grad_norm": 0.28489336371421814, + "learning_rate": 4.881034971892171e-08, + "loss": 0.1709, + "step": 12672 + }, + { + "epoch": 3.3722724853645554, + "grad_norm": 0.2716493010520935, + "learning_rate": 4.87958424598092e-08, + "loss": 0.1792, + "step": 12673 + }, + { + "epoch": 3.3725385843533795, + "grad_norm": 0.2695724666118622, + "learning_rate": 4.8781336661143354e-08, + "loss": 0.1748, + "step": 12674 + }, + { + "epoch": 3.372804683342203, + "grad_norm": 0.3310728669166565, + "learning_rate": 4.8766832323337836e-08, + "loss": 0.1892, + "step": 12675 + }, + { + "epoch": 3.3730707823310273, + "grad_norm": 0.3433209955692291, + "learning_rate": 4.8752329446806407e-08, + "loss": 0.1748, + "step": 12676 + }, + { + "epoch": 3.373336881319851, + "grad_norm": 0.3462161421775818, + "learning_rate": 4.873782803196264e-08, + "loss": 0.1735, + "step": 12677 + }, + { + "epoch": 3.3736029803086747, + "grad_norm": 0.2557775676250458, + "learning_rate": 4.872332807922023e-08, + "loss": 0.1683, + "step": 12678 + }, + { + "epoch": 3.373869079297499, + "grad_norm": 0.5112306475639343, + "learning_rate": 4.8708829588992616e-08, + "loss": 0.1806, + "step": 12679 + }, + { + "epoch": 3.3741351782863225, + "grad_norm": 0.40323352813720703, + "learning_rate": 4.8694332561693493e-08, + "loss": 0.1897, + "step": 12680 + }, + { + "epoch": 3.374401277275146, + "grad_norm": 0.4172048270702362, + "learning_rate": 4.867983699773623e-08, + "loss": 0.1821, + "step": 12681 + }, + { + "epoch": 3.3746673762639703, + "grad_norm": 0.38451847434043884, + "learning_rate": 4.8665342897534344e-08, + "loss": 0.1972, + "step": 12682 + }, + { + "epoch": 3.374933475252794, + "grad_norm": 0.2744213044643402, + "learning_rate": 4.8650850261501155e-08, + "loss": 0.1695, + "step": 12683 + }, + { + "epoch": 3.375199574241618, + "grad_norm": 0.4795511066913605, + "learning_rate": 4.8636359090050096e-08, + "loss": 0.1735, + "step": 12684 + }, + { + "epoch": 3.3754656732304418, + "grad_norm": 0.2667846381664276, + "learning_rate": 4.8621869383594406e-08, + "loss": 0.1777, + "step": 12685 + }, + { + "epoch": 3.3757317722192655, + "grad_norm": 0.3128509223461151, + "learning_rate": 4.860738114254742e-08, + "loss": 0.1886, + "step": 12686 + }, + { + "epoch": 3.3759978712080896, + "grad_norm": 0.31914281845092773, + "learning_rate": 4.859289436732236e-08, + "loss": 0.1809, + "step": 12687 + }, + { + "epoch": 3.3762639701969133, + "grad_norm": 0.2676210403442383, + "learning_rate": 4.85784090583324e-08, + "loss": 0.1637, + "step": 12688 + }, + { + "epoch": 3.376530069185737, + "grad_norm": 0.3421271741390228, + "learning_rate": 4.8563925215990764e-08, + "loss": 0.1855, + "step": 12689 + }, + { + "epoch": 3.376796168174561, + "grad_norm": 0.2880382239818573, + "learning_rate": 4.854944284071045e-08, + "loss": 0.195, + "step": 12690 + }, + { + "epoch": 3.3770622671633848, + "grad_norm": 0.42306891083717346, + "learning_rate": 4.853496193290464e-08, + "loss": 0.1987, + "step": 12691 + }, + { + "epoch": 3.3773283661522084, + "grad_norm": 0.3465459644794464, + "learning_rate": 4.8520482492986233e-08, + "loss": 0.1896, + "step": 12692 + }, + { + "epoch": 3.3775944651410326, + "grad_norm": 0.2759523093700409, + "learning_rate": 4.850600452136829e-08, + "loss": 0.1837, + "step": 12693 + }, + { + "epoch": 3.3778605641298562, + "grad_norm": 0.26783594489097595, + "learning_rate": 4.849152801846375e-08, + "loss": 0.1587, + "step": 12694 + }, + { + "epoch": 3.37812666311868, + "grad_norm": 0.32943785190582275, + "learning_rate": 4.847705298468552e-08, + "loss": 0.1855, + "step": 12695 + }, + { + "epoch": 3.378392762107504, + "grad_norm": 0.26940593123435974, + "learning_rate": 4.8462579420446406e-08, + "loss": 0.1874, + "step": 12696 + }, + { + "epoch": 3.3786588610963277, + "grad_norm": 0.2788417339324951, + "learning_rate": 4.844810732615929e-08, + "loss": 0.1806, + "step": 12697 + }, + { + "epoch": 3.378924960085152, + "grad_norm": 0.27066677808761597, + "learning_rate": 4.843363670223688e-08, + "loss": 0.1529, + "step": 12698 + }, + { + "epoch": 3.3791910590739755, + "grad_norm": 0.46511906385421753, + "learning_rate": 4.8419167549091975e-08, + "loss": 0.1954, + "step": 12699 + }, + { + "epoch": 3.3794571580627992, + "grad_norm": 0.35582536458969116, + "learning_rate": 4.84046998671372e-08, + "loss": 0.1925, + "step": 12700 + }, + { + "epoch": 3.3797232570516234, + "grad_norm": 0.2865985631942749, + "learning_rate": 4.8390233656785215e-08, + "loss": 0.1778, + "step": 12701 + }, + { + "epoch": 3.379989356040447, + "grad_norm": 0.2713455557823181, + "learning_rate": 4.837576891844866e-08, + "loss": 0.1787, + "step": 12702 + }, + { + "epoch": 3.3802554550292707, + "grad_norm": 0.2849949300289154, + "learning_rate": 4.836130565254011e-08, + "loss": 0.1739, + "step": 12703 + }, + { + "epoch": 3.380521554018095, + "grad_norm": 0.3132776916027069, + "learning_rate": 4.8346843859472006e-08, + "loss": 0.1811, + "step": 12704 + }, + { + "epoch": 3.3807876530069185, + "grad_norm": 0.2973535358905792, + "learning_rate": 4.833238353965694e-08, + "loss": 0.1945, + "step": 12705 + }, + { + "epoch": 3.3810537519957427, + "grad_norm": 0.35155922174453735, + "learning_rate": 4.831792469350724e-08, + "loss": 0.191, + "step": 12706 + }, + { + "epoch": 3.3813198509845663, + "grad_norm": 0.3463151454925537, + "learning_rate": 4.830346732143535e-08, + "loss": 0.1782, + "step": 12707 + }, + { + "epoch": 3.38158594997339, + "grad_norm": 0.2818974256515503, + "learning_rate": 4.828901142385363e-08, + "loss": 0.1852, + "step": 12708 + }, + { + "epoch": 3.381852048962214, + "grad_norm": 0.3115909695625305, + "learning_rate": 4.827455700117438e-08, + "loss": 0.1913, + "step": 12709 + }, + { + "epoch": 3.382118147951038, + "grad_norm": 0.2943471372127533, + "learning_rate": 4.8260104053809915e-08, + "loss": 0.1734, + "step": 12710 + }, + { + "epoch": 3.3823842469398615, + "grad_norm": 0.2839491367340088, + "learning_rate": 4.824565258217239e-08, + "loss": 0.1906, + "step": 12711 + }, + { + "epoch": 3.3826503459286856, + "grad_norm": 0.4150611162185669, + "learning_rate": 4.823120258667406e-08, + "loss": 0.1926, + "step": 12712 + }, + { + "epoch": 3.3829164449175093, + "grad_norm": 0.3706526756286621, + "learning_rate": 4.821675406772698e-08, + "loss": 0.1651, + "step": 12713 + }, + { + "epoch": 3.383182543906333, + "grad_norm": 0.271262526512146, + "learning_rate": 4.820230702574336e-08, + "loss": 0.17, + "step": 12714 + }, + { + "epoch": 3.383448642895157, + "grad_norm": 0.27950507402420044, + "learning_rate": 4.8187861461135146e-08, + "loss": 0.1902, + "step": 12715 + }, + { + "epoch": 3.383714741883981, + "grad_norm": 0.25503936409950256, + "learning_rate": 4.8173417374314415e-08, + "loss": 0.1653, + "step": 12716 + }, + { + "epoch": 3.3839808408728045, + "grad_norm": 0.3779054284095764, + "learning_rate": 4.815897476569315e-08, + "loss": 0.1779, + "step": 12717 + }, + { + "epoch": 3.3842469398616286, + "grad_norm": 0.3650691509246826, + "learning_rate": 4.81445336356833e-08, + "loss": 0.1743, + "step": 12718 + }, + { + "epoch": 3.3845130388504523, + "grad_norm": 0.3193766474723816, + "learning_rate": 4.813009398469669e-08, + "loss": 0.1731, + "step": 12719 + }, + { + "epoch": 3.384779137839276, + "grad_norm": 0.2590502202510834, + "learning_rate": 4.811565581314524e-08, + "loss": 0.1531, + "step": 12720 + }, + { + "epoch": 3.3850452368281, + "grad_norm": 0.7315687537193298, + "learning_rate": 4.8101219121440684e-08, + "loss": 0.1716, + "step": 12721 + }, + { + "epoch": 3.385311335816924, + "grad_norm": 0.2826782166957855, + "learning_rate": 4.8086783909994825e-08, + "loss": 0.1605, + "step": 12722 + }, + { + "epoch": 3.385577434805748, + "grad_norm": 0.3677504062652588, + "learning_rate": 4.807235017921937e-08, + "loss": 0.1915, + "step": 12723 + }, + { + "epoch": 3.3858435337945716, + "grad_norm": 0.32097744941711426, + "learning_rate": 4.805791792952605e-08, + "loss": 0.1771, + "step": 12724 + }, + { + "epoch": 3.3861096327833953, + "grad_norm": 0.31708258390426636, + "learning_rate": 4.804348716132643e-08, + "loss": 0.1807, + "step": 12725 + }, + { + "epoch": 3.3863757317722194, + "grad_norm": 0.34099358320236206, + "learning_rate": 4.802905787503214e-08, + "loss": 0.1746, + "step": 12726 + }, + { + "epoch": 3.386641830761043, + "grad_norm": 0.31611862778663635, + "learning_rate": 4.801463007105476e-08, + "loss": 0.1794, + "step": 12727 + }, + { + "epoch": 3.3869079297498668, + "grad_norm": 0.3170066475868225, + "learning_rate": 4.8000203749805736e-08, + "loss": 0.1802, + "step": 12728 + }, + { + "epoch": 3.387174028738691, + "grad_norm": 0.31027111411094666, + "learning_rate": 4.7985778911696606e-08, + "loss": 0.1835, + "step": 12729 + }, + { + "epoch": 3.3874401277275146, + "grad_norm": 0.40003326535224915, + "learning_rate": 4.7971355557138684e-08, + "loss": 0.1789, + "step": 12730 + }, + { + "epoch": 3.3877062267163387, + "grad_norm": 0.34449324011802673, + "learning_rate": 4.7956933686543496e-08, + "loss": 0.1719, + "step": 12731 + }, + { + "epoch": 3.3879723257051624, + "grad_norm": 0.28498685359954834, + "learning_rate": 4.7942513300322296e-08, + "loss": 0.1677, + "step": 12732 + }, + { + "epoch": 3.388238424693986, + "grad_norm": 0.39801082015037537, + "learning_rate": 4.792809439888643e-08, + "loss": 0.1993, + "step": 12733 + }, + { + "epoch": 3.38850452368281, + "grad_norm": 0.32154935598373413, + "learning_rate": 4.791367698264709e-08, + "loss": 0.1765, + "step": 12734 + }, + { + "epoch": 3.388770622671634, + "grad_norm": 0.3283633887767792, + "learning_rate": 4.7899261052015564e-08, + "loss": 0.1684, + "step": 12735 + }, + { + "epoch": 3.3890367216604576, + "grad_norm": 0.3036288619041443, + "learning_rate": 4.788484660740295e-08, + "loss": 0.1706, + "step": 12736 + }, + { + "epoch": 3.3893028206492817, + "grad_norm": 0.3494951128959656, + "learning_rate": 4.787043364922041e-08, + "loss": 0.1789, + "step": 12737 + }, + { + "epoch": 3.3895689196381054, + "grad_norm": 0.3030520975589752, + "learning_rate": 4.785602217787904e-08, + "loss": 0.1656, + "step": 12738 + }, + { + "epoch": 3.389835018626929, + "grad_norm": 0.4126211404800415, + "learning_rate": 4.7841612193789916e-08, + "loss": 0.1926, + "step": 12739 + }, + { + "epoch": 3.390101117615753, + "grad_norm": 0.3448926508426666, + "learning_rate": 4.782720369736396e-08, + "loss": 0.167, + "step": 12740 + }, + { + "epoch": 3.390367216604577, + "grad_norm": 0.2564486861228943, + "learning_rate": 4.781279668901222e-08, + "loss": 0.1587, + "step": 12741 + }, + { + "epoch": 3.3906333155934005, + "grad_norm": 0.34471869468688965, + "learning_rate": 4.779839116914552e-08, + "loss": 0.1821, + "step": 12742 + }, + { + "epoch": 3.3908994145822247, + "grad_norm": 0.2874964475631714, + "learning_rate": 4.7783987138174796e-08, + "loss": 0.173, + "step": 12743 + }, + { + "epoch": 3.3911655135710483, + "grad_norm": 0.2848646342754364, + "learning_rate": 4.776958459651086e-08, + "loss": 0.1837, + "step": 12744 + }, + { + "epoch": 3.3914316125598725, + "grad_norm": 0.2870907783508301, + "learning_rate": 4.775518354456451e-08, + "loss": 0.1878, + "step": 12745 + }, + { + "epoch": 3.391697711548696, + "grad_norm": 0.3688041567802429, + "learning_rate": 4.7740783982746536e-08, + "loss": 0.1865, + "step": 12746 + }, + { + "epoch": 3.39196381053752, + "grad_norm": 0.35151779651641846, + "learning_rate": 4.772638591146755e-08, + "loss": 0.1822, + "step": 12747 + }, + { + "epoch": 3.392229909526344, + "grad_norm": 0.28820890188217163, + "learning_rate": 4.77119893311383e-08, + "loss": 0.1744, + "step": 12748 + }, + { + "epoch": 3.3924960085151676, + "grad_norm": 0.5507851243019104, + "learning_rate": 4.769759424216934e-08, + "loss": 0.1923, + "step": 12749 + }, + { + "epoch": 3.3927621075039913, + "grad_norm": 0.40911969542503357, + "learning_rate": 4.7683200644971314e-08, + "loss": 0.1844, + "step": 12750 + }, + { + "epoch": 3.3930282064928154, + "grad_norm": 0.2781142592430115, + "learning_rate": 4.7668808539954665e-08, + "loss": 0.1764, + "step": 12751 + }, + { + "epoch": 3.393294305481639, + "grad_norm": 0.2662924528121948, + "learning_rate": 4.765441792752996e-08, + "loss": 0.1709, + "step": 12752 + }, + { + "epoch": 3.3935604044704633, + "grad_norm": 0.27484145760536194, + "learning_rate": 4.76400288081076e-08, + "loss": 0.1837, + "step": 12753 + }, + { + "epoch": 3.393826503459287, + "grad_norm": 0.46229082345962524, + "learning_rate": 4.7625641182098075e-08, + "loss": 0.1922, + "step": 12754 + }, + { + "epoch": 3.3940926024481106, + "grad_norm": 0.29327666759490967, + "learning_rate": 4.7611255049911655e-08, + "loss": 0.1829, + "step": 12755 + }, + { + "epoch": 3.3943587014369347, + "grad_norm": 0.4109322726726532, + "learning_rate": 4.759687041195873e-08, + "loss": 0.1796, + "step": 12756 + }, + { + "epoch": 3.3946248004257584, + "grad_norm": 0.34960463643074036, + "learning_rate": 4.758248726864953e-08, + "loss": 0.1851, + "step": 12757 + }, + { + "epoch": 3.394890899414582, + "grad_norm": 0.2895500659942627, + "learning_rate": 4.75681056203943e-08, + "loss": 0.1712, + "step": 12758 + }, + { + "epoch": 3.3951569984034062, + "grad_norm": 0.2678811252117157, + "learning_rate": 4.7553725467603233e-08, + "loss": 0.1633, + "step": 12759 + }, + { + "epoch": 3.39542309739223, + "grad_norm": 0.39178362488746643, + "learning_rate": 4.753934681068654e-08, + "loss": 0.1827, + "step": 12760 + }, + { + "epoch": 3.3956891963810536, + "grad_norm": 0.28804096579551697, + "learning_rate": 4.752496965005425e-08, + "loss": 0.2026, + "step": 12761 + }, + { + "epoch": 3.3959552953698777, + "grad_norm": 0.28206223249435425, + "learning_rate": 4.7510593986116446e-08, + "loss": 0.1837, + "step": 12762 + }, + { + "epoch": 3.3962213943587014, + "grad_norm": 0.40358400344848633, + "learning_rate": 4.74962198192832e-08, + "loss": 0.1839, + "step": 12763 + }, + { + "epoch": 3.396487493347525, + "grad_norm": 0.26384031772613525, + "learning_rate": 4.748184714996444e-08, + "loss": 0.1561, + "step": 12764 + }, + { + "epoch": 3.396753592336349, + "grad_norm": 0.29425081610679626, + "learning_rate": 4.7467475978570136e-08, + "loss": 0.1754, + "step": 12765 + }, + { + "epoch": 3.397019691325173, + "grad_norm": 0.26392894983291626, + "learning_rate": 4.745310630551015e-08, + "loss": 0.1658, + "step": 12766 + }, + { + "epoch": 3.3972857903139966, + "grad_norm": 0.26383280754089355, + "learning_rate": 4.743873813119435e-08, + "loss": 0.1705, + "step": 12767 + }, + { + "epoch": 3.3975518893028207, + "grad_norm": 0.3514374792575836, + "learning_rate": 4.742437145603253e-08, + "loss": 0.1894, + "step": 12768 + }, + { + "epoch": 3.3978179882916444, + "grad_norm": 0.28391963243484497, + "learning_rate": 4.741000628043453e-08, + "loss": 0.1758, + "step": 12769 + }, + { + "epoch": 3.3980840872804685, + "grad_norm": 0.3285646140575409, + "learning_rate": 4.7395642604809974e-08, + "loss": 0.1669, + "step": 12770 + }, + { + "epoch": 3.398350186269292, + "grad_norm": 0.24830541014671326, + "learning_rate": 4.7381280429568626e-08, + "loss": 0.1529, + "step": 12771 + }, + { + "epoch": 3.398616285258116, + "grad_norm": 0.2736298441886902, + "learning_rate": 4.736691975512005e-08, + "loss": 0.1792, + "step": 12772 + }, + { + "epoch": 3.39888238424694, + "grad_norm": 0.2787647545337677, + "learning_rate": 4.735256058187388e-08, + "loss": 0.169, + "step": 12773 + }, + { + "epoch": 3.3991484832357637, + "grad_norm": 0.43314263224601746, + "learning_rate": 4.7338202910239666e-08, + "loss": 0.1917, + "step": 12774 + }, + { + "epoch": 3.3994145822245874, + "grad_norm": 0.277165025472641, + "learning_rate": 4.732384674062696e-08, + "loss": 0.1741, + "step": 12775 + }, + { + "epoch": 3.3996806812134115, + "grad_norm": 0.47136634588241577, + "learning_rate": 4.730949207344514e-08, + "loss": 0.2015, + "step": 12776 + }, + { + "epoch": 3.399946780202235, + "grad_norm": 0.27243661880493164, + "learning_rate": 4.7295138909103725e-08, + "loss": 0.1787, + "step": 12777 + }, + { + "epoch": 3.4002128791910593, + "grad_norm": 0.47645968198776245, + "learning_rate": 4.728078724801201e-08, + "loss": 0.1936, + "step": 12778 + }, + { + "epoch": 3.400478978179883, + "grad_norm": 0.2982167899608612, + "learning_rate": 4.726643709057941e-08, + "loss": 0.1692, + "step": 12779 + }, + { + "epoch": 3.4007450771687067, + "grad_norm": 0.8660852909088135, + "learning_rate": 4.7252088437215154e-08, + "loss": 0.1863, + "step": 12780 + }, + { + "epoch": 3.401011176157531, + "grad_norm": 0.2617814242839813, + "learning_rate": 4.723774128832848e-08, + "loss": 0.1637, + "step": 12781 + }, + { + "epoch": 3.4012772751463545, + "grad_norm": 0.2731413245201111, + "learning_rate": 4.7223395644328714e-08, + "loss": 0.1653, + "step": 12782 + }, + { + "epoch": 3.401543374135178, + "grad_norm": 0.3134324252605438, + "learning_rate": 4.720905150562492e-08, + "loss": 0.2053, + "step": 12783 + }, + { + "epoch": 3.4018094731240023, + "grad_norm": 0.3197225332260132, + "learning_rate": 4.7194708872626286e-08, + "loss": 0.1825, + "step": 12784 + }, + { + "epoch": 3.402075572112826, + "grad_norm": 0.264885276556015, + "learning_rate": 4.718036774574182e-08, + "loss": 0.1661, + "step": 12785 + }, + { + "epoch": 3.4023416711016496, + "grad_norm": 0.2768230140209198, + "learning_rate": 4.716602812538065e-08, + "loss": 0.1684, + "step": 12786 + }, + { + "epoch": 3.4026077700904738, + "grad_norm": 0.28580522537231445, + "learning_rate": 4.7151690011951686e-08, + "loss": 0.177, + "step": 12787 + }, + { + "epoch": 3.4028738690792975, + "grad_norm": 0.2725575566291809, + "learning_rate": 4.71373534058639e-08, + "loss": 0.1706, + "step": 12788 + }, + { + "epoch": 3.403139968068121, + "grad_norm": 0.26941728591918945, + "learning_rate": 4.712301830752623e-08, + "loss": 0.1651, + "step": 12789 + }, + { + "epoch": 3.4034060670569453, + "grad_norm": 0.27030444145202637, + "learning_rate": 4.710868471734756e-08, + "loss": 0.1846, + "step": 12790 + }, + { + "epoch": 3.403672166045769, + "grad_norm": 0.32930758595466614, + "learning_rate": 4.7094352635736646e-08, + "loss": 0.1899, + "step": 12791 + }, + { + "epoch": 3.4039382650345926, + "grad_norm": 0.2580977976322174, + "learning_rate": 4.708002206310233e-08, + "loss": 0.1585, + "step": 12792 + }, + { + "epoch": 3.4042043640234168, + "grad_norm": 0.292881578207016, + "learning_rate": 4.70656929998533e-08, + "loss": 0.1745, + "step": 12793 + }, + { + "epoch": 3.4044704630122404, + "grad_norm": 0.29180097579956055, + "learning_rate": 4.7051365446398304e-08, + "loss": 0.1836, + "step": 12794 + }, + { + "epoch": 3.4047365620010646, + "grad_norm": 0.3214211165904999, + "learning_rate": 4.7037039403145896e-08, + "loss": 0.171, + "step": 12795 + }, + { + "epoch": 3.4050026609898882, + "grad_norm": 0.3816308379173279, + "learning_rate": 4.702271487050482e-08, + "loss": 0.1925, + "step": 12796 + }, + { + "epoch": 3.405268759978712, + "grad_norm": 0.2968594431877136, + "learning_rate": 4.700839184888353e-08, + "loss": 0.1715, + "step": 12797 + }, + { + "epoch": 3.405534858967536, + "grad_norm": 0.3677350580692291, + "learning_rate": 4.69940703386906e-08, + "loss": 0.1834, + "step": 12798 + }, + { + "epoch": 3.4058009579563597, + "grad_norm": 0.3319767713546753, + "learning_rate": 4.6979750340334524e-08, + "loss": 0.1747, + "step": 12799 + }, + { + "epoch": 3.406067056945184, + "grad_norm": 0.27423763275146484, + "learning_rate": 4.696543185422368e-08, + "loss": 0.1804, + "step": 12800 + }, + { + "epoch": 3.4063331559340075, + "grad_norm": 0.2797483503818512, + "learning_rate": 4.6951114880766515e-08, + "loss": 0.1901, + "step": 12801 + }, + { + "epoch": 3.406599254922831, + "grad_norm": 0.3198275566101074, + "learning_rate": 4.693679942037133e-08, + "loss": 0.1768, + "step": 12802 + }, + { + "epoch": 3.4068653539116553, + "grad_norm": 0.3838849663734436, + "learning_rate": 4.6922485473446446e-08, + "loss": 0.1996, + "step": 12803 + }, + { + "epoch": 3.407131452900479, + "grad_norm": 0.2773436903953552, + "learning_rate": 4.690817304040015e-08, + "loss": 0.1696, + "step": 12804 + }, + { + "epoch": 3.4073975518893027, + "grad_norm": 0.270097017288208, + "learning_rate": 4.689386212164067e-08, + "loss": 0.1716, + "step": 12805 + }, + { + "epoch": 3.407663650878127, + "grad_norm": 0.28095024824142456, + "learning_rate": 4.687955271757612e-08, + "loss": 0.1817, + "step": 12806 + }, + { + "epoch": 3.4079297498669505, + "grad_norm": 0.31614458560943604, + "learning_rate": 4.686524482861473e-08, + "loss": 0.2003, + "step": 12807 + }, + { + "epoch": 3.408195848855774, + "grad_norm": 0.4781763255596161, + "learning_rate": 4.685093845516447e-08, + "loss": 0.1768, + "step": 12808 + }, + { + "epoch": 3.4084619478445983, + "grad_norm": 0.37265074253082275, + "learning_rate": 4.6836633597633456e-08, + "loss": 0.1769, + "step": 12809 + }, + { + "epoch": 3.408728046833422, + "grad_norm": 0.2569989562034607, + "learning_rate": 4.6822330256429696e-08, + "loss": 0.1742, + "step": 12810 + }, + { + "epoch": 3.4089941458222457, + "grad_norm": 0.2772442400455475, + "learning_rate": 4.680802843196117e-08, + "loss": 0.1655, + "step": 12811 + }, + { + "epoch": 3.40926024481107, + "grad_norm": 0.3694859743118286, + "learning_rate": 4.6793728124635725e-08, + "loss": 0.2069, + "step": 12812 + }, + { + "epoch": 3.4095263437998935, + "grad_norm": 0.2599116861820221, + "learning_rate": 4.6779429334861306e-08, + "loss": 0.1525, + "step": 12813 + }, + { + "epoch": 3.409792442788717, + "grad_norm": 0.3175489008426666, + "learning_rate": 4.6765132063045686e-08, + "loss": 0.1796, + "step": 12814 + }, + { + "epoch": 3.4100585417775413, + "grad_norm": 0.3626452088356018, + "learning_rate": 4.675083630959671e-08, + "loss": 0.1784, + "step": 12815 + }, + { + "epoch": 3.410324640766365, + "grad_norm": 0.34394603967666626, + "learning_rate": 4.673654207492206e-08, + "loss": 0.1893, + "step": 12816 + }, + { + "epoch": 3.410590739755189, + "grad_norm": 0.2570471465587616, + "learning_rate": 4.6722249359429454e-08, + "loss": 0.1502, + "step": 12817 + }, + { + "epoch": 3.410856838744013, + "grad_norm": 0.30611854791641235, + "learning_rate": 4.670795816352657e-08, + "loss": 0.1545, + "step": 12818 + }, + { + "epoch": 3.4111229377328365, + "grad_norm": 0.30513355135917664, + "learning_rate": 4.669366848762101e-08, + "loss": 0.1916, + "step": 12819 + }, + { + "epoch": 3.4113890367216606, + "grad_norm": 0.4237916171550751, + "learning_rate": 4.667938033212039e-08, + "loss": 0.1765, + "step": 12820 + }, + { + "epoch": 3.4116551357104843, + "grad_norm": 0.340166836977005, + "learning_rate": 4.666509369743215e-08, + "loss": 0.1647, + "step": 12821 + }, + { + "epoch": 3.411921234699308, + "grad_norm": 0.2701134979724884, + "learning_rate": 4.665080858396386e-08, + "loss": 0.1642, + "step": 12822 + }, + { + "epoch": 3.412187333688132, + "grad_norm": 0.31941521167755127, + "learning_rate": 4.663652499212289e-08, + "loss": 0.1669, + "step": 12823 + }, + { + "epoch": 3.4124534326769558, + "grad_norm": 0.29138123989105225, + "learning_rate": 4.662224292231667e-08, + "loss": 0.1776, + "step": 12824 + }, + { + "epoch": 3.41271953166578, + "grad_norm": 0.3627709448337555, + "learning_rate": 4.660796237495254e-08, + "loss": 0.19, + "step": 12825 + }, + { + "epoch": 3.4129856306546036, + "grad_norm": 0.26996326446533203, + "learning_rate": 4.659368335043786e-08, + "loss": 0.1769, + "step": 12826 + }, + { + "epoch": 3.4132517296434273, + "grad_norm": 0.2707536220550537, + "learning_rate": 4.657940584917982e-08, + "loss": 0.1741, + "step": 12827 + }, + { + "epoch": 3.4135178286322514, + "grad_norm": 0.2897144854068756, + "learning_rate": 4.656512987158572e-08, + "loss": 0.1687, + "step": 12828 + }, + { + "epoch": 3.413783927621075, + "grad_norm": 0.2694264054298401, + "learning_rate": 4.655085541806266e-08, + "loss": 0.1878, + "step": 12829 + }, + { + "epoch": 3.4140500266098988, + "grad_norm": 0.27714774012565613, + "learning_rate": 4.653658248901786e-08, + "loss": 0.1698, + "step": 12830 + }, + { + "epoch": 3.414316125598723, + "grad_norm": 0.33292052149772644, + "learning_rate": 4.6522311084858336e-08, + "loss": 0.1842, + "step": 12831 + }, + { + "epoch": 3.4145822245875466, + "grad_norm": 0.2679791748523712, + "learning_rate": 4.6508041205991165e-08, + "loss": 0.1544, + "step": 12832 + }, + { + "epoch": 3.4148483235763702, + "grad_norm": 0.2876748740673065, + "learning_rate": 4.6493772852823357e-08, + "loss": 0.1791, + "step": 12833 + }, + { + "epoch": 3.4151144225651944, + "grad_norm": 0.27764174342155457, + "learning_rate": 4.647950602576188e-08, + "loss": 0.1752, + "step": 12834 + }, + { + "epoch": 3.415380521554018, + "grad_norm": 0.33615440130233765, + "learning_rate": 4.64652407252137e-08, + "loss": 0.1732, + "step": 12835 + }, + { + "epoch": 3.4156466205428417, + "grad_norm": 0.2755594253540039, + "learning_rate": 4.645097695158559e-08, + "loss": 0.1814, + "step": 12836 + }, + { + "epoch": 3.415912719531666, + "grad_norm": 0.27443620562553406, + "learning_rate": 4.643671470528446e-08, + "loss": 0.1605, + "step": 12837 + }, + { + "epoch": 3.4161788185204895, + "grad_norm": 0.28442588448524475, + "learning_rate": 4.642245398671706e-08, + "loss": 0.1731, + "step": 12838 + }, + { + "epoch": 3.4164449175093132, + "grad_norm": 0.29219838976860046, + "learning_rate": 4.640819479629013e-08, + "loss": 0.1659, + "step": 12839 + }, + { + "epoch": 3.4167110164981374, + "grad_norm": 0.42939502000808716, + "learning_rate": 4.63939371344104e-08, + "loss": 0.186, + "step": 12840 + }, + { + "epoch": 3.416977115486961, + "grad_norm": 0.33844318985939026, + "learning_rate": 4.637968100148455e-08, + "loss": 0.1687, + "step": 12841 + }, + { + "epoch": 3.417243214475785, + "grad_norm": 0.37920325994491577, + "learning_rate": 4.6365426397919114e-08, + "loss": 0.2052, + "step": 12842 + }, + { + "epoch": 3.417509313464609, + "grad_norm": 0.3482537567615509, + "learning_rate": 4.6351173324120765e-08, + "loss": 0.2057, + "step": 12843 + }, + { + "epoch": 3.4177754124534325, + "grad_norm": 0.28920671343803406, + "learning_rate": 4.633692178049592e-08, + "loss": 0.1671, + "step": 12844 + }, + { + "epoch": 3.4180415114422567, + "grad_norm": 0.394144207239151, + "learning_rate": 4.6322671767451146e-08, + "loss": 0.175, + "step": 12845 + }, + { + "epoch": 3.4183076104310803, + "grad_norm": 0.2540631890296936, + "learning_rate": 4.630842328539278e-08, + "loss": 0.1718, + "step": 12846 + }, + { + "epoch": 3.418573709419904, + "grad_norm": 0.26997992396354675, + "learning_rate": 4.629417633472738e-08, + "loss": 0.1765, + "step": 12847 + }, + { + "epoch": 3.418839808408728, + "grad_norm": 0.28223663568496704, + "learning_rate": 4.627993091586114e-08, + "loss": 0.1875, + "step": 12848 + }, + { + "epoch": 3.419105907397552, + "grad_norm": 0.3607085347175598, + "learning_rate": 4.6265687029200496e-08, + "loss": 0.1855, + "step": 12849 + }, + { + "epoch": 3.419372006386376, + "grad_norm": 0.26444563269615173, + "learning_rate": 4.625144467515161e-08, + "loss": 0.1747, + "step": 12850 + }, + { + "epoch": 3.4196381053751996, + "grad_norm": 0.4346417486667633, + "learning_rate": 4.6237203854120774e-08, + "loss": 0.1676, + "step": 12851 + }, + { + "epoch": 3.4199042043640233, + "grad_norm": 0.2871024012565613, + "learning_rate": 4.62229645665141e-08, + "loss": 0.1615, + "step": 12852 + }, + { + "epoch": 3.4201703033528474, + "grad_norm": 0.28860217332839966, + "learning_rate": 4.6208726812737754e-08, + "loss": 0.1747, + "step": 12853 + }, + { + "epoch": 3.420436402341671, + "grad_norm": 0.33675336837768555, + "learning_rate": 4.619449059319783e-08, + "loss": 0.1924, + "step": 12854 + }, + { + "epoch": 3.420702501330495, + "grad_norm": 0.27286121249198914, + "learning_rate": 4.618025590830038e-08, + "loss": 0.1704, + "step": 12855 + }, + { + "epoch": 3.420968600319319, + "grad_norm": 0.28849270939826965, + "learning_rate": 4.616602275845143e-08, + "loss": 0.1859, + "step": 12856 + }, + { + "epoch": 3.4212346993081426, + "grad_norm": 0.32726362347602844, + "learning_rate": 4.615179114405687e-08, + "loss": 0.1851, + "step": 12857 + }, + { + "epoch": 3.4215007982969663, + "grad_norm": 0.30364251136779785, + "learning_rate": 4.613756106552269e-08, + "loss": 0.1995, + "step": 12858 + }, + { + "epoch": 3.4217668972857904, + "grad_norm": 0.2806204557418823, + "learning_rate": 4.6123332523254685e-08, + "loss": 0.1717, + "step": 12859 + }, + { + "epoch": 3.422032996274614, + "grad_norm": 0.366317480802536, + "learning_rate": 4.610910551765872e-08, + "loss": 0.1658, + "step": 12860 + }, + { + "epoch": 3.422299095263438, + "grad_norm": 0.2727493345737457, + "learning_rate": 4.6094880049140575e-08, + "loss": 0.1883, + "step": 12861 + }, + { + "epoch": 3.422565194252262, + "grad_norm": 0.26807600259780884, + "learning_rate": 4.608065611810602e-08, + "loss": 0.1625, + "step": 12862 + }, + { + "epoch": 3.4228312932410856, + "grad_norm": 0.27257516980171204, + "learning_rate": 4.6066433724960696e-08, + "loss": 0.1788, + "step": 12863 + }, + { + "epoch": 3.4230973922299097, + "grad_norm": 0.26580923795700073, + "learning_rate": 4.6052212870110316e-08, + "loss": 0.1765, + "step": 12864 + }, + { + "epoch": 3.4233634912187334, + "grad_norm": 0.3155457377433777, + "learning_rate": 4.603799355396041e-08, + "loss": 0.1846, + "step": 12865 + }, + { + "epoch": 3.423629590207557, + "grad_norm": 0.2726312577724457, + "learning_rate": 4.602377577691663e-08, + "loss": 0.1624, + "step": 12866 + }, + { + "epoch": 3.423895689196381, + "grad_norm": 0.35856592655181885, + "learning_rate": 4.60095595393844e-08, + "loss": 0.1717, + "step": 12867 + }, + { + "epoch": 3.424161788185205, + "grad_norm": 0.2626224458217621, + "learning_rate": 4.599534484176926e-08, + "loss": 0.1757, + "step": 12868 + }, + { + "epoch": 3.4244278871740286, + "grad_norm": 0.3272273540496826, + "learning_rate": 4.5981131684476616e-08, + "loss": 0.1747, + "step": 12869 + }, + { + "epoch": 3.4246939861628527, + "grad_norm": 0.25611332058906555, + "learning_rate": 4.596692006791191e-08, + "loss": 0.162, + "step": 12870 + }, + { + "epoch": 3.4249600851516764, + "grad_norm": 0.2530340254306793, + "learning_rate": 4.595270999248041e-08, + "loss": 0.1631, + "step": 12871 + }, + { + "epoch": 3.4252261841405005, + "grad_norm": 0.28994739055633545, + "learning_rate": 4.5938501458587444e-08, + "loss": 0.1756, + "step": 12872 + }, + { + "epoch": 3.425492283129324, + "grad_norm": 0.26358675956726074, + "learning_rate": 4.5924294466638315e-08, + "loss": 0.1653, + "step": 12873 + }, + { + "epoch": 3.425758382118148, + "grad_norm": 0.2855128049850464, + "learning_rate": 4.591008901703816e-08, + "loss": 0.181, + "step": 12874 + }, + { + "epoch": 3.426024481106972, + "grad_norm": 0.3548518121242523, + "learning_rate": 4.589588511019218e-08, + "loss": 0.1784, + "step": 12875 + }, + { + "epoch": 3.4262905800957957, + "grad_norm": 0.2795277237892151, + "learning_rate": 4.58816827465055e-08, + "loss": 0.1676, + "step": 12876 + }, + { + "epoch": 3.4265566790846194, + "grad_norm": 0.33852481842041016, + "learning_rate": 4.5867481926383246e-08, + "loss": 0.1807, + "step": 12877 + }, + { + "epoch": 3.4268227780734435, + "grad_norm": 0.29803982377052307, + "learning_rate": 4.5853282650230365e-08, + "loss": 0.1926, + "step": 12878 + }, + { + "epoch": 3.427088877062267, + "grad_norm": 0.42149943113327026, + "learning_rate": 4.583908491845193e-08, + "loss": 0.1824, + "step": 12879 + }, + { + "epoch": 3.427354976051091, + "grad_norm": 0.2871336042881012, + "learning_rate": 4.582488873145281e-08, + "loss": 0.1878, + "step": 12880 + }, + { + "epoch": 3.427621075039915, + "grad_norm": 0.4251781404018402, + "learning_rate": 4.581069408963801e-08, + "loss": 0.1962, + "step": 12881 + }, + { + "epoch": 3.4278871740287387, + "grad_norm": 0.32805386185646057, + "learning_rate": 4.579650099341228e-08, + "loss": 0.1911, + "step": 12882 + }, + { + "epoch": 3.4281532730175623, + "grad_norm": 0.4454341232776642, + "learning_rate": 4.57823094431805e-08, + "loss": 0.1768, + "step": 12883 + }, + { + "epoch": 3.4284193720063865, + "grad_norm": 0.27217355370521545, + "learning_rate": 4.576811943934742e-08, + "loss": 0.1771, + "step": 12884 + }, + { + "epoch": 3.42868547099521, + "grad_norm": 0.3665873408317566, + "learning_rate": 4.575393098231781e-08, + "loss": 0.1913, + "step": 12885 + }, + { + "epoch": 3.428951569984034, + "grad_norm": 0.280286967754364, + "learning_rate": 4.57397440724963e-08, + "loss": 0.182, + "step": 12886 + }, + { + "epoch": 3.429217668972858, + "grad_norm": 0.33440130949020386, + "learning_rate": 4.5725558710287584e-08, + "loss": 0.1801, + "step": 12887 + }, + { + "epoch": 3.4294837679616816, + "grad_norm": 0.26096826791763306, + "learning_rate": 4.571137489609618e-08, + "loss": 0.1644, + "step": 12888 + }, + { + "epoch": 3.4297498669505058, + "grad_norm": 0.32919299602508545, + "learning_rate": 4.5697192630326685e-08, + "loss": 0.1785, + "step": 12889 + }, + { + "epoch": 3.4300159659393294, + "grad_norm": 0.26896336674690247, + "learning_rate": 4.568301191338362e-08, + "loss": 0.16, + "step": 12890 + }, + { + "epoch": 3.430282064928153, + "grad_norm": 0.2924731969833374, + "learning_rate": 4.5668832745671416e-08, + "loss": 0.1769, + "step": 12891 + }, + { + "epoch": 3.4305481639169773, + "grad_norm": 0.31250616908073425, + "learning_rate": 4.5654655127594555e-08, + "loss": 0.176, + "step": 12892 + }, + { + "epoch": 3.430814262905801, + "grad_norm": 0.27255576848983765, + "learning_rate": 4.5640479059557316e-08, + "loss": 0.1547, + "step": 12893 + }, + { + "epoch": 3.4310803618946246, + "grad_norm": 0.3595731258392334, + "learning_rate": 4.5626304541964124e-08, + "loss": 0.171, + "step": 12894 + }, + { + "epoch": 3.4313464608834487, + "grad_norm": 0.39164289832115173, + "learning_rate": 4.561213157521918e-08, + "loss": 0.1774, + "step": 12895 + }, + { + "epoch": 3.4316125598722724, + "grad_norm": 0.2838212549686432, + "learning_rate": 4.5597960159726766e-08, + "loss": 0.184, + "step": 12896 + }, + { + "epoch": 3.4318786588610966, + "grad_norm": 0.32662728428840637, + "learning_rate": 4.558379029589108e-08, + "loss": 0.1896, + "step": 12897 + }, + { + "epoch": 3.4321447578499202, + "grad_norm": 0.3781350255012512, + "learning_rate": 4.55696219841163e-08, + "loss": 0.1894, + "step": 12898 + }, + { + "epoch": 3.432410856838744, + "grad_norm": 0.2798425257205963, + "learning_rate": 4.5555455224806484e-08, + "loss": 0.1702, + "step": 12899 + }, + { + "epoch": 3.432676955827568, + "grad_norm": 0.32095977663993835, + "learning_rate": 4.554129001836575e-08, + "loss": 0.1662, + "step": 12900 + }, + { + "epoch": 3.4329430548163917, + "grad_norm": 0.3153438866138458, + "learning_rate": 4.5527126365198065e-08, + "loss": 0.2035, + "step": 12901 + }, + { + "epoch": 3.4332091538052154, + "grad_norm": 0.4058948755264282, + "learning_rate": 4.551296426570745e-08, + "loss": 0.1921, + "step": 12902 + }, + { + "epoch": 3.4334752527940395, + "grad_norm": 0.3074467182159424, + "learning_rate": 4.5498803720297796e-08, + "loss": 0.1782, + "step": 12903 + }, + { + "epoch": 3.433741351782863, + "grad_norm": 0.2582472264766693, + "learning_rate": 4.548464472937301e-08, + "loss": 0.1553, + "step": 12904 + }, + { + "epoch": 3.434007450771687, + "grad_norm": 0.2901040017604828, + "learning_rate": 4.5470487293336935e-08, + "loss": 0.1883, + "step": 12905 + }, + { + "epoch": 3.434273549760511, + "grad_norm": 0.2659689784049988, + "learning_rate": 4.5456331412593405e-08, + "loss": 0.1714, + "step": 12906 + }, + { + "epoch": 3.4345396487493347, + "grad_norm": 0.37774917483329773, + "learning_rate": 4.544217708754611e-08, + "loss": 0.189, + "step": 12907 + }, + { + "epoch": 3.4348057477381584, + "grad_norm": 0.37687668204307556, + "learning_rate": 4.542802431859878e-08, + "loss": 0.1841, + "step": 12908 + }, + { + "epoch": 3.4350718467269825, + "grad_norm": 0.2799728512763977, + "learning_rate": 4.5413873106155133e-08, + "loss": 0.1813, + "step": 12909 + }, + { + "epoch": 3.435337945715806, + "grad_norm": 0.4128085970878601, + "learning_rate": 4.539972345061872e-08, + "loss": 0.2116, + "step": 12910 + }, + { + "epoch": 3.43560404470463, + "grad_norm": 0.2799268662929535, + "learning_rate": 4.538557535239315e-08, + "loss": 0.176, + "step": 12911 + }, + { + "epoch": 3.435870143693454, + "grad_norm": 0.2985495328903198, + "learning_rate": 4.537142881188194e-08, + "loss": 0.2017, + "step": 12912 + }, + { + "epoch": 3.4361362426822777, + "grad_norm": 0.2841099202632904, + "learning_rate": 4.5357283829488626e-08, + "loss": 0.1786, + "step": 12913 + }, + { + "epoch": 3.436402341671102, + "grad_norm": 0.270638644695282, + "learning_rate": 4.5343140405616586e-08, + "loss": 0.1803, + "step": 12914 + }, + { + "epoch": 3.4366684406599255, + "grad_norm": 0.43267834186553955, + "learning_rate": 4.53289985406693e-08, + "loss": 0.1858, + "step": 12915 + }, + { + "epoch": 3.436934539648749, + "grad_norm": 0.44506871700286865, + "learning_rate": 4.5314858235050015e-08, + "loss": 0.1899, + "step": 12916 + }, + { + "epoch": 3.4372006386375733, + "grad_norm": 0.3068370521068573, + "learning_rate": 4.5300719489162145e-08, + "loss": 0.1763, + "step": 12917 + }, + { + "epoch": 3.437466737626397, + "grad_norm": 0.26865431666374207, + "learning_rate": 4.528658230340887e-08, + "loss": 0.1717, + "step": 12918 + }, + { + "epoch": 3.437732836615221, + "grad_norm": 0.2644897401332855, + "learning_rate": 4.5272446678193446e-08, + "loss": 0.1822, + "step": 12919 + }, + { + "epoch": 3.437998935604045, + "grad_norm": 0.32484307885169983, + "learning_rate": 4.525831261391906e-08, + "loss": 0.1846, + "step": 12920 + }, + { + "epoch": 3.4382650345928685, + "grad_norm": 0.25759509205818176, + "learning_rate": 4.5244180110988874e-08, + "loss": 0.1739, + "step": 12921 + }, + { + "epoch": 3.4385311335816926, + "grad_norm": 0.2857699692249298, + "learning_rate": 4.523004916980591e-08, + "loss": 0.1916, + "step": 12922 + }, + { + "epoch": 3.4387972325705163, + "grad_norm": 0.2672097980976105, + "learning_rate": 4.5215919790773274e-08, + "loss": 0.1715, + "step": 12923 + }, + { + "epoch": 3.43906333155934, + "grad_norm": 0.29139429330825806, + "learning_rate": 4.520179197429389e-08, + "loss": 0.1701, + "step": 12924 + }, + { + "epoch": 3.439329430548164, + "grad_norm": 0.37762710452079773, + "learning_rate": 4.5187665720770765e-08, + "loss": 0.1992, + "step": 12925 + }, + { + "epoch": 3.4395955295369878, + "grad_norm": 0.25971177220344543, + "learning_rate": 4.517354103060679e-08, + "loss": 0.1588, + "step": 12926 + }, + { + "epoch": 3.4398616285258115, + "grad_norm": 0.28981947898864746, + "learning_rate": 4.515941790420483e-08, + "loss": 0.1859, + "step": 12927 + }, + { + "epoch": 3.4401277275146356, + "grad_norm": 0.3455190062522888, + "learning_rate": 4.514529634196777e-08, + "loss": 0.197, + "step": 12928 + }, + { + "epoch": 3.4403938265034593, + "grad_norm": 0.3840045630931854, + "learning_rate": 4.513117634429827e-08, + "loss": 0.1762, + "step": 12929 + }, + { + "epoch": 3.440659925492283, + "grad_norm": 0.8047231435775757, + "learning_rate": 4.5117057911599166e-08, + "loss": 0.1873, + "step": 12930 + }, + { + "epoch": 3.440926024481107, + "grad_norm": 0.3757857382297516, + "learning_rate": 4.510294104427307e-08, + "loss": 0.1714, + "step": 12931 + }, + { + "epoch": 3.4411921234699308, + "grad_norm": 0.2786999046802521, + "learning_rate": 4.508882574272268e-08, + "loss": 0.1678, + "step": 12932 + }, + { + "epoch": 3.4414582224587544, + "grad_norm": 0.2785547077655792, + "learning_rate": 4.507471200735049e-08, + "loss": 0.165, + "step": 12933 + }, + { + "epoch": 3.4417243214475786, + "grad_norm": 0.3382943868637085, + "learning_rate": 4.50605998385592e-08, + "loss": 0.169, + "step": 12934 + }, + { + "epoch": 3.4419904204364022, + "grad_norm": 1.767042875289917, + "learning_rate": 4.504648923675121e-08, + "loss": 0.1881, + "step": 12935 + }, + { + "epoch": 3.4422565194252264, + "grad_norm": 0.3241838216781616, + "learning_rate": 4.503238020232906e-08, + "loss": 0.1762, + "step": 12936 + }, + { + "epoch": 3.44252261841405, + "grad_norm": 0.27065619826316833, + "learning_rate": 4.501827273569507e-08, + "loss": 0.1554, + "step": 12937 + }, + { + "epoch": 3.4427887174028737, + "grad_norm": 0.3638705611228943, + "learning_rate": 4.5004166837251726e-08, + "loss": 0.1821, + "step": 12938 + }, + { + "epoch": 3.443054816391698, + "grad_norm": 0.27926093339920044, + "learning_rate": 4.499006250740126e-08, + "loss": 0.1645, + "step": 12939 + }, + { + "epoch": 3.4433209153805215, + "grad_norm": 0.25963330268859863, + "learning_rate": 4.497595974654599e-08, + "loss": 0.1733, + "step": 12940 + }, + { + "epoch": 3.4435870143693452, + "grad_norm": 0.4356421232223511, + "learning_rate": 4.496185855508817e-08, + "loss": 0.1698, + "step": 12941 + }, + { + "epoch": 3.4438531133581693, + "grad_norm": 0.3729296028614044, + "learning_rate": 4.4947758933430025e-08, + "loss": 0.192, + "step": 12942 + }, + { + "epoch": 3.444119212346993, + "grad_norm": 0.5320226550102234, + "learning_rate": 4.493366088197362e-08, + "loss": 0.1958, + "step": 12943 + }, + { + "epoch": 3.444385311335817, + "grad_norm": 0.26879221200942993, + "learning_rate": 4.4919564401121116e-08, + "loss": 0.175, + "step": 12944 + }, + { + "epoch": 3.444651410324641, + "grad_norm": 0.2732158601284027, + "learning_rate": 4.4905469491274594e-08, + "loss": 0.1758, + "step": 12945 + }, + { + "epoch": 3.4449175093134645, + "grad_norm": 0.33412981033325195, + "learning_rate": 4.4891376152836e-08, + "loss": 0.1711, + "step": 12946 + }, + { + "epoch": 3.4451836083022886, + "grad_norm": 0.27784037590026855, + "learning_rate": 4.487728438620736e-08, + "loss": 0.1833, + "step": 12947 + }, + { + "epoch": 3.4454497072911123, + "grad_norm": 0.30813783407211304, + "learning_rate": 4.4863194191790565e-08, + "loss": 0.2033, + "step": 12948 + }, + { + "epoch": 3.445715806279936, + "grad_norm": 0.32646819949150085, + "learning_rate": 4.4849105569987565e-08, + "loss": 0.1836, + "step": 12949 + }, + { + "epoch": 3.44598190526876, + "grad_norm": 0.36014071106910706, + "learning_rate": 4.483501852120011e-08, + "loss": 0.1686, + "step": 12950 + }, + { + "epoch": 3.446248004257584, + "grad_norm": 0.352059006690979, + "learning_rate": 4.482093304583006e-08, + "loss": 0.1945, + "step": 12951 + }, + { + "epoch": 3.4465141032464075, + "grad_norm": 0.2869018018245697, + "learning_rate": 4.480684914427909e-08, + "loss": 0.1881, + "step": 12952 + }, + { + "epoch": 3.4467802022352316, + "grad_norm": 0.2814719080924988, + "learning_rate": 4.479276681694898e-08, + "loss": 0.1959, + "step": 12953 + }, + { + "epoch": 3.4470463012240553, + "grad_norm": 0.276609867811203, + "learning_rate": 4.4778686064241314e-08, + "loss": 0.1818, + "step": 12954 + }, + { + "epoch": 3.447312400212879, + "grad_norm": 0.28357142210006714, + "learning_rate": 4.476460688655772e-08, + "loss": 0.1715, + "step": 12955 + }, + { + "epoch": 3.447578499201703, + "grad_norm": 0.36018043756484985, + "learning_rate": 4.47505292842998e-08, + "loss": 0.1893, + "step": 12956 + }, + { + "epoch": 3.447844598190527, + "grad_norm": 0.3132588565349579, + "learning_rate": 4.4736453257869076e-08, + "loss": 0.1716, + "step": 12957 + }, + { + "epoch": 3.4481106971793505, + "grad_norm": 0.29067233204841614, + "learning_rate": 4.472237880766698e-08, + "loss": 0.1806, + "step": 12958 + }, + { + "epoch": 3.4483767961681746, + "grad_norm": 0.3354114890098572, + "learning_rate": 4.470830593409502e-08, + "loss": 0.1694, + "step": 12959 + }, + { + "epoch": 3.4486428951569983, + "grad_norm": 0.26339736580848694, + "learning_rate": 4.469423463755447e-08, + "loss": 0.1577, + "step": 12960 + }, + { + "epoch": 3.4489089941458224, + "grad_norm": 0.49963605403900146, + "learning_rate": 4.468016491844675e-08, + "loss": 0.1791, + "step": 12961 + }, + { + "epoch": 3.449175093134646, + "grad_norm": 0.2726241648197174, + "learning_rate": 4.466609677717313e-08, + "loss": 0.1817, + "step": 12962 + }, + { + "epoch": 3.44944119212347, + "grad_norm": 0.34048527479171753, + "learning_rate": 4.46520302141349e-08, + "loss": 0.1777, + "step": 12963 + }, + { + "epoch": 3.449707291112294, + "grad_norm": 0.3639064431190491, + "learning_rate": 4.463796522973325e-08, + "loss": 0.1867, + "step": 12964 + }, + { + "epoch": 3.4499733901011176, + "grad_norm": 0.2875669002532959, + "learning_rate": 4.462390182436931e-08, + "loss": 0.1723, + "step": 12965 + }, + { + "epoch": 3.4502394890899413, + "grad_norm": 0.4422943890094757, + "learning_rate": 4.4609839998444254e-08, + "loss": 0.1853, + "step": 12966 + }, + { + "epoch": 3.4505055880787654, + "grad_norm": 0.3608105480670929, + "learning_rate": 4.459577975235907e-08, + "loss": 0.1857, + "step": 12967 + }, + { + "epoch": 3.450771687067589, + "grad_norm": 0.2989121079444885, + "learning_rate": 4.458172108651489e-08, + "loss": 0.1768, + "step": 12968 + }, + { + "epoch": 3.451037786056413, + "grad_norm": 0.26678866147994995, + "learning_rate": 4.45676640013126e-08, + "loss": 0.1589, + "step": 12969 + }, + { + "epoch": 3.451303885045237, + "grad_norm": 0.27475228905677795, + "learning_rate": 4.4553608497153174e-08, + "loss": 0.1767, + "step": 12970 + }, + { + "epoch": 3.4515699840340606, + "grad_norm": 0.29436609148979187, + "learning_rate": 4.45395545744375e-08, + "loss": 0.181, + "step": 12971 + }, + { + "epoch": 3.4518360830228847, + "grad_norm": 0.2618655264377594, + "learning_rate": 4.452550223356647e-08, + "loss": 0.1666, + "step": 12972 + }, + { + "epoch": 3.4521021820117084, + "grad_norm": 0.28881826996803284, + "learning_rate": 4.4511451474940806e-08, + "loss": 0.1615, + "step": 12973 + }, + { + "epoch": 3.452368281000532, + "grad_norm": 0.2670910358428955, + "learning_rate": 4.449740229896135e-08, + "loss": 0.1724, + "step": 12974 + }, + { + "epoch": 3.452634379989356, + "grad_norm": 0.2784675061702728, + "learning_rate": 4.448335470602873e-08, + "loss": 0.1808, + "step": 12975 + }, + { + "epoch": 3.45290047897818, + "grad_norm": 0.2696954905986786, + "learning_rate": 4.446930869654364e-08, + "loss": 0.1726, + "step": 12976 + }, + { + "epoch": 3.4531665779670035, + "grad_norm": 0.36478638648986816, + "learning_rate": 4.4455264270906714e-08, + "loss": 0.1765, + "step": 12977 + }, + { + "epoch": 3.4534326769558277, + "grad_norm": 0.2762921452522278, + "learning_rate": 4.444122142951856e-08, + "loss": 0.1708, + "step": 12978 + }, + { + "epoch": 3.4536987759446514, + "grad_norm": 0.3156493306159973, + "learning_rate": 4.442718017277963e-08, + "loss": 0.1922, + "step": 12979 + }, + { + "epoch": 3.453964874933475, + "grad_norm": 0.2775108814239502, + "learning_rate": 4.44131405010905e-08, + "loss": 0.1752, + "step": 12980 + }, + { + "epoch": 3.454230973922299, + "grad_norm": 0.2604793608188629, + "learning_rate": 4.439910241485152e-08, + "loss": 0.1849, + "step": 12981 + }, + { + "epoch": 3.454497072911123, + "grad_norm": 0.2801094651222229, + "learning_rate": 4.438506591446312e-08, + "loss": 0.1735, + "step": 12982 + }, + { + "epoch": 3.454763171899947, + "grad_norm": 0.2928157150745392, + "learning_rate": 4.43710310003257e-08, + "loss": 0.1756, + "step": 12983 + }, + { + "epoch": 3.4550292708887707, + "grad_norm": 0.27772051095962524, + "learning_rate": 4.435699767283944e-08, + "loss": 0.1793, + "step": 12984 + }, + { + "epoch": 3.4552953698775943, + "grad_norm": 0.29459258913993835, + "learning_rate": 4.4342965932404744e-08, + "loss": 0.1756, + "step": 12985 + }, + { + "epoch": 3.4555614688664185, + "grad_norm": 0.3085252344608307, + "learning_rate": 4.432893577942173e-08, + "loss": 0.2031, + "step": 12986 + }, + { + "epoch": 3.455827567855242, + "grad_norm": 0.29286059737205505, + "learning_rate": 4.4314907214290644e-08, + "loss": 0.1716, + "step": 12987 + }, + { + "epoch": 3.456093666844066, + "grad_norm": 0.6231741905212402, + "learning_rate": 4.430088023741151e-08, + "loss": 0.1862, + "step": 12988 + }, + { + "epoch": 3.45635976583289, + "grad_norm": 0.26603803038597107, + "learning_rate": 4.4286854849184495e-08, + "loss": 0.1683, + "step": 12989 + }, + { + "epoch": 3.4566258648217136, + "grad_norm": 0.2910522520542145, + "learning_rate": 4.4272831050009564e-08, + "loss": 0.1842, + "step": 12990 + }, + { + "epoch": 3.4568919638105378, + "grad_norm": 0.2656049430370331, + "learning_rate": 4.425880884028673e-08, + "loss": 0.1684, + "step": 12991 + }, + { + "epoch": 3.4571580627993614, + "grad_norm": 0.27216583490371704, + "learning_rate": 4.4244788220415955e-08, + "loss": 0.168, + "step": 12992 + }, + { + "epoch": 3.457424161788185, + "grad_norm": 0.27440735697746277, + "learning_rate": 4.423076919079713e-08, + "loss": 0.1655, + "step": 12993 + }, + { + "epoch": 3.4576902607770093, + "grad_norm": 0.2637251615524292, + "learning_rate": 4.4216751751830085e-08, + "loss": 0.1694, + "step": 12994 + }, + { + "epoch": 3.457956359765833, + "grad_norm": 0.2723725736141205, + "learning_rate": 4.420273590391466e-08, + "loss": 0.1656, + "step": 12995 + }, + { + "epoch": 3.4582224587546566, + "grad_norm": 0.2661675214767456, + "learning_rate": 4.418872164745056e-08, + "loss": 0.1618, + "step": 12996 + }, + { + "epoch": 3.4584885577434807, + "grad_norm": 0.30695176124572754, + "learning_rate": 4.4174708982837564e-08, + "loss": 0.2005, + "step": 12997 + }, + { + "epoch": 3.4587546567323044, + "grad_norm": 0.37880024313926697, + "learning_rate": 4.416069791047524e-08, + "loss": 0.1774, + "step": 12998 + }, + { + "epoch": 3.459020755721128, + "grad_norm": 0.2865888178348541, + "learning_rate": 4.414668843076332e-08, + "loss": 0.1623, + "step": 12999 + }, + { + "epoch": 3.4592868547099522, + "grad_norm": 0.2414524406194687, + "learning_rate": 4.413268054410139e-08, + "loss": 0.1489, + "step": 13000 + }, + { + "epoch": 3.459552953698776, + "grad_norm": 0.27648016810417175, + "learning_rate": 4.411867425088889e-08, + "loss": 0.1655, + "step": 13001 + }, + { + "epoch": 3.4598190526875996, + "grad_norm": 0.26695460081100464, + "learning_rate": 4.41046695515254e-08, + "loss": 0.1656, + "step": 13002 + }, + { + "epoch": 3.4600851516764237, + "grad_norm": 0.3299916386604309, + "learning_rate": 4.409066644641027e-08, + "loss": 0.1679, + "step": 13003 + }, + { + "epoch": 3.4603512506652474, + "grad_norm": 0.3049582242965698, + "learning_rate": 4.4076664935942995e-08, + "loss": 0.1814, + "step": 13004 + }, + { + "epoch": 3.460617349654071, + "grad_norm": 0.25424468517303467, + "learning_rate": 4.406266502052284e-08, + "loss": 0.1664, + "step": 13005 + }, + { + "epoch": 3.460883448642895, + "grad_norm": 0.40718773007392883, + "learning_rate": 4.404866670054914e-08, + "loss": 0.1722, + "step": 13006 + }, + { + "epoch": 3.461149547631719, + "grad_norm": 0.28086790442466736, + "learning_rate": 4.403466997642118e-08, + "loss": 0.1888, + "step": 13007 + }, + { + "epoch": 3.461415646620543, + "grad_norm": 0.23999124765396118, + "learning_rate": 4.402067484853819e-08, + "loss": 0.1469, + "step": 13008 + }, + { + "epoch": 3.4616817456093667, + "grad_norm": 0.30231598019599915, + "learning_rate": 4.400668131729926e-08, + "loss": 0.202, + "step": 13009 + }, + { + "epoch": 3.4619478445981904, + "grad_norm": 0.2909952700138092, + "learning_rate": 4.399268938310361e-08, + "loss": 0.1841, + "step": 13010 + }, + { + "epoch": 3.4622139435870145, + "grad_norm": 0.30063799023628235, + "learning_rate": 4.397869904635023e-08, + "loss": 0.1895, + "step": 13011 + }, + { + "epoch": 3.462480042575838, + "grad_norm": 0.27668115496635437, + "learning_rate": 4.39647103074382e-08, + "loss": 0.1541, + "step": 13012 + }, + { + "epoch": 3.462746141564662, + "grad_norm": 0.5258405208587646, + "learning_rate": 4.39507231667665e-08, + "loss": 0.1747, + "step": 13013 + }, + { + "epoch": 3.463012240553486, + "grad_norm": 0.42331376671791077, + "learning_rate": 4.393673762473411e-08, + "loss": 0.1823, + "step": 13014 + }, + { + "epoch": 3.4632783395423097, + "grad_norm": 0.24246948957443237, + "learning_rate": 4.392275368173984e-08, + "loss": 0.1682, + "step": 13015 + }, + { + "epoch": 3.463544438531134, + "grad_norm": 0.3662540316581726, + "learning_rate": 4.390877133818264e-08, + "loss": 0.1971, + "step": 13016 + }, + { + "epoch": 3.4638105375199575, + "grad_norm": 0.29210782051086426, + "learning_rate": 4.389479059446123e-08, + "loss": 0.1754, + "step": 13017 + }, + { + "epoch": 3.464076636508781, + "grad_norm": 0.28568655252456665, + "learning_rate": 4.38808114509744e-08, + "loss": 0.1842, + "step": 13018 + }, + { + "epoch": 3.4643427354976053, + "grad_norm": 0.27261975407600403, + "learning_rate": 4.386683390812089e-08, + "loss": 0.1735, + "step": 13019 + }, + { + "epoch": 3.464608834486429, + "grad_norm": 0.28332507610321045, + "learning_rate": 4.385285796629933e-08, + "loss": 0.1802, + "step": 13020 + }, + { + "epoch": 3.4648749334752527, + "grad_norm": 0.2697288990020752, + "learning_rate": 4.383888362590834e-08, + "loss": 0.1905, + "step": 13021 + }, + { + "epoch": 3.465141032464077, + "grad_norm": 0.3957979679107666, + "learning_rate": 4.382491088734652e-08, + "loss": 0.1801, + "step": 13022 + }, + { + "epoch": 3.4654071314529005, + "grad_norm": 0.3431856632232666, + "learning_rate": 4.381093975101243e-08, + "loss": 0.1724, + "step": 13023 + }, + { + "epoch": 3.465673230441724, + "grad_norm": 0.3115249574184418, + "learning_rate": 4.3796970217304485e-08, + "loss": 0.1728, + "step": 13024 + }, + { + "epoch": 3.4659393294305483, + "grad_norm": 0.35222089290618896, + "learning_rate": 4.3783002286621195e-08, + "loss": 0.1968, + "step": 13025 + }, + { + "epoch": 3.466205428419372, + "grad_norm": 0.34993913769721985, + "learning_rate": 4.376903595936089e-08, + "loss": 0.1758, + "step": 13026 + }, + { + "epoch": 3.4664715274081956, + "grad_norm": 0.2621249258518219, + "learning_rate": 4.375507123592194e-08, + "loss": 0.1687, + "step": 13027 + }, + { + "epoch": 3.4667376263970198, + "grad_norm": 0.47918134927749634, + "learning_rate": 4.374110811670264e-08, + "loss": 0.1606, + "step": 13028 + }, + { + "epoch": 3.4670037253858434, + "grad_norm": 0.2660551369190216, + "learning_rate": 4.372714660210132e-08, + "loss": 0.1777, + "step": 13029 + }, + { + "epoch": 3.467269824374667, + "grad_norm": 0.2900592088699341, + "learning_rate": 4.371318669251608e-08, + "loss": 0.1844, + "step": 13030 + }, + { + "epoch": 3.4675359233634913, + "grad_norm": 0.3738481104373932, + "learning_rate": 4.369922838834518e-08, + "loss": 0.1809, + "step": 13031 + }, + { + "epoch": 3.467802022352315, + "grad_norm": 0.429065465927124, + "learning_rate": 4.368527168998665e-08, + "loss": 0.1835, + "step": 13032 + }, + { + "epoch": 3.468068121341139, + "grad_norm": 0.2779127061367035, + "learning_rate": 4.367131659783865e-08, + "loss": 0.1766, + "step": 13033 + }, + { + "epoch": 3.4683342203299627, + "grad_norm": 0.2637160122394562, + "learning_rate": 4.365736311229913e-08, + "loss": 0.1659, + "step": 13034 + }, + { + "epoch": 3.4686003193187864, + "grad_norm": 0.356760710477829, + "learning_rate": 4.3643411233766056e-08, + "loss": 0.1892, + "step": 13035 + }, + { + "epoch": 3.4688664183076106, + "grad_norm": 0.3619346618652344, + "learning_rate": 4.362946096263751e-08, + "loss": 0.1681, + "step": 13036 + }, + { + "epoch": 3.4691325172964342, + "grad_norm": 0.384545236825943, + "learning_rate": 4.361551229931124e-08, + "loss": 0.1982, + "step": 13037 + }, + { + "epoch": 3.4693986162852584, + "grad_norm": 0.31126725673675537, + "learning_rate": 4.360156524418518e-08, + "loss": 0.1871, + "step": 13038 + }, + { + "epoch": 3.469664715274082, + "grad_norm": 0.2558231055736542, + "learning_rate": 4.358761979765705e-08, + "loss": 0.1773, + "step": 13039 + }, + { + "epoch": 3.4699308142629057, + "grad_norm": 0.2732544541358948, + "learning_rate": 4.357367596012468e-08, + "loss": 0.1758, + "step": 13040 + }, + { + "epoch": 3.47019691325173, + "grad_norm": 0.35904210805892944, + "learning_rate": 4.355973373198568e-08, + "loss": 0.1771, + "step": 13041 + }, + { + "epoch": 3.4704630122405535, + "grad_norm": 0.26060938835144043, + "learning_rate": 4.354579311363778e-08, + "loss": 0.1644, + "step": 13042 + }, + { + "epoch": 3.470729111229377, + "grad_norm": 0.3268696367740631, + "learning_rate": 4.353185410547858e-08, + "loss": 0.1823, + "step": 13043 + }, + { + "epoch": 3.4709952102182013, + "grad_norm": 0.3572935163974762, + "learning_rate": 4.351791670790569e-08, + "loss": 0.1976, + "step": 13044 + }, + { + "epoch": 3.471261309207025, + "grad_norm": 0.35112643241882324, + "learning_rate": 4.350398092131655e-08, + "loss": 0.1932, + "step": 13045 + }, + { + "epoch": 3.4715274081958487, + "grad_norm": 0.3783906400203705, + "learning_rate": 4.349004674610871e-08, + "loss": 0.1622, + "step": 13046 + }, + { + "epoch": 3.471793507184673, + "grad_norm": 0.33030980825424194, + "learning_rate": 4.3476114182679544e-08, + "loss": 0.1784, + "step": 13047 + }, + { + "epoch": 3.4720596061734965, + "grad_norm": 0.2755788564682007, + "learning_rate": 4.34621832314265e-08, + "loss": 0.1664, + "step": 13048 + }, + { + "epoch": 3.47232570516232, + "grad_norm": 0.40332725644111633, + "learning_rate": 4.34482538927468e-08, + "loss": 0.1778, + "step": 13049 + }, + { + "epoch": 3.4725918041511443, + "grad_norm": 0.3918001353740692, + "learning_rate": 4.34343261670379e-08, + "loss": 0.1632, + "step": 13050 + }, + { + "epoch": 3.472857903139968, + "grad_norm": 0.35297465324401855, + "learning_rate": 4.3420400054696924e-08, + "loss": 0.1807, + "step": 13051 + }, + { + "epoch": 3.4731240021287917, + "grad_norm": 0.3975728452205658, + "learning_rate": 4.340647555612115e-08, + "loss": 0.2, + "step": 13052 + }, + { + "epoch": 3.473390101117616, + "grad_norm": 0.29683879017829895, + "learning_rate": 4.339255267170766e-08, + "loss": 0.1966, + "step": 13053 + }, + { + "epoch": 3.4736562001064395, + "grad_norm": 0.37645021080970764, + "learning_rate": 4.3378631401853605e-08, + "loss": 0.1769, + "step": 13054 + }, + { + "epoch": 3.4739222990952636, + "grad_norm": 0.27998653054237366, + "learning_rate": 4.336471174695607e-08, + "loss": 0.1635, + "step": 13055 + }, + { + "epoch": 3.4741883980840873, + "grad_norm": 0.3328683376312256, + "learning_rate": 4.335079370741201e-08, + "loss": 0.1718, + "step": 13056 + }, + { + "epoch": 3.474454497072911, + "grad_norm": 0.2932267189025879, + "learning_rate": 4.333687728361843e-08, + "loss": 0.1627, + "step": 13057 + }, + { + "epoch": 3.474720596061735, + "grad_norm": 0.2813403904438019, + "learning_rate": 4.3322962475972256e-08, + "loss": 0.1743, + "step": 13058 + }, + { + "epoch": 3.474986695050559, + "grad_norm": 0.3491304814815521, + "learning_rate": 4.33090492848704e-08, + "loss": 0.17, + "step": 13059 + }, + { + "epoch": 3.4752527940393825, + "grad_norm": 0.3612247109413147, + "learning_rate": 4.329513771070963e-08, + "loss": 0.2032, + "step": 13060 + }, + { + "epoch": 3.4755188930282066, + "grad_norm": 0.29409950971603394, + "learning_rate": 4.32812277538868e-08, + "loss": 0.193, + "step": 13061 + }, + { + "epoch": 3.4757849920170303, + "grad_norm": 0.4895284175872803, + "learning_rate": 4.3267319414798564e-08, + "loss": 0.1665, + "step": 13062 + }, + { + "epoch": 3.4760510910058544, + "grad_norm": 0.36912745237350464, + "learning_rate": 4.3253412693841675e-08, + "loss": 0.1898, + "step": 13063 + }, + { + "epoch": 3.476317189994678, + "grad_norm": 0.34364455938339233, + "learning_rate": 4.323950759141277e-08, + "loss": 0.1954, + "step": 13064 + }, + { + "epoch": 3.4765832889835018, + "grad_norm": 0.2852301299571991, + "learning_rate": 4.3225604107908476e-08, + "loss": 0.1669, + "step": 13065 + }, + { + "epoch": 3.476849387972326, + "grad_norm": 0.2684181034564972, + "learning_rate": 4.32117022437253e-08, + "loss": 0.1698, + "step": 13066 + }, + { + "epoch": 3.4771154869611496, + "grad_norm": 0.35502889752388, + "learning_rate": 4.319780199925981e-08, + "loss": 0.1943, + "step": 13067 + }, + { + "epoch": 3.4773815859499733, + "grad_norm": 0.28201010823249817, + "learning_rate": 4.318390337490839e-08, + "loss": 0.1795, + "step": 13068 + }, + { + "epoch": 3.4776476849387974, + "grad_norm": 0.3509341776371002, + "learning_rate": 4.317000637106755e-08, + "loss": 0.1658, + "step": 13069 + }, + { + "epoch": 3.477913783927621, + "grad_norm": 0.3082030713558197, + "learning_rate": 4.315611098813356e-08, + "loss": 0.1703, + "step": 13070 + }, + { + "epoch": 3.4781798829164448, + "grad_norm": 0.27101293206214905, + "learning_rate": 4.314221722650282e-08, + "loss": 0.1856, + "step": 13071 + }, + { + "epoch": 3.478445981905269, + "grad_norm": 0.30185821652412415, + "learning_rate": 4.312832508657157e-08, + "loss": 0.1855, + "step": 13072 + }, + { + "epoch": 3.4787120808940926, + "grad_norm": 0.26381808519363403, + "learning_rate": 4.3114434568736055e-08, + "loss": 0.1771, + "step": 13073 + }, + { + "epoch": 3.4789781798829162, + "grad_norm": 0.28663238883018494, + "learning_rate": 4.310054567339251e-08, + "loss": 0.1882, + "step": 13074 + }, + { + "epoch": 3.4792442788717404, + "grad_norm": 0.36307403445243835, + "learning_rate": 4.308665840093698e-08, + "loss": 0.1942, + "step": 13075 + }, + { + "epoch": 3.479510377860564, + "grad_norm": 0.273445725440979, + "learning_rate": 4.3072772751765654e-08, + "loss": 0.1817, + "step": 13076 + }, + { + "epoch": 3.4797764768493877, + "grad_norm": 0.26549407839775085, + "learning_rate": 4.305888872627449e-08, + "loss": 0.1584, + "step": 13077 + }, + { + "epoch": 3.480042575838212, + "grad_norm": 0.2762823700904846, + "learning_rate": 4.3045006324859523e-08, + "loss": 0.1772, + "step": 13078 + }, + { + "epoch": 3.4803086748270355, + "grad_norm": 0.26235514879226685, + "learning_rate": 4.3031125547916715e-08, + "loss": 0.1578, + "step": 13079 + }, + { + "epoch": 3.4805747738158597, + "grad_norm": 0.3495742380619049, + "learning_rate": 4.301724639584201e-08, + "loss": 0.1792, + "step": 13080 + }, + { + "epoch": 3.4808408728046834, + "grad_norm": 0.26654401421546936, + "learning_rate": 4.3003368869031196e-08, + "loss": 0.176, + "step": 13081 + }, + { + "epoch": 3.481106971793507, + "grad_norm": 0.27133435010910034, + "learning_rate": 4.298949296788017e-08, + "loss": 0.1723, + "step": 13082 + }, + { + "epoch": 3.481373070782331, + "grad_norm": 0.3721228241920471, + "learning_rate": 4.297561869278461e-08, + "loss": 0.192, + "step": 13083 + }, + { + "epoch": 3.481639169771155, + "grad_norm": 0.31452739238739014, + "learning_rate": 4.2961746044140326e-08, + "loss": 0.1843, + "step": 13084 + }, + { + "epoch": 3.4819052687599785, + "grad_norm": 0.41606569290161133, + "learning_rate": 4.294787502234291e-08, + "loss": 0.1877, + "step": 13085 + }, + { + "epoch": 3.4821713677488026, + "grad_norm": 0.2786877453327179, + "learning_rate": 4.293400562778804e-08, + "loss": 0.1609, + "step": 13086 + }, + { + "epoch": 3.4824374667376263, + "grad_norm": 0.27868548035621643, + "learning_rate": 4.2920137860871286e-08, + "loss": 0.1785, + "step": 13087 + }, + { + "epoch": 3.4827035657264505, + "grad_norm": 0.29239344596862793, + "learning_rate": 4.290627172198823e-08, + "loss": 0.1819, + "step": 13088 + }, + { + "epoch": 3.482969664715274, + "grad_norm": 0.2649122178554535, + "learning_rate": 4.289240721153429e-08, + "loss": 0.1602, + "step": 13089 + }, + { + "epoch": 3.483235763704098, + "grad_norm": 0.2941528856754303, + "learning_rate": 4.287854432990494e-08, + "loss": 0.1899, + "step": 13090 + }, + { + "epoch": 3.483501862692922, + "grad_norm": 0.3665289878845215, + "learning_rate": 4.286468307749562e-08, + "loss": 0.1737, + "step": 13091 + }, + { + "epoch": 3.4837679616817456, + "grad_norm": 0.3486267924308777, + "learning_rate": 4.285082345470161e-08, + "loss": 0.1917, + "step": 13092 + }, + { + "epoch": 3.4840340606705693, + "grad_norm": 0.36728495359420776, + "learning_rate": 4.283696546191825e-08, + "loss": 0.1778, + "step": 13093 + }, + { + "epoch": 3.4843001596593934, + "grad_norm": 0.3422747254371643, + "learning_rate": 4.28231090995408e-08, + "loss": 0.1917, + "step": 13094 + }, + { + "epoch": 3.484566258648217, + "grad_norm": 0.3037070333957672, + "learning_rate": 4.2809254367964484e-08, + "loss": 0.1657, + "step": 13095 + }, + { + "epoch": 3.484832357637041, + "grad_norm": 0.30877685546875, + "learning_rate": 4.279540126758443e-08, + "loss": 0.1919, + "step": 13096 + }, + { + "epoch": 3.485098456625865, + "grad_norm": 0.28414350748062134, + "learning_rate": 4.2781549798795815e-08, + "loss": 0.1819, + "step": 13097 + }, + { + "epoch": 3.4853645556146886, + "grad_norm": 0.3409813642501831, + "learning_rate": 4.276769996199363e-08, + "loss": 0.1949, + "step": 13098 + }, + { + "epoch": 3.4856306546035123, + "grad_norm": 0.4920339286327362, + "learning_rate": 4.275385175757299e-08, + "loss": 0.2094, + "step": 13099 + }, + { + "epoch": 3.4858967535923364, + "grad_norm": 0.3349542021751404, + "learning_rate": 4.274000518592874e-08, + "loss": 0.1794, + "step": 13100 + }, + { + "epoch": 3.48616285258116, + "grad_norm": 0.2790173888206482, + "learning_rate": 4.272616024745598e-08, + "loss": 0.1768, + "step": 13101 + }, + { + "epoch": 3.4864289515699842, + "grad_norm": 0.37809139490127563, + "learning_rate": 4.2712316942549485e-08, + "loss": 0.1805, + "step": 13102 + }, + { + "epoch": 3.486695050558808, + "grad_norm": 0.26385295391082764, + "learning_rate": 4.269847527160415e-08, + "loss": 0.1657, + "step": 13103 + }, + { + "epoch": 3.4869611495476316, + "grad_norm": 0.45000168681144714, + "learning_rate": 4.268463523501471e-08, + "loss": 0.1881, + "step": 13104 + }, + { + "epoch": 3.4872272485364557, + "grad_norm": 0.2918621003627777, + "learning_rate": 4.2670796833175984e-08, + "loss": 0.1815, + "step": 13105 + }, + { + "epoch": 3.4874933475252794, + "grad_norm": 0.3079089820384979, + "learning_rate": 4.2656960066482594e-08, + "loss": 0.1708, + "step": 13106 + }, + { + "epoch": 3.487759446514103, + "grad_norm": 0.27611681818962097, + "learning_rate": 4.2643124935329237e-08, + "loss": 0.1784, + "step": 13107 + }, + { + "epoch": 3.488025545502927, + "grad_norm": 0.3133215308189392, + "learning_rate": 4.262929144011049e-08, + "loss": 0.1774, + "step": 13108 + }, + { + "epoch": 3.488291644491751, + "grad_norm": 0.3092651665210724, + "learning_rate": 4.2615459581220944e-08, + "loss": 0.1808, + "step": 13109 + }, + { + "epoch": 3.488557743480575, + "grad_norm": 0.4269164204597473, + "learning_rate": 4.2601629359055146e-08, + "loss": 0.2038, + "step": 13110 + }, + { + "epoch": 3.4888238424693987, + "grad_norm": 0.32398518919944763, + "learning_rate": 4.258780077400748e-08, + "loss": 0.1739, + "step": 13111 + }, + { + "epoch": 3.4890899414582224, + "grad_norm": 0.45061931014060974, + "learning_rate": 4.257397382647243e-08, + "loss": 0.2101, + "step": 13112 + }, + { + "epoch": 3.4893560404470465, + "grad_norm": 0.3634260892868042, + "learning_rate": 4.256014851684431e-08, + "loss": 0.2033, + "step": 13113 + }, + { + "epoch": 3.48962213943587, + "grad_norm": 0.8173118233680725, + "learning_rate": 4.254632484551748e-08, + "loss": 0.1921, + "step": 13114 + }, + { + "epoch": 3.489888238424694, + "grad_norm": 0.33706167340278625, + "learning_rate": 4.253250281288622e-08, + "loss": 0.193, + "step": 13115 + }, + { + "epoch": 3.490154337413518, + "grad_norm": 0.2787320911884308, + "learning_rate": 4.2518682419344786e-08, + "loss": 0.1819, + "step": 13116 + }, + { + "epoch": 3.4904204364023417, + "grad_norm": 0.2809279263019562, + "learning_rate": 4.2504863665287306e-08, + "loss": 0.1891, + "step": 13117 + }, + { + "epoch": 3.4906865353911654, + "grad_norm": 0.30493637919425964, + "learning_rate": 4.249104655110799e-08, + "loss": 0.1747, + "step": 13118 + }, + { + "epoch": 3.4909526343799895, + "grad_norm": 0.293461412191391, + "learning_rate": 4.247723107720085e-08, + "loss": 0.1846, + "step": 13119 + }, + { + "epoch": 3.491218733368813, + "grad_norm": 0.2808054983615875, + "learning_rate": 4.246341724396e-08, + "loss": 0.1668, + "step": 13120 + }, + { + "epoch": 3.491484832357637, + "grad_norm": 0.28638094663619995, + "learning_rate": 4.244960505177938e-08, + "loss": 0.1762, + "step": 13121 + }, + { + "epoch": 3.491750931346461, + "grad_norm": 0.23350778222084045, + "learning_rate": 4.243579450105298e-08, + "loss": 0.1408, + "step": 13122 + }, + { + "epoch": 3.4920170303352847, + "grad_norm": 0.39921167492866516, + "learning_rate": 4.242198559217468e-08, + "loss": 0.1887, + "step": 13123 + }, + { + "epoch": 3.4922831293241083, + "grad_norm": 0.2675648629665375, + "learning_rate": 4.24081783255384e-08, + "loss": 0.1601, + "step": 13124 + }, + { + "epoch": 3.4925492283129325, + "grad_norm": 0.37943634390830994, + "learning_rate": 4.239437270153788e-08, + "loss": 0.1931, + "step": 13125 + }, + { + "epoch": 3.492815327301756, + "grad_norm": 0.4277937114238739, + "learning_rate": 4.2380568720566933e-08, + "loss": 0.1887, + "step": 13126 + }, + { + "epoch": 3.4930814262905803, + "grad_norm": 0.29577064514160156, + "learning_rate": 4.236676638301922e-08, + "loss": 0.1775, + "step": 13127 + }, + { + "epoch": 3.493347525279404, + "grad_norm": 0.3306826651096344, + "learning_rate": 4.2352965689288454e-08, + "loss": 0.178, + "step": 13128 + }, + { + "epoch": 3.4936136242682276, + "grad_norm": 0.31022021174430847, + "learning_rate": 4.233916663976825e-08, + "loss": 0.195, + "step": 13129 + }, + { + "epoch": 3.4938797232570518, + "grad_norm": 0.26375076174736023, + "learning_rate": 4.232536923485218e-08, + "loss": 0.1592, + "step": 13130 + }, + { + "epoch": 3.4941458222458754, + "grad_norm": 0.3389727473258972, + "learning_rate": 4.2311573474933823e-08, + "loss": 0.1806, + "step": 13131 + }, + { + "epoch": 3.494411921234699, + "grad_norm": 0.37490522861480713, + "learning_rate": 4.229777936040657e-08, + "loss": 0.1804, + "step": 13132 + }, + { + "epoch": 3.4946780202235233, + "grad_norm": 0.2755609452724457, + "learning_rate": 4.228398689166396e-08, + "loss": 0.1615, + "step": 13133 + }, + { + "epoch": 3.494944119212347, + "grad_norm": 0.29375457763671875, + "learning_rate": 4.2270196069099284e-08, + "loss": 0.1926, + "step": 13134 + }, + { + "epoch": 3.495210218201171, + "grad_norm": 0.31656312942504883, + "learning_rate": 4.2256406893105956e-08, + "loss": 0.1708, + "step": 13135 + }, + { + "epoch": 3.4954763171899947, + "grad_norm": 0.26934558153152466, + "learning_rate": 4.224261936407721e-08, + "loss": 0.185, + "step": 13136 + }, + { + "epoch": 3.4957424161788184, + "grad_norm": 0.3587225079536438, + "learning_rate": 4.2228833482406344e-08, + "loss": 0.1853, + "step": 13137 + }, + { + "epoch": 3.4960085151676425, + "grad_norm": 0.4160096049308777, + "learning_rate": 4.2215049248486535e-08, + "loss": 0.2052, + "step": 13138 + }, + { + "epoch": 3.4962746141564662, + "grad_norm": 0.43256285786628723, + "learning_rate": 4.2201266662710975e-08, + "loss": 0.1997, + "step": 13139 + }, + { + "epoch": 3.49654071314529, + "grad_norm": 0.2501198947429657, + "learning_rate": 4.218748572547271e-08, + "loss": 0.1646, + "step": 13140 + }, + { + "epoch": 3.496806812134114, + "grad_norm": 0.2813185453414917, + "learning_rate": 4.217370643716488e-08, + "loss": 0.1651, + "step": 13141 + }, + { + "epoch": 3.4970729111229377, + "grad_norm": 0.27074238657951355, + "learning_rate": 4.21599287981804e-08, + "loss": 0.165, + "step": 13142 + }, + { + "epoch": 3.4973390101117614, + "grad_norm": 0.27914249897003174, + "learning_rate": 4.21461528089123e-08, + "loss": 0.1783, + "step": 13143 + }, + { + "epoch": 3.4976051091005855, + "grad_norm": 0.27105841040611267, + "learning_rate": 4.213237846975347e-08, + "loss": 0.1702, + "step": 13144 + }, + { + "epoch": 3.497871208089409, + "grad_norm": 0.4745299816131592, + "learning_rate": 4.211860578109682e-08, + "loss": 0.1837, + "step": 13145 + }, + { + "epoch": 3.498137307078233, + "grad_norm": 0.27775734663009644, + "learning_rate": 4.210483474333517e-08, + "loss": 0.1786, + "step": 13146 + }, + { + "epoch": 3.498403406067057, + "grad_norm": 0.26965904235839844, + "learning_rate": 4.2091065356861254e-08, + "loss": 0.1613, + "step": 13147 + }, + { + "epoch": 3.4986695050558807, + "grad_norm": 0.2869226932525635, + "learning_rate": 4.2077297622067864e-08, + "loss": 0.176, + "step": 13148 + }, + { + "epoch": 3.4989356040447044, + "grad_norm": 0.37632086873054504, + "learning_rate": 4.206353153934762e-08, + "loss": 0.1862, + "step": 13149 + }, + { + "epoch": 3.4992017030335285, + "grad_norm": 0.31169193983078003, + "learning_rate": 4.204976710909319e-08, + "loss": 0.1982, + "step": 13150 + }, + { + "epoch": 3.499467802022352, + "grad_norm": 0.3354925513267517, + "learning_rate": 4.2036004331697164e-08, + "loss": 0.1789, + "step": 13151 + }, + { + "epoch": 3.4997339010111763, + "grad_norm": 0.28312626481056213, + "learning_rate": 4.202224320755211e-08, + "loss": 0.1773, + "step": 13152 + }, + { + "epoch": 3.5, + "grad_norm": 0.27361297607421875, + "learning_rate": 4.200848373705046e-08, + "loss": 0.1752, + "step": 13153 + }, + { + "epoch": 3.5002660989888237, + "grad_norm": 0.32807478308677673, + "learning_rate": 4.199472592058474e-08, + "loss": 0.1859, + "step": 13154 + }, + { + "epoch": 3.500532197977648, + "grad_norm": 0.3196621537208557, + "learning_rate": 4.198096975854727e-08, + "loss": 0.1756, + "step": 13155 + }, + { + "epoch": 3.5007982969664715, + "grad_norm": 0.5289877653121948, + "learning_rate": 4.196721525133048e-08, + "loss": 0.2036, + "step": 13156 + }, + { + "epoch": 3.5010643959552956, + "grad_norm": 0.396870493888855, + "learning_rate": 4.19534623993266e-08, + "loss": 0.1757, + "step": 13157 + }, + { + "epoch": 3.5013304949441193, + "grad_norm": 0.27960121631622314, + "learning_rate": 4.193971120292793e-08, + "loss": 0.1726, + "step": 13158 + }, + { + "epoch": 3.501596593932943, + "grad_norm": 0.24447627365589142, + "learning_rate": 4.192596166252669e-08, + "loss": 0.1538, + "step": 13159 + }, + { + "epoch": 3.501862692921767, + "grad_norm": 0.277701735496521, + "learning_rate": 4.191221377851506e-08, + "loss": 0.1684, + "step": 13160 + }, + { + "epoch": 3.502128791910591, + "grad_norm": 0.26909777522087097, + "learning_rate": 4.1898467551285096e-08, + "loss": 0.1687, + "step": 13161 + }, + { + "epoch": 3.5023948908994145, + "grad_norm": 0.3329528570175171, + "learning_rate": 4.188472298122895e-08, + "loss": 0.1747, + "step": 13162 + }, + { + "epoch": 3.5026609898882386, + "grad_norm": 0.29790061712265015, + "learning_rate": 4.187098006873856e-08, + "loss": 0.1693, + "step": 13163 + }, + { + "epoch": 3.5029270888770623, + "grad_norm": 0.2853566110134125, + "learning_rate": 4.185723881420594e-08, + "loss": 0.1748, + "step": 13164 + }, + { + "epoch": 3.503193187865886, + "grad_norm": 0.27100110054016113, + "learning_rate": 4.184349921802303e-08, + "loss": 0.1723, + "step": 13165 + }, + { + "epoch": 3.50345928685471, + "grad_norm": 0.260219007730484, + "learning_rate": 4.18297612805817e-08, + "loss": 0.166, + "step": 13166 + }, + { + "epoch": 3.5037253858435338, + "grad_norm": 0.2823922038078308, + "learning_rate": 4.1816025002273814e-08, + "loss": 0.1865, + "step": 13167 + }, + { + "epoch": 3.5039914848323575, + "grad_norm": 0.2757996916770935, + "learning_rate": 4.18022903834911e-08, + "loss": 0.1714, + "step": 13168 + }, + { + "epoch": 3.5042575838211816, + "grad_norm": 0.38844966888427734, + "learning_rate": 4.178855742462536e-08, + "loss": 0.1896, + "step": 13169 + }, + { + "epoch": 3.5045236828100053, + "grad_norm": 0.27012643218040466, + "learning_rate": 4.177482612606822e-08, + "loss": 0.1696, + "step": 13170 + }, + { + "epoch": 3.504789781798829, + "grad_norm": 0.2908465266227722, + "learning_rate": 4.176109648821139e-08, + "loss": 0.1649, + "step": 13171 + }, + { + "epoch": 3.505055880787653, + "grad_norm": 0.3045855462551117, + "learning_rate": 4.17473685114464e-08, + "loss": 0.1621, + "step": 13172 + }, + { + "epoch": 3.5053219797764767, + "grad_norm": 0.3327179253101349, + "learning_rate": 4.1733642196164843e-08, + "loss": 0.1833, + "step": 13173 + }, + { + "epoch": 3.5055880787653004, + "grad_norm": 0.3338451683521271, + "learning_rate": 4.171991754275821e-08, + "loss": 0.1897, + "step": 13174 + }, + { + "epoch": 3.5058541777541246, + "grad_norm": 0.2684738337993622, + "learning_rate": 4.170619455161799e-08, + "loss": 0.1718, + "step": 13175 + }, + { + "epoch": 3.5061202767429482, + "grad_norm": 0.29084500670433044, + "learning_rate": 4.1692473223135526e-08, + "loss": 0.1696, + "step": 13176 + }, + { + "epoch": 3.5063863757317724, + "grad_norm": 0.2731078565120697, + "learning_rate": 4.167875355770225e-08, + "loss": 0.1773, + "step": 13177 + }, + { + "epoch": 3.506652474720596, + "grad_norm": 0.29539400339126587, + "learning_rate": 4.16650355557094e-08, + "loss": 0.1739, + "step": 13178 + }, + { + "epoch": 3.50691857370942, + "grad_norm": 0.27482497692108154, + "learning_rate": 4.1651319217548275e-08, + "loss": 0.1706, + "step": 13179 + }, + { + "epoch": 3.507184672698244, + "grad_norm": 0.3693768084049225, + "learning_rate": 4.163760454361011e-08, + "loss": 0.1993, + "step": 13180 + }, + { + "epoch": 3.5074507716870675, + "grad_norm": 0.26554229855537415, + "learning_rate": 4.162389153428608e-08, + "loss": 0.1766, + "step": 13181 + }, + { + "epoch": 3.5077168706758917, + "grad_norm": 0.29934293031692505, + "learning_rate": 4.161018018996727e-08, + "loss": 0.1675, + "step": 13182 + }, + { + "epoch": 3.5079829696647153, + "grad_norm": 0.3707190454006195, + "learning_rate": 4.159647051104476e-08, + "loss": 0.1775, + "step": 13183 + }, + { + "epoch": 3.508249068653539, + "grad_norm": 0.46099787950515747, + "learning_rate": 4.158276249790964e-08, + "loss": 0.1846, + "step": 13184 + }, + { + "epoch": 3.508515167642363, + "grad_norm": 0.32543739676475525, + "learning_rate": 4.1569056150952797e-08, + "loss": 0.1714, + "step": 13185 + }, + { + "epoch": 3.508781266631187, + "grad_norm": 0.2744888663291931, + "learning_rate": 4.155535147056525e-08, + "loss": 0.178, + "step": 13186 + }, + { + "epoch": 3.5090473656200105, + "grad_norm": 0.4759913384914398, + "learning_rate": 4.154164845713777e-08, + "loss": 0.1768, + "step": 13187 + }, + { + "epoch": 3.5093134646088346, + "grad_norm": 0.33379384875297546, + "learning_rate": 4.1527947111061345e-08, + "loss": 0.1866, + "step": 13188 + }, + { + "epoch": 3.5095795635976583, + "grad_norm": 0.3668862283229828, + "learning_rate": 4.151424743272666e-08, + "loss": 0.1927, + "step": 13189 + }, + { + "epoch": 3.509845662586482, + "grad_norm": 0.2698590159416199, + "learning_rate": 4.150054942252451e-08, + "loss": 0.163, + "step": 13190 + }, + { + "epoch": 3.510111761575306, + "grad_norm": 0.3185855448246002, + "learning_rate": 4.148685308084554e-08, + "loss": 0.1655, + "step": 13191 + }, + { + "epoch": 3.51037786056413, + "grad_norm": 0.33140829205513, + "learning_rate": 4.147315840808046e-08, + "loss": 0.1753, + "step": 13192 + }, + { + "epoch": 3.5106439595529535, + "grad_norm": 0.2938186526298523, + "learning_rate": 4.1459465404619784e-08, + "loss": 0.1746, + "step": 13193 + }, + { + "epoch": 3.5109100585417776, + "grad_norm": 0.2698246240615845, + "learning_rate": 4.1445774070854134e-08, + "loss": 0.1691, + "step": 13194 + }, + { + "epoch": 3.5111761575306013, + "grad_norm": 0.325025737285614, + "learning_rate": 4.143208440717398e-08, + "loss": 0.1715, + "step": 13195 + }, + { + "epoch": 3.511442256519425, + "grad_norm": 0.3382161259651184, + "learning_rate": 4.141839641396984e-08, + "loss": 0.1831, + "step": 13196 + }, + { + "epoch": 3.511708355508249, + "grad_norm": 0.2807219922542572, + "learning_rate": 4.1404710091632043e-08, + "loss": 0.1739, + "step": 13197 + }, + { + "epoch": 3.511974454497073, + "grad_norm": 0.2883906960487366, + "learning_rate": 4.139102544055102e-08, + "loss": 0.1857, + "step": 13198 + }, + { + "epoch": 3.5122405534858965, + "grad_norm": 0.2701641321182251, + "learning_rate": 4.1377342461117004e-08, + "loss": 0.1697, + "step": 13199 + }, + { + "epoch": 3.5125066524747206, + "grad_norm": 0.34954383969306946, + "learning_rate": 4.136366115372032e-08, + "loss": 0.1954, + "step": 13200 + }, + { + "epoch": 3.5127727514635443, + "grad_norm": 0.30970317125320435, + "learning_rate": 4.1349981518751174e-08, + "loss": 0.1753, + "step": 13201 + }, + { + "epoch": 3.5130388504523684, + "grad_norm": 0.39648377895355225, + "learning_rate": 4.133630355659973e-08, + "loss": 0.1663, + "step": 13202 + }, + { + "epoch": 3.513304949441192, + "grad_norm": 0.2922108471393585, + "learning_rate": 4.132262726765616e-08, + "loss": 0.1873, + "step": 13203 + }, + { + "epoch": 3.513571048430016, + "grad_norm": 0.3120129704475403, + "learning_rate": 4.1308952652310456e-08, + "loss": 0.1676, + "step": 13204 + }, + { + "epoch": 3.51383714741884, + "grad_norm": 0.2575710415840149, + "learning_rate": 4.1295279710952737e-08, + "loss": 0.1803, + "step": 13205 + }, + { + "epoch": 3.5141032464076636, + "grad_norm": 0.320766419172287, + "learning_rate": 4.1281608443972894e-08, + "loss": 0.1788, + "step": 13206 + }, + { + "epoch": 3.5143693453964877, + "grad_norm": 0.29238417744636536, + "learning_rate": 4.126793885176093e-08, + "loss": 0.1717, + "step": 13207 + }, + { + "epoch": 3.5146354443853114, + "grad_norm": 0.3426893949508667, + "learning_rate": 4.125427093470667e-08, + "loss": 0.1768, + "step": 13208 + }, + { + "epoch": 3.514901543374135, + "grad_norm": 0.28766465187072754, + "learning_rate": 4.1240604693199976e-08, + "loss": 0.1901, + "step": 13209 + }, + { + "epoch": 3.515167642362959, + "grad_norm": 0.3014630377292633, + "learning_rate": 4.1226940127630637e-08, + "loss": 0.1871, + "step": 13210 + }, + { + "epoch": 3.515433741351783, + "grad_norm": 0.4021039307117462, + "learning_rate": 4.121327723838844e-08, + "loss": 0.1911, + "step": 13211 + }, + { + "epoch": 3.5156998403406066, + "grad_norm": 0.3508133292198181, + "learning_rate": 4.1199616025863e-08, + "loss": 0.1831, + "step": 13212 + }, + { + "epoch": 3.5159659393294307, + "grad_norm": 0.2905958592891693, + "learning_rate": 4.118595649044404e-08, + "loss": 0.1899, + "step": 13213 + }, + { + "epoch": 3.5162320383182544, + "grad_norm": 0.3274346590042114, + "learning_rate": 4.1172298632521075e-08, + "loss": 0.1805, + "step": 13214 + }, + { + "epoch": 3.516498137307078, + "grad_norm": 0.3290729224681854, + "learning_rate": 4.1158642452483693e-08, + "loss": 0.1792, + "step": 13215 + }, + { + "epoch": 3.516764236295902, + "grad_norm": 0.278256356716156, + "learning_rate": 4.11449879507214e-08, + "loss": 0.1685, + "step": 13216 + }, + { + "epoch": 3.517030335284726, + "grad_norm": 0.28106606006622314, + "learning_rate": 4.113133512762369e-08, + "loss": 0.1721, + "step": 13217 + }, + { + "epoch": 3.5172964342735495, + "grad_norm": 0.4359264075756073, + "learning_rate": 4.1117683983579886e-08, + "loss": 0.2073, + "step": 13218 + }, + { + "epoch": 3.5175625332623737, + "grad_norm": 0.25899970531463623, + "learning_rate": 4.11040345189794e-08, + "loss": 0.1617, + "step": 13219 + }, + { + "epoch": 3.5178286322511974, + "grad_norm": 0.25203830003738403, + "learning_rate": 4.109038673421157e-08, + "loss": 0.1519, + "step": 13220 + }, + { + "epoch": 3.518094731240021, + "grad_norm": 0.26765698194503784, + "learning_rate": 4.107674062966557e-08, + "loss": 0.1665, + "step": 13221 + }, + { + "epoch": 3.518360830228845, + "grad_norm": 0.36640188097953796, + "learning_rate": 4.106309620573071e-08, + "loss": 0.1861, + "step": 13222 + }, + { + "epoch": 3.518626929217669, + "grad_norm": 0.2868354320526123, + "learning_rate": 4.104945346279608e-08, + "loss": 0.1813, + "step": 13223 + }, + { + "epoch": 3.518893028206493, + "grad_norm": 0.3216298520565033, + "learning_rate": 4.103581240125083e-08, + "loss": 0.2117, + "step": 13224 + }, + { + "epoch": 3.5191591271953166, + "grad_norm": 0.27806875109672546, + "learning_rate": 4.102217302148402e-08, + "loss": 0.1825, + "step": 13225 + }, + { + "epoch": 3.5194252261841408, + "grad_norm": 0.25022071599960327, + "learning_rate": 4.100853532388472e-08, + "loss": 0.1673, + "step": 13226 + }, + { + "epoch": 3.5196913251729645, + "grad_norm": 0.27745023369789124, + "learning_rate": 4.0994899308841846e-08, + "loss": 0.1825, + "step": 13227 + }, + { + "epoch": 3.519957424161788, + "grad_norm": 0.30347341299057007, + "learning_rate": 4.098126497674437e-08, + "loss": 0.1906, + "step": 13228 + }, + { + "epoch": 3.5202235231506123, + "grad_norm": 0.2682192623615265, + "learning_rate": 4.096763232798112e-08, + "loss": 0.1571, + "step": 13229 + }, + { + "epoch": 3.520489622139436, + "grad_norm": 0.2733428180217743, + "learning_rate": 4.095400136294096e-08, + "loss": 0.1606, + "step": 13230 + }, + { + "epoch": 3.5207557211282596, + "grad_norm": 0.3167898952960968, + "learning_rate": 4.094037208201266e-08, + "loss": 0.2176, + "step": 13231 + }, + { + "epoch": 3.5210218201170838, + "grad_norm": 0.3393464684486389, + "learning_rate": 4.092674448558501e-08, + "loss": 0.1662, + "step": 13232 + }, + { + "epoch": 3.5212879191059074, + "grad_norm": 0.27312996983528137, + "learning_rate": 4.09131185740466e-08, + "loss": 0.1692, + "step": 13233 + }, + { + "epoch": 3.521554018094731, + "grad_norm": 0.7174855470657349, + "learning_rate": 4.0899494347786156e-08, + "loss": 0.18, + "step": 13234 + }, + { + "epoch": 3.5218201170835552, + "grad_norm": 0.2655056416988373, + "learning_rate": 4.0885871807192205e-08, + "loss": 0.1682, + "step": 13235 + }, + { + "epoch": 3.522086216072379, + "grad_norm": 0.3864474892616272, + "learning_rate": 4.087225095265331e-08, + "loss": 0.1782, + "step": 13236 + }, + { + "epoch": 3.5223523150612026, + "grad_norm": 0.33046406507492065, + "learning_rate": 4.085863178455801e-08, + "loss": 0.1822, + "step": 13237 + }, + { + "epoch": 3.5226184140500267, + "grad_norm": 0.3479418456554413, + "learning_rate": 4.0845014303294636e-08, + "loss": 0.1904, + "step": 13238 + }, + { + "epoch": 3.5228845130388504, + "grad_norm": 0.28504154086112976, + "learning_rate": 4.083139850925174e-08, + "loss": 0.1621, + "step": 13239 + }, + { + "epoch": 3.523150612027674, + "grad_norm": 0.2922061085700989, + "learning_rate": 4.0817784402817555e-08, + "loss": 0.1705, + "step": 13240 + }, + { + "epoch": 3.5234167110164982, + "grad_norm": 0.35490885376930237, + "learning_rate": 4.080417198438046e-08, + "loss": 0.1797, + "step": 13241 + }, + { + "epoch": 3.523682810005322, + "grad_norm": 0.3123433291912079, + "learning_rate": 4.079056125432864e-08, + "loss": 0.1824, + "step": 13242 + }, + { + "epoch": 3.5239489089941456, + "grad_norm": 0.26593589782714844, + "learning_rate": 4.0776952213050365e-08, + "loss": 0.1716, + "step": 13243 + }, + { + "epoch": 3.5242150079829697, + "grad_norm": 0.26805099844932556, + "learning_rate": 4.0763344860933714e-08, + "loss": 0.1731, + "step": 13244 + }, + { + "epoch": 3.5244811069717934, + "grad_norm": 0.2671414911746979, + "learning_rate": 4.074973919836685e-08, + "loss": 0.1533, + "step": 13245 + }, + { + "epoch": 3.524747205960617, + "grad_norm": 0.4176866412162781, + "learning_rate": 4.0736135225737836e-08, + "loss": 0.2148, + "step": 13246 + }, + { + "epoch": 3.525013304949441, + "grad_norm": 0.28677427768707275, + "learning_rate": 4.0722532943434697e-08, + "loss": 0.1665, + "step": 13247 + }, + { + "epoch": 3.525279403938265, + "grad_norm": 0.2992790639400482, + "learning_rate": 4.0708932351845346e-08, + "loss": 0.17, + "step": 13248 + }, + { + "epoch": 3.525545502927089, + "grad_norm": 0.3826652765274048, + "learning_rate": 4.069533345135777e-08, + "loss": 0.1913, + "step": 13249 + }, + { + "epoch": 3.5258116019159127, + "grad_norm": 0.2700654864311218, + "learning_rate": 4.068173624235975e-08, + "loss": 0.1829, + "step": 13250 + }, + { + "epoch": 3.526077700904737, + "grad_norm": 0.4012110233306885, + "learning_rate": 4.06681407252392e-08, + "loss": 0.2027, + "step": 13251 + }, + { + "epoch": 3.5263437998935605, + "grad_norm": 0.28440240025520325, + "learning_rate": 4.065454690038377e-08, + "loss": 0.2016, + "step": 13252 + }, + { + "epoch": 3.526609898882384, + "grad_norm": 0.2873629033565521, + "learning_rate": 4.0640954768181325e-08, + "loss": 0.16, + "step": 13253 + }, + { + "epoch": 3.5268759978712083, + "grad_norm": 0.260504812002182, + "learning_rate": 4.062736432901944e-08, + "loss": 0.1741, + "step": 13254 + }, + { + "epoch": 3.527142096860032, + "grad_norm": 0.27543890476226807, + "learning_rate": 4.061377558328577e-08, + "loss": 0.1604, + "step": 13255 + }, + { + "epoch": 3.5274081958488557, + "grad_norm": 0.27759429812431335, + "learning_rate": 4.060018853136794e-08, + "loss": 0.1751, + "step": 13256 + }, + { + "epoch": 3.52767429483768, + "grad_norm": 0.2825740575790405, + "learning_rate": 4.058660317365341e-08, + "loss": 0.1765, + "step": 13257 + }, + { + "epoch": 3.5279403938265035, + "grad_norm": 0.3398466110229492, + "learning_rate": 4.057301951052972e-08, + "loss": 0.1994, + "step": 13258 + }, + { + "epoch": 3.528206492815327, + "grad_norm": 0.2788383662700653, + "learning_rate": 4.0559437542384237e-08, + "loss": 0.1824, + "step": 13259 + }, + { + "epoch": 3.5284725918041513, + "grad_norm": 0.29141125082969666, + "learning_rate": 4.054585726960439e-08, + "loss": 0.1612, + "step": 13260 + }, + { + "epoch": 3.528738690792975, + "grad_norm": 0.294861376285553, + "learning_rate": 4.0532278692577506e-08, + "loss": 0.183, + "step": 13261 + }, + { + "epoch": 3.5290047897817987, + "grad_norm": 0.3620225787162781, + "learning_rate": 4.0518701811690915e-08, + "loss": 0.178, + "step": 13262 + }, + { + "epoch": 3.529270888770623, + "grad_norm": 0.2768942713737488, + "learning_rate": 4.050512662733179e-08, + "loss": 0.1661, + "step": 13263 + }, + { + "epoch": 3.5295369877594465, + "grad_norm": 0.27373191714286804, + "learning_rate": 4.049155313988738e-08, + "loss": 0.1764, + "step": 13264 + }, + { + "epoch": 3.52980308674827, + "grad_norm": 0.4151915907859802, + "learning_rate": 4.047798134974477e-08, + "loss": 0.1971, + "step": 13265 + }, + { + "epoch": 3.5300691857370943, + "grad_norm": 0.27816930413246155, + "learning_rate": 4.046441125729109e-08, + "loss": 0.1648, + "step": 13266 + }, + { + "epoch": 3.530335284725918, + "grad_norm": 0.3329716920852661, + "learning_rate": 4.045084286291337e-08, + "loss": 0.1823, + "step": 13267 + }, + { + "epoch": 3.5306013837147416, + "grad_norm": 0.34658896923065186, + "learning_rate": 4.043727616699867e-08, + "loss": 0.1753, + "step": 13268 + }, + { + "epoch": 3.5308674827035658, + "grad_norm": 0.29942408204078674, + "learning_rate": 4.042371116993385e-08, + "loss": 0.1889, + "step": 13269 + }, + { + "epoch": 3.5311335816923894, + "grad_norm": 0.32082387804985046, + "learning_rate": 4.041014787210588e-08, + "loss": 0.2013, + "step": 13270 + }, + { + "epoch": 3.5313996806812136, + "grad_norm": 0.2690185010433197, + "learning_rate": 4.0396586273901565e-08, + "loss": 0.1586, + "step": 13271 + }, + { + "epoch": 3.5316657796700373, + "grad_norm": 0.2952505648136139, + "learning_rate": 4.0383026375707764e-08, + "loss": 0.1861, + "step": 13272 + }, + { + "epoch": 3.531931878658861, + "grad_norm": 0.2708589732646942, + "learning_rate": 4.036946817791115e-08, + "loss": 0.1709, + "step": 13273 + }, + { + "epoch": 3.532197977647685, + "grad_norm": 0.3740946352481842, + "learning_rate": 4.035591168089848e-08, + "loss": 0.16, + "step": 13274 + }, + { + "epoch": 3.5324640766365087, + "grad_norm": 0.28394782543182373, + "learning_rate": 4.0342356885056414e-08, + "loss": 0.1721, + "step": 13275 + }, + { + "epoch": 3.532730175625333, + "grad_norm": 0.2683465778827667, + "learning_rate": 4.032880379077157e-08, + "loss": 0.1751, + "step": 13276 + }, + { + "epoch": 3.5329962746141566, + "grad_norm": 0.28297629952430725, + "learning_rate": 4.0315252398430536e-08, + "loss": 0.1785, + "step": 13277 + }, + { + "epoch": 3.5332623736029802, + "grad_norm": 0.2913488447666168, + "learning_rate": 4.030170270841974e-08, + "loss": 0.1897, + "step": 13278 + }, + { + "epoch": 3.5335284725918044, + "grad_norm": 0.48100054264068604, + "learning_rate": 4.0288154721125734e-08, + "loss": 0.1969, + "step": 13279 + }, + { + "epoch": 3.533794571580628, + "grad_norm": 0.2839756906032562, + "learning_rate": 4.027460843693487e-08, + "loss": 0.1712, + "step": 13280 + }, + { + "epoch": 3.5340606705694517, + "grad_norm": 0.26576095819473267, + "learning_rate": 4.026106385623352e-08, + "loss": 0.1763, + "step": 13281 + }, + { + "epoch": 3.534326769558276, + "grad_norm": 0.2658696472644806, + "learning_rate": 4.024752097940805e-08, + "loss": 0.156, + "step": 13282 + }, + { + "epoch": 3.5345928685470995, + "grad_norm": 0.26840972900390625, + "learning_rate": 4.023397980684472e-08, + "loss": 0.1713, + "step": 13283 + }, + { + "epoch": 3.534858967535923, + "grad_norm": 0.28042832016944885, + "learning_rate": 4.0220440338929706e-08, + "loss": 0.1773, + "step": 13284 + }, + { + "epoch": 3.5351250665247473, + "grad_norm": 0.28338390588760376, + "learning_rate": 4.020690257604925e-08, + "loss": 0.1879, + "step": 13285 + }, + { + "epoch": 3.535391165513571, + "grad_norm": 0.29741814732551575, + "learning_rate": 4.01933665185894e-08, + "loss": 0.1775, + "step": 13286 + }, + { + "epoch": 3.5356572645023947, + "grad_norm": 0.3030729293823242, + "learning_rate": 4.017983216693631e-08, + "loss": 0.1717, + "step": 13287 + }, + { + "epoch": 3.535923363491219, + "grad_norm": 0.3355304002761841, + "learning_rate": 4.016629952147592e-08, + "loss": 0.1743, + "step": 13288 + }, + { + "epoch": 3.5361894624800425, + "grad_norm": 0.2629300653934479, + "learning_rate": 4.0152768582594265e-08, + "loss": 0.1667, + "step": 13289 + }, + { + "epoch": 3.536455561468866, + "grad_norm": 0.32200315594673157, + "learning_rate": 4.0139239350677265e-08, + "loss": 0.1874, + "step": 13290 + }, + { + "epoch": 3.5367216604576903, + "grad_norm": 0.2649589776992798, + "learning_rate": 4.0125711826110805e-08, + "loss": 0.1859, + "step": 13291 + }, + { + "epoch": 3.536987759446514, + "grad_norm": 0.2980673313140869, + "learning_rate": 4.0112186009280743e-08, + "loss": 0.1832, + "step": 13292 + }, + { + "epoch": 3.5372538584353377, + "grad_norm": 0.26307716965675354, + "learning_rate": 4.00986619005728e-08, + "loss": 0.1692, + "step": 13293 + }, + { + "epoch": 3.537519957424162, + "grad_norm": 0.3429239094257355, + "learning_rate": 4.0085139500372775e-08, + "loss": 0.1835, + "step": 13294 + }, + { + "epoch": 3.5377860564129855, + "grad_norm": 0.26757147908210754, + "learning_rate": 4.00716188090663e-08, + "loss": 0.1593, + "step": 13295 + }, + { + "epoch": 3.5380521554018096, + "grad_norm": 0.28493544459342957, + "learning_rate": 4.005809982703904e-08, + "loss": 0.1717, + "step": 13296 + }, + { + "epoch": 3.5383182543906333, + "grad_norm": 0.2738594710826874, + "learning_rate": 4.0044582554676566e-08, + "loss": 0.1692, + "step": 13297 + }, + { + "epoch": 3.5385843533794574, + "grad_norm": 0.4225657880306244, + "learning_rate": 4.003106699236447e-08, + "loss": 0.1964, + "step": 13298 + }, + { + "epoch": 3.538850452368281, + "grad_norm": 0.3590537905693054, + "learning_rate": 4.001755314048818e-08, + "loss": 0.1807, + "step": 13299 + }, + { + "epoch": 3.539116551357105, + "grad_norm": 0.40336334705352783, + "learning_rate": 4.00040409994332e-08, + "loss": 0.1847, + "step": 13300 + }, + { + "epoch": 3.539382650345929, + "grad_norm": 0.4332916736602783, + "learning_rate": 3.9990530569584843e-08, + "loss": 0.1876, + "step": 13301 + }, + { + "epoch": 3.5396487493347526, + "grad_norm": 0.28305783867836, + "learning_rate": 3.997702185132854e-08, + "loss": 0.1803, + "step": 13302 + }, + { + "epoch": 3.5399148483235763, + "grad_norm": 0.3727007210254669, + "learning_rate": 3.996351484504947e-08, + "loss": 0.172, + "step": 13303 + }, + { + "epoch": 3.5401809473124004, + "grad_norm": 0.26111507415771484, + "learning_rate": 3.9950009551133035e-08, + "loss": 0.1764, + "step": 13304 + }, + { + "epoch": 3.540447046301224, + "grad_norm": 0.3737199008464813, + "learning_rate": 3.993650596996432e-08, + "loss": 0.1819, + "step": 13305 + }, + { + "epoch": 3.5407131452900478, + "grad_norm": 0.24990852177143097, + "learning_rate": 3.992300410192855e-08, + "loss": 0.1639, + "step": 13306 + }, + { + "epoch": 3.540979244278872, + "grad_norm": 0.280340313911438, + "learning_rate": 3.990950394741074e-08, + "loss": 0.1667, + "step": 13307 + }, + { + "epoch": 3.5412453432676956, + "grad_norm": 0.3689453899860382, + "learning_rate": 3.9896005506796025e-08, + "loss": 0.166, + "step": 13308 + }, + { + "epoch": 3.5415114422565193, + "grad_norm": 0.2728375196456909, + "learning_rate": 3.9882508780469335e-08, + "loss": 0.1741, + "step": 13309 + }, + { + "epoch": 3.5417775412453434, + "grad_norm": 0.5910345911979675, + "learning_rate": 3.9869013768815654e-08, + "loss": 0.1799, + "step": 13310 + }, + { + "epoch": 3.542043640234167, + "grad_norm": 0.2958889901638031, + "learning_rate": 3.98555204722199e-08, + "loss": 0.1551, + "step": 13311 + }, + { + "epoch": 3.5423097392229907, + "grad_norm": 0.2814343273639679, + "learning_rate": 3.984202889106692e-08, + "loss": 0.1812, + "step": 13312 + }, + { + "epoch": 3.542575838211815, + "grad_norm": 0.38664740324020386, + "learning_rate": 3.982853902574156e-08, + "loss": 0.1868, + "step": 13313 + }, + { + "epoch": 3.5428419372006386, + "grad_norm": 0.3388063311576843, + "learning_rate": 3.9815050876628496e-08, + "loss": 0.1757, + "step": 13314 + }, + { + "epoch": 3.5431080361894622, + "grad_norm": 0.2605532109737396, + "learning_rate": 3.9801564444112524e-08, + "loss": 0.1759, + "step": 13315 + }, + { + "epoch": 3.5433741351782864, + "grad_norm": 0.36977121233940125, + "learning_rate": 3.978807972857823e-08, + "loss": 0.1829, + "step": 13316 + }, + { + "epoch": 3.54364023416711, + "grad_norm": 0.3373427987098694, + "learning_rate": 3.977459673041026e-08, + "loss": 0.1809, + "step": 13317 + }, + { + "epoch": 3.5439063331559337, + "grad_norm": 0.29795655608177185, + "learning_rate": 3.976111544999316e-08, + "loss": 0.1957, + "step": 13318 + }, + { + "epoch": 3.544172432144758, + "grad_norm": 0.313996285200119, + "learning_rate": 3.974763588771152e-08, + "loss": 0.1752, + "step": 13319 + }, + { + "epoch": 3.5444385311335815, + "grad_norm": 0.41450366377830505, + "learning_rate": 3.9734158043949695e-08, + "loss": 0.1912, + "step": 13320 + }, + { + "epoch": 3.5447046301224057, + "grad_norm": 0.3862978219985962, + "learning_rate": 3.972068191909218e-08, + "loss": 0.176, + "step": 13321 + }, + { + "epoch": 3.5449707291112293, + "grad_norm": 0.37107476592063904, + "learning_rate": 3.9707207513523276e-08, + "loss": 0.1982, + "step": 13322 + }, + { + "epoch": 3.5452368281000535, + "grad_norm": 0.2608698606491089, + "learning_rate": 3.969373482762737e-08, + "loss": 0.1712, + "step": 13323 + }, + { + "epoch": 3.545502927088877, + "grad_norm": 0.2686064839363098, + "learning_rate": 3.968026386178867e-08, + "loss": 0.174, + "step": 13324 + }, + { + "epoch": 3.545769026077701, + "grad_norm": 0.36657169461250305, + "learning_rate": 3.966679461639141e-08, + "loss": 0.1839, + "step": 13325 + }, + { + "epoch": 3.546035125066525, + "grad_norm": 0.3198825418949127, + "learning_rate": 3.965332709181977e-08, + "loss": 0.1736, + "step": 13326 + }, + { + "epoch": 3.5463012240553486, + "grad_norm": 0.3002726137638092, + "learning_rate": 3.963986128845791e-08, + "loss": 0.1709, + "step": 13327 + }, + { + "epoch": 3.5465673230441723, + "grad_norm": 0.37032264471054077, + "learning_rate": 3.962639720668982e-08, + "loss": 0.1823, + "step": 13328 + }, + { + "epoch": 3.5468334220329965, + "grad_norm": 0.3447522521018982, + "learning_rate": 3.9612934846899583e-08, + "loss": 0.1973, + "step": 13329 + }, + { + "epoch": 3.54709952102182, + "grad_norm": 0.46538954973220825, + "learning_rate": 3.959947420947117e-08, + "loss": 0.1862, + "step": 13330 + }, + { + "epoch": 3.547365620010644, + "grad_norm": 0.28201282024383545, + "learning_rate": 3.958601529478848e-08, + "loss": 0.1652, + "step": 13331 + }, + { + "epoch": 3.547631718999468, + "grad_norm": 0.33145585656166077, + "learning_rate": 3.957255810323539e-08, + "loss": 0.1971, + "step": 13332 + }, + { + "epoch": 3.5478978179882916, + "grad_norm": 0.24798281490802765, + "learning_rate": 3.955910263519574e-08, + "loss": 0.162, + "step": 13333 + }, + { + "epoch": 3.5481639169771153, + "grad_norm": 0.28436368703842163, + "learning_rate": 3.954564889105333e-08, + "loss": 0.1899, + "step": 13334 + }, + { + "epoch": 3.5484300159659394, + "grad_norm": 0.2626729905605316, + "learning_rate": 3.953219687119184e-08, + "loss": 0.1816, + "step": 13335 + }, + { + "epoch": 3.548696114954763, + "grad_norm": 0.269878089427948, + "learning_rate": 3.9518746575995e-08, + "loss": 0.1817, + "step": 13336 + }, + { + "epoch": 3.548962213943587, + "grad_norm": 0.2682357430458069, + "learning_rate": 3.950529800584638e-08, + "loss": 0.1637, + "step": 13337 + }, + { + "epoch": 3.549228312932411, + "grad_norm": 0.42871248722076416, + "learning_rate": 3.949185116112963e-08, + "loss": 0.205, + "step": 13338 + }, + { + "epoch": 3.5494944119212346, + "grad_norm": 0.3458351194858551, + "learning_rate": 3.9478406042228194e-08, + "loss": 0.1668, + "step": 13339 + }, + { + "epoch": 3.5497605109100583, + "grad_norm": 0.26814550161361694, + "learning_rate": 3.946496264952563e-08, + "loss": 0.1683, + "step": 13340 + }, + { + "epoch": 3.5500266098988824, + "grad_norm": 0.36037299036979675, + "learning_rate": 3.945152098340532e-08, + "loss": 0.1899, + "step": 13341 + }, + { + "epoch": 3.550292708887706, + "grad_norm": 0.29317429661750793, + "learning_rate": 3.9438081044250717e-08, + "loss": 0.156, + "step": 13342 + }, + { + "epoch": 3.55055880787653, + "grad_norm": 0.31591707468032837, + "learning_rate": 3.9424642832445064e-08, + "loss": 0.1547, + "step": 13343 + }, + { + "epoch": 3.550824906865354, + "grad_norm": 0.28757551312446594, + "learning_rate": 3.941120634837174e-08, + "loss": 0.1679, + "step": 13344 + }, + { + "epoch": 3.551091005854178, + "grad_norm": 0.2903643250465393, + "learning_rate": 3.93977715924139e-08, + "loss": 0.1791, + "step": 13345 + }, + { + "epoch": 3.5513571048430017, + "grad_norm": 0.33943766355514526, + "learning_rate": 3.9384338564954756e-08, + "loss": 0.1737, + "step": 13346 + }, + { + "epoch": 3.5516232038318254, + "grad_norm": 0.29921650886535645, + "learning_rate": 3.937090726637745e-08, + "loss": 0.2, + "step": 13347 + }, + { + "epoch": 3.5518893028206495, + "grad_norm": 0.3138186037540436, + "learning_rate": 3.935747769706508e-08, + "loss": 0.1814, + "step": 13348 + }, + { + "epoch": 3.552155401809473, + "grad_norm": 0.302925169467926, + "learning_rate": 3.934404985740072e-08, + "loss": 0.1885, + "step": 13349 + }, + { + "epoch": 3.552421500798297, + "grad_norm": 0.2721126675605774, + "learning_rate": 3.933062374776727e-08, + "loss": 0.1704, + "step": 13350 + }, + { + "epoch": 3.552687599787121, + "grad_norm": 0.3624158501625061, + "learning_rate": 3.931719936854775e-08, + "loss": 0.1856, + "step": 13351 + }, + { + "epoch": 3.5529536987759447, + "grad_norm": 0.27295541763305664, + "learning_rate": 3.930377672012499e-08, + "loss": 0.166, + "step": 13352 + }, + { + "epoch": 3.5532197977647684, + "grad_norm": 0.35745683312416077, + "learning_rate": 3.929035580288189e-08, + "loss": 0.1859, + "step": 13353 + }, + { + "epoch": 3.5534858967535925, + "grad_norm": 0.3554359972476959, + "learning_rate": 3.927693661720114e-08, + "loss": 0.197, + "step": 13354 + }, + { + "epoch": 3.553751995742416, + "grad_norm": 0.4387836158275604, + "learning_rate": 3.926351916346563e-08, + "loss": 0.2055, + "step": 13355 + }, + { + "epoch": 3.55401809473124, + "grad_norm": 0.3385910391807556, + "learning_rate": 3.9250103442057926e-08, + "loss": 0.1652, + "step": 13356 + }, + { + "epoch": 3.554284193720064, + "grad_norm": 0.36615869402885437, + "learning_rate": 3.923668945336077e-08, + "loss": 0.1595, + "step": 13357 + }, + { + "epoch": 3.5545502927088877, + "grad_norm": 0.38896167278289795, + "learning_rate": 3.922327719775667e-08, + "loss": 0.1799, + "step": 13358 + }, + { + "epoch": 3.5548163916977114, + "grad_norm": 0.2635389268398285, + "learning_rate": 3.920986667562824e-08, + "loss": 0.1602, + "step": 13359 + }, + { + "epoch": 3.5550824906865355, + "grad_norm": 0.2983193099498749, + "learning_rate": 3.9196457887357915e-08, + "loss": 0.1772, + "step": 13360 + }, + { + "epoch": 3.555348589675359, + "grad_norm": 0.27931302785873413, + "learning_rate": 3.9183050833328156e-08, + "loss": 0.1687, + "step": 13361 + }, + { + "epoch": 3.555614688664183, + "grad_norm": 0.2710933983325958, + "learning_rate": 3.916964551392139e-08, + "loss": 0.1576, + "step": 13362 + }, + { + "epoch": 3.555880787653007, + "grad_norm": 0.27769121527671814, + "learning_rate": 3.915624192951997e-08, + "loss": 0.1738, + "step": 13363 + }, + { + "epoch": 3.5561468866418307, + "grad_norm": 0.39045533537864685, + "learning_rate": 3.9142840080506144e-08, + "loss": 0.1966, + "step": 13364 + }, + { + "epoch": 3.5564129856306543, + "grad_norm": 0.28407052159309387, + "learning_rate": 3.912943996726218e-08, + "loss": 0.1438, + "step": 13365 + }, + { + "epoch": 3.5566790846194785, + "grad_norm": 0.2735726535320282, + "learning_rate": 3.9116041590170326e-08, + "loss": 0.1646, + "step": 13366 + }, + { + "epoch": 3.556945183608302, + "grad_norm": 0.337920218706131, + "learning_rate": 3.910264494961264e-08, + "loss": 0.1644, + "step": 13367 + }, + { + "epoch": 3.5572112825971263, + "grad_norm": 0.2661888003349304, + "learning_rate": 3.908925004597128e-08, + "loss": 0.1697, + "step": 13368 + }, + { + "epoch": 3.55747738158595, + "grad_norm": 0.2727803587913513, + "learning_rate": 3.907585687962829e-08, + "loss": 0.1862, + "step": 13369 + }, + { + "epoch": 3.557743480574774, + "grad_norm": 0.2820306718349457, + "learning_rate": 3.906246545096569e-08, + "loss": 0.1701, + "step": 13370 + }, + { + "epoch": 3.5580095795635978, + "grad_norm": 0.28446313738822937, + "learning_rate": 3.904907576036537e-08, + "loss": 0.1984, + "step": 13371 + }, + { + "epoch": 3.5582756785524214, + "grad_norm": 0.29647356271743774, + "learning_rate": 3.903568780820931e-08, + "loss": 0.1822, + "step": 13372 + }, + { + "epoch": 3.5585417775412456, + "grad_norm": 0.2530519962310791, + "learning_rate": 3.902230159487928e-08, + "loss": 0.1594, + "step": 13373 + }, + { + "epoch": 3.5588078765300692, + "grad_norm": 0.26832741498947144, + "learning_rate": 3.900891712075716e-08, + "loss": 0.1696, + "step": 13374 + }, + { + "epoch": 3.559073975518893, + "grad_norm": 0.39288634061813354, + "learning_rate": 3.899553438622463e-08, + "loss": 0.1888, + "step": 13375 + }, + { + "epoch": 3.559340074507717, + "grad_norm": 0.3445385694503784, + "learning_rate": 3.8982153391663406e-08, + "loss": 0.1743, + "step": 13376 + }, + { + "epoch": 3.5596061734965407, + "grad_norm": 0.2940770089626312, + "learning_rate": 3.896877413745519e-08, + "loss": 0.1828, + "step": 13377 + }, + { + "epoch": 3.5598722724853644, + "grad_norm": 0.2710168659687042, + "learning_rate": 3.895539662398157e-08, + "loss": 0.1632, + "step": 13378 + }, + { + "epoch": 3.5601383714741885, + "grad_norm": 0.33972877264022827, + "learning_rate": 3.894202085162407e-08, + "loss": 0.1746, + "step": 13379 + }, + { + "epoch": 3.5604044704630122, + "grad_norm": 0.31764519214630127, + "learning_rate": 3.8928646820764236e-08, + "loss": 0.1627, + "step": 13380 + }, + { + "epoch": 3.560670569451836, + "grad_norm": 0.39650413393974304, + "learning_rate": 3.891527453178346e-08, + "loss": 0.1826, + "step": 13381 + }, + { + "epoch": 3.56093666844066, + "grad_norm": 0.33681774139404297, + "learning_rate": 3.890190398506319e-08, + "loss": 0.1639, + "step": 13382 + }, + { + "epoch": 3.5612027674294837, + "grad_norm": 0.2732979953289032, + "learning_rate": 3.8888535180984774e-08, + "loss": 0.1636, + "step": 13383 + }, + { + "epoch": 3.5614688664183074, + "grad_norm": 0.26629137992858887, + "learning_rate": 3.887516811992951e-08, + "loss": 0.1742, + "step": 13384 + }, + { + "epoch": 3.5617349654071315, + "grad_norm": 0.29239746928215027, + "learning_rate": 3.886180280227872e-08, + "loss": 0.1733, + "step": 13385 + }, + { + "epoch": 3.562001064395955, + "grad_norm": 0.26691943407058716, + "learning_rate": 3.88484392284135e-08, + "loss": 0.1796, + "step": 13386 + }, + { + "epoch": 3.562267163384779, + "grad_norm": 0.29310059547424316, + "learning_rate": 3.88350773987151e-08, + "loss": 0.1818, + "step": 13387 + }, + { + "epoch": 3.562533262373603, + "grad_norm": 0.3421136736869812, + "learning_rate": 3.882171731356455e-08, + "loss": 0.1865, + "step": 13388 + }, + { + "epoch": 3.5627993613624267, + "grad_norm": 0.41065290570259094, + "learning_rate": 3.880835897334298e-08, + "loss": 0.1898, + "step": 13389 + }, + { + "epoch": 3.563065460351251, + "grad_norm": 0.38337215781211853, + "learning_rate": 3.879500237843132e-08, + "loss": 0.1737, + "step": 13390 + }, + { + "epoch": 3.5633315593400745, + "grad_norm": 0.39746373891830444, + "learning_rate": 3.878164752921056e-08, + "loss": 0.1723, + "step": 13391 + }, + { + "epoch": 3.563597658328898, + "grad_norm": 0.27615031599998474, + "learning_rate": 3.876829442606163e-08, + "loss": 0.1761, + "step": 13392 + }, + { + "epoch": 3.5638637573177223, + "grad_norm": 0.3131340742111206, + "learning_rate": 3.87549430693654e-08, + "loss": 0.192, + "step": 13393 + }, + { + "epoch": 3.564129856306546, + "grad_norm": 0.2835581302642822, + "learning_rate": 3.8741593459502605e-08, + "loss": 0.1864, + "step": 13394 + }, + { + "epoch": 3.56439595529537, + "grad_norm": 0.27696162462234497, + "learning_rate": 3.872824559685409e-08, + "loss": 0.1876, + "step": 13395 + }, + { + "epoch": 3.564662054284194, + "grad_norm": 0.40407848358154297, + "learning_rate": 3.871489948180049e-08, + "loss": 0.1855, + "step": 13396 + }, + { + "epoch": 3.5649281532730175, + "grad_norm": 0.29612472653388977, + "learning_rate": 3.87015551147225e-08, + "loss": 0.1822, + "step": 13397 + }, + { + "epoch": 3.5651942522618416, + "grad_norm": 0.27032095193862915, + "learning_rate": 3.8688212496000706e-08, + "loss": 0.1742, + "step": 13398 + }, + { + "epoch": 3.5654603512506653, + "grad_norm": 0.316928893327713, + "learning_rate": 3.867487162601572e-08, + "loss": 0.1961, + "step": 13399 + }, + { + "epoch": 3.565726450239489, + "grad_norm": 0.32039621472358704, + "learning_rate": 3.8661532505147986e-08, + "loss": 0.1803, + "step": 13400 + }, + { + "epoch": 3.565992549228313, + "grad_norm": 0.27144166827201843, + "learning_rate": 3.8648195133778e-08, + "loss": 0.1866, + "step": 13401 + }, + { + "epoch": 3.566258648217137, + "grad_norm": 0.46337026357650757, + "learning_rate": 3.8634859512286175e-08, + "loss": 0.1804, + "step": 13402 + }, + { + "epoch": 3.5665247472059605, + "grad_norm": 0.2778942584991455, + "learning_rate": 3.862152564105283e-08, + "loss": 0.1674, + "step": 13403 + }, + { + "epoch": 3.5667908461947846, + "grad_norm": 0.36456727981567383, + "learning_rate": 3.860819352045831e-08, + "loss": 0.1835, + "step": 13404 + }, + { + "epoch": 3.5670569451836083, + "grad_norm": 0.27352234721183777, + "learning_rate": 3.859486315088286e-08, + "loss": 0.1733, + "step": 13405 + }, + { + "epoch": 3.567323044172432, + "grad_norm": 0.3381568193435669, + "learning_rate": 3.858153453270673e-08, + "loss": 0.169, + "step": 13406 + }, + { + "epoch": 3.567589143161256, + "grad_norm": 0.3119112551212311, + "learning_rate": 3.8568207666310006e-08, + "loss": 0.1868, + "step": 13407 + }, + { + "epoch": 3.5678552421500798, + "grad_norm": 0.33911648392677307, + "learning_rate": 3.8554882552072867e-08, + "loss": 0.1726, + "step": 13408 + }, + { + "epoch": 3.5681213411389034, + "grad_norm": 0.2995038628578186, + "learning_rate": 3.8541559190375304e-08, + "loss": 0.1949, + "step": 13409 + }, + { + "epoch": 3.5683874401277276, + "grad_norm": 0.34465110301971436, + "learning_rate": 3.85282375815974e-08, + "loss": 0.1829, + "step": 13410 + }, + { + "epoch": 3.5686535391165513, + "grad_norm": 0.29518213868141174, + "learning_rate": 3.851491772611905e-08, + "loss": 0.1757, + "step": 13411 + }, + { + "epoch": 3.568919638105375, + "grad_norm": 0.28891536593437195, + "learning_rate": 3.850159962432018e-08, + "loss": 0.1822, + "step": 13412 + }, + { + "epoch": 3.569185737094199, + "grad_norm": 0.2860948145389557, + "learning_rate": 3.848828327658066e-08, + "loss": 0.1795, + "step": 13413 + }, + { + "epoch": 3.5694518360830227, + "grad_norm": 0.25759461522102356, + "learning_rate": 3.847496868328033e-08, + "loss": 0.1811, + "step": 13414 + }, + { + "epoch": 3.569717935071847, + "grad_norm": 0.2974488437175751, + "learning_rate": 3.846165584479889e-08, + "loss": 0.192, + "step": 13415 + }, + { + "epoch": 3.5699840340606706, + "grad_norm": 0.27091342210769653, + "learning_rate": 3.8448344761516115e-08, + "loss": 0.1833, + "step": 13416 + }, + { + "epoch": 3.5702501330494947, + "grad_norm": 0.3682830035686493, + "learning_rate": 3.843503543381159e-08, + "loss": 0.1788, + "step": 13417 + }, + { + "epoch": 3.5705162320383184, + "grad_norm": 0.40550288558006287, + "learning_rate": 3.842172786206496e-08, + "loss": 0.1721, + "step": 13418 + }, + { + "epoch": 3.570782331027142, + "grad_norm": 0.35230791568756104, + "learning_rate": 3.840842204665578e-08, + "loss": 0.1726, + "step": 13419 + }, + { + "epoch": 3.571048430015966, + "grad_norm": 0.3263658881187439, + "learning_rate": 3.839511798796357e-08, + "loss": 0.1782, + "step": 13420 + }, + { + "epoch": 3.57131452900479, + "grad_norm": 0.3024901747703552, + "learning_rate": 3.83818156863678e-08, + "loss": 0.182, + "step": 13421 + }, + { + "epoch": 3.5715806279936135, + "grad_norm": 0.2655559182167053, + "learning_rate": 3.836851514224784e-08, + "loss": 0.165, + "step": 13422 + }, + { + "epoch": 3.5718467269824377, + "grad_norm": 0.30754485726356506, + "learning_rate": 3.83552163559831e-08, + "loss": 0.1768, + "step": 13423 + }, + { + "epoch": 3.5721128259712613, + "grad_norm": 0.34226176142692566, + "learning_rate": 3.834191932795283e-08, + "loss": 0.1771, + "step": 13424 + }, + { + "epoch": 3.572378924960085, + "grad_norm": 0.39905914664268494, + "learning_rate": 3.8328624058536354e-08, + "loss": 0.1827, + "step": 13425 + }, + { + "epoch": 3.572645023948909, + "grad_norm": 0.2646041512489319, + "learning_rate": 3.831533054811279e-08, + "loss": 0.1644, + "step": 13426 + }, + { + "epoch": 3.572911122937733, + "grad_norm": 0.356772243976593, + "learning_rate": 3.8302038797061365e-08, + "loss": 0.1986, + "step": 13427 + }, + { + "epoch": 3.5731772219265565, + "grad_norm": 0.2820441722869873, + "learning_rate": 3.828874880576116e-08, + "loss": 0.1651, + "step": 13428 + }, + { + "epoch": 3.5734433209153806, + "grad_norm": 0.28206080198287964, + "learning_rate": 3.8275460574591276e-08, + "loss": 0.1847, + "step": 13429 + }, + { + "epoch": 3.5737094199042043, + "grad_norm": 0.3176414668560028, + "learning_rate": 3.8262174103930656e-08, + "loss": 0.1684, + "step": 13430 + }, + { + "epoch": 3.573975518893028, + "grad_norm": 0.3948129713535309, + "learning_rate": 3.8248889394158326e-08, + "loss": 0.1866, + "step": 13431 + }, + { + "epoch": 3.574241617881852, + "grad_norm": 0.5753121376037598, + "learning_rate": 3.823560644565311e-08, + "loss": 0.1764, + "step": 13432 + }, + { + "epoch": 3.574507716870676, + "grad_norm": 0.28105178475379944, + "learning_rate": 3.822232525879392e-08, + "loss": 0.1969, + "step": 13433 + }, + { + "epoch": 3.5747738158594995, + "grad_norm": 0.270515114068985, + "learning_rate": 3.820904583395954e-08, + "loss": 0.1716, + "step": 13434 + }, + { + "epoch": 3.5750399148483236, + "grad_norm": 0.2885766625404358, + "learning_rate": 3.819576817152878e-08, + "loss": 0.1822, + "step": 13435 + }, + { + "epoch": 3.5753060138371473, + "grad_norm": 0.3529072701931, + "learning_rate": 3.8182492271880273e-08, + "loss": 0.1921, + "step": 13436 + }, + { + "epoch": 3.5755721128259714, + "grad_norm": 0.3860267698764801, + "learning_rate": 3.81692181353927e-08, + "loss": 0.2048, + "step": 13437 + }, + { + "epoch": 3.575838211814795, + "grad_norm": 0.31697213649749756, + "learning_rate": 3.815594576244471e-08, + "loss": 0.1612, + "step": 13438 + }, + { + "epoch": 3.576104310803619, + "grad_norm": 0.4339614510536194, + "learning_rate": 3.8142675153414785e-08, + "loss": 0.1828, + "step": 13439 + }, + { + "epoch": 3.576370409792443, + "grad_norm": 0.26828306913375854, + "learning_rate": 3.81294063086815e-08, + "loss": 0.1559, + "step": 13440 + }, + { + "epoch": 3.5766365087812666, + "grad_norm": 0.4560030698776245, + "learning_rate": 3.81161392286232e-08, + "loss": 0.1807, + "step": 13441 + }, + { + "epoch": 3.5769026077700907, + "grad_norm": 0.27357015013694763, + "learning_rate": 3.8102873913618435e-08, + "loss": 0.1615, + "step": 13442 + }, + { + "epoch": 3.5771687067589144, + "grad_norm": 0.2735452353954315, + "learning_rate": 3.808961036404546e-08, + "loss": 0.1664, + "step": 13443 + }, + { + "epoch": 3.577434805747738, + "grad_norm": 0.2842042148113251, + "learning_rate": 3.8076348580282626e-08, + "loss": 0.1817, + "step": 13444 + }, + { + "epoch": 3.577700904736562, + "grad_norm": 0.44723498821258545, + "learning_rate": 3.806308856270814e-08, + "loss": 0.1714, + "step": 13445 + }, + { + "epoch": 3.577967003725386, + "grad_norm": 0.3151944875717163, + "learning_rate": 3.804983031170026e-08, + "loss": 0.1843, + "step": 13446 + }, + { + "epoch": 3.5782331027142096, + "grad_norm": 0.3340018093585968, + "learning_rate": 3.8036573827637075e-08, + "loss": 0.1846, + "step": 13447 + }, + { + "epoch": 3.5784992017030337, + "grad_norm": 0.35128363966941833, + "learning_rate": 3.8023319110896725e-08, + "loss": 0.1792, + "step": 13448 + }, + { + "epoch": 3.5787653006918574, + "grad_norm": 0.2702801823616028, + "learning_rate": 3.8010066161857245e-08, + "loss": 0.1678, + "step": 13449 + }, + { + "epoch": 3.579031399680681, + "grad_norm": 0.2758261561393738, + "learning_rate": 3.79968149808967e-08, + "loss": 0.1703, + "step": 13450 + }, + { + "epoch": 3.579297498669505, + "grad_norm": 0.28649234771728516, + "learning_rate": 3.798356556839294e-08, + "loss": 0.1809, + "step": 13451 + }, + { + "epoch": 3.579563597658329, + "grad_norm": 0.2818647623062134, + "learning_rate": 3.797031792472395e-08, + "loss": 0.18, + "step": 13452 + }, + { + "epoch": 3.5798296966471526, + "grad_norm": 0.28615376353263855, + "learning_rate": 3.795707205026751e-08, + "loss": 0.1683, + "step": 13453 + }, + { + "epoch": 3.5800957956359767, + "grad_norm": 0.2867152988910675, + "learning_rate": 3.7943827945401484e-08, + "loss": 0.1808, + "step": 13454 + }, + { + "epoch": 3.5803618946248004, + "grad_norm": 0.31845927238464355, + "learning_rate": 3.793058561050353e-08, + "loss": 0.1658, + "step": 13455 + }, + { + "epoch": 3.580627993613624, + "grad_norm": 0.2697734236717224, + "learning_rate": 3.7917345045951434e-08, + "loss": 0.1695, + "step": 13456 + }, + { + "epoch": 3.580894092602448, + "grad_norm": 0.3532976806163788, + "learning_rate": 3.7904106252122845e-08, + "loss": 0.1896, + "step": 13457 + }, + { + "epoch": 3.581160191591272, + "grad_norm": 0.4746008813381195, + "learning_rate": 3.7890869229395295e-08, + "loss": 0.1949, + "step": 13458 + }, + { + "epoch": 3.5814262905800955, + "grad_norm": 0.26267263293266296, + "learning_rate": 3.7877633978146404e-08, + "loss": 0.1733, + "step": 13459 + }, + { + "epoch": 3.5816923895689197, + "grad_norm": 0.26251858472824097, + "learning_rate": 3.78644004987536e-08, + "loss": 0.1622, + "step": 13460 + }, + { + "epoch": 3.5819584885577433, + "grad_norm": 0.3360283672809601, + "learning_rate": 3.785116879159439e-08, + "loss": 0.1613, + "step": 13461 + }, + { + "epoch": 3.5822245875465675, + "grad_norm": 0.2753472924232483, + "learning_rate": 3.783793885704611e-08, + "loss": 0.168, + "step": 13462 + }, + { + "epoch": 3.582490686535391, + "grad_norm": 0.37281617522239685, + "learning_rate": 3.782471069548613e-08, + "loss": 0.1775, + "step": 13463 + }, + { + "epoch": 3.5827567855242153, + "grad_norm": 0.406770259141922, + "learning_rate": 3.7811484307291743e-08, + "loss": 0.1971, + "step": 13464 + }, + { + "epoch": 3.583022884513039, + "grad_norm": 0.3813169300556183, + "learning_rate": 3.779825969284024e-08, + "loss": 0.193, + "step": 13465 + }, + { + "epoch": 3.5832889835018626, + "grad_norm": 0.3441554009914398, + "learning_rate": 3.778503685250872e-08, + "loss": 0.1621, + "step": 13466 + }, + { + "epoch": 3.5835550824906868, + "grad_norm": 0.605865478515625, + "learning_rate": 3.777181578667443e-08, + "loss": 0.2015, + "step": 13467 + }, + { + "epoch": 3.5838211814795105, + "grad_norm": 0.39121749997138977, + "learning_rate": 3.775859649571436e-08, + "loss": 0.1718, + "step": 13468 + }, + { + "epoch": 3.584087280468334, + "grad_norm": 0.2742752730846405, + "learning_rate": 3.7745378980005605e-08, + "loss": 0.1698, + "step": 13469 + }, + { + "epoch": 3.5843533794571583, + "grad_norm": 0.26999419927597046, + "learning_rate": 3.773216323992514e-08, + "loss": 0.1693, + "step": 13470 + }, + { + "epoch": 3.584619478445982, + "grad_norm": 0.2901706397533417, + "learning_rate": 3.771894927584995e-08, + "loss": 0.1886, + "step": 13471 + }, + { + "epoch": 3.5848855774348056, + "grad_norm": 0.4875889718532562, + "learning_rate": 3.7705737088156866e-08, + "loss": 0.1805, + "step": 13472 + }, + { + "epoch": 3.5851516764236298, + "grad_norm": 0.2920851409435272, + "learning_rate": 3.769252667722278e-08, + "loss": 0.1699, + "step": 13473 + }, + { + "epoch": 3.5854177754124534, + "grad_norm": 0.29505959153175354, + "learning_rate": 3.76793180434244e-08, + "loss": 0.1733, + "step": 13474 + }, + { + "epoch": 3.585683874401277, + "grad_norm": 0.3416200280189514, + "learning_rate": 3.7666111187138516e-08, + "loss": 0.1934, + "step": 13475 + }, + { + "epoch": 3.5859499733901012, + "grad_norm": 0.3715469539165497, + "learning_rate": 3.7652906108741854e-08, + "loss": 0.1742, + "step": 13476 + }, + { + "epoch": 3.586216072378925, + "grad_norm": 0.434781938791275, + "learning_rate": 3.763970280861096e-08, + "loss": 0.205, + "step": 13477 + }, + { + "epoch": 3.5864821713677486, + "grad_norm": 0.29585573077201843, + "learning_rate": 3.762650128712247e-08, + "loss": 0.1768, + "step": 13478 + }, + { + "epoch": 3.5867482703565727, + "grad_norm": 0.32909929752349854, + "learning_rate": 3.7613301544652906e-08, + "loss": 0.1674, + "step": 13479 + }, + { + "epoch": 3.5870143693453964, + "grad_norm": 0.40552881360054016, + "learning_rate": 3.76001035815788e-08, + "loss": 0.1715, + "step": 13480 + }, + { + "epoch": 3.58728046833422, + "grad_norm": 0.3402256667613983, + "learning_rate": 3.7586907398276504e-08, + "loss": 0.174, + "step": 13481 + }, + { + "epoch": 3.5875465673230442, + "grad_norm": 0.32951050996780396, + "learning_rate": 3.7573712995122474e-08, + "loss": 0.185, + "step": 13482 + }, + { + "epoch": 3.587812666311868, + "grad_norm": 0.3102642297744751, + "learning_rate": 3.756052037249296e-08, + "loss": 0.1773, + "step": 13483 + }, + { + "epoch": 3.5880787653006916, + "grad_norm": 0.3016367256641388, + "learning_rate": 3.7547329530764306e-08, + "loss": 0.1585, + "step": 13484 + }, + { + "epoch": 3.5883448642895157, + "grad_norm": 0.3265334367752075, + "learning_rate": 3.753414047031272e-08, + "loss": 0.1831, + "step": 13485 + }, + { + "epoch": 3.5886109632783394, + "grad_norm": 0.2990509271621704, + "learning_rate": 3.752095319151442e-08, + "loss": 0.187, + "step": 13486 + }, + { + "epoch": 3.5888770622671635, + "grad_norm": 0.3494367003440857, + "learning_rate": 3.750776769474545e-08, + "loss": 0.1783, + "step": 13487 + }, + { + "epoch": 3.589143161255987, + "grad_norm": 0.2802756130695343, + "learning_rate": 3.7494583980381975e-08, + "loss": 0.1781, + "step": 13488 + }, + { + "epoch": 3.5894092602448113, + "grad_norm": 0.2628484070301056, + "learning_rate": 3.748140204879996e-08, + "loss": 0.1683, + "step": 13489 + }, + { + "epoch": 3.589675359233635, + "grad_norm": 0.286519318819046, + "learning_rate": 3.746822190037543e-08, + "loss": 0.1751, + "step": 13490 + }, + { + "epoch": 3.5899414582224587, + "grad_norm": 0.268540620803833, + "learning_rate": 3.745504353548424e-08, + "loss": 0.1814, + "step": 13491 + }, + { + "epoch": 3.590207557211283, + "grad_norm": 0.38605996966362, + "learning_rate": 3.744186695450228e-08, + "loss": 0.1645, + "step": 13492 + }, + { + "epoch": 3.5904736562001065, + "grad_norm": 0.37420356273651123, + "learning_rate": 3.7428692157805466e-08, + "loss": 0.1948, + "step": 13493 + }, + { + "epoch": 3.59073975518893, + "grad_norm": 0.2916076183319092, + "learning_rate": 3.741551914576947e-08, + "loss": 0.1858, + "step": 13494 + }, + { + "epoch": 3.5910058541777543, + "grad_norm": 1.0352915525436401, + "learning_rate": 3.7402347918770074e-08, + "loss": 0.1829, + "step": 13495 + }, + { + "epoch": 3.591271953166578, + "grad_norm": 0.3531738221645355, + "learning_rate": 3.7389178477182895e-08, + "loss": 0.1848, + "step": 13496 + }, + { + "epoch": 3.5915380521554017, + "grad_norm": 0.29450875520706177, + "learning_rate": 3.73760108213836e-08, + "loss": 0.1844, + "step": 13497 + }, + { + "epoch": 3.591804151144226, + "grad_norm": 0.2964146137237549, + "learning_rate": 3.7362844951747705e-08, + "loss": 0.1787, + "step": 13498 + }, + { + "epoch": 3.5920702501330495, + "grad_norm": 0.2552628517150879, + "learning_rate": 3.7349680868650754e-08, + "loss": 0.1708, + "step": 13499 + }, + { + "epoch": 3.592336349121873, + "grad_norm": 0.4171089828014374, + "learning_rate": 3.733651857246822e-08, + "loss": 0.1813, + "step": 13500 + }, + { + "epoch": 3.5926024481106973, + "grad_norm": 0.36626380681991577, + "learning_rate": 3.732335806357555e-08, + "loss": 0.2015, + "step": 13501 + }, + { + "epoch": 3.592868547099521, + "grad_norm": 0.36842289566993713, + "learning_rate": 3.7310199342348034e-08, + "loss": 0.181, + "step": 13502 + }, + { + "epoch": 3.5931346460883447, + "grad_norm": 0.40733227133750916, + "learning_rate": 3.729704240916106e-08, + "loss": 0.1871, + "step": 13503 + }, + { + "epoch": 3.593400745077169, + "grad_norm": 0.2856406271457672, + "learning_rate": 3.7283887264389825e-08, + "loss": 0.1772, + "step": 13504 + }, + { + "epoch": 3.5936668440659925, + "grad_norm": 0.4059846103191376, + "learning_rate": 3.7270733908409604e-08, + "loss": 0.1784, + "step": 13505 + }, + { + "epoch": 3.593932943054816, + "grad_norm": 0.3219764828681946, + "learning_rate": 3.725758234159545e-08, + "loss": 0.1781, + "step": 13506 + }, + { + "epoch": 3.5941990420436403, + "grad_norm": 0.33298951387405396, + "learning_rate": 3.724443256432262e-08, + "loss": 0.1855, + "step": 13507 + }, + { + "epoch": 3.594465141032464, + "grad_norm": 0.34254369139671326, + "learning_rate": 3.723128457696607e-08, + "loss": 0.2079, + "step": 13508 + }, + { + "epoch": 3.594731240021288, + "grad_norm": 0.37936246395111084, + "learning_rate": 3.7218138379900855e-08, + "loss": 0.1699, + "step": 13509 + }, + { + "epoch": 3.5949973390101118, + "grad_norm": 0.26021137833595276, + "learning_rate": 3.7204993973501884e-08, + "loss": 0.1653, + "step": 13510 + }, + { + "epoch": 3.5952634379989354, + "grad_norm": 0.3912879526615143, + "learning_rate": 3.71918513581441e-08, + "loss": 0.1759, + "step": 13511 + }, + { + "epoch": 3.5955295369877596, + "grad_norm": 0.26602035760879517, + "learning_rate": 3.717871053420237e-08, + "loss": 0.1663, + "step": 13512 + }, + { + "epoch": 3.5957956359765832, + "grad_norm": 0.3348656892776489, + "learning_rate": 3.716557150205145e-08, + "loss": 0.1891, + "step": 13513 + }, + { + "epoch": 3.5960617349654074, + "grad_norm": 0.29975855350494385, + "learning_rate": 3.715243426206611e-08, + "loss": 0.1709, + "step": 13514 + }, + { + "epoch": 3.596327833954231, + "grad_norm": 0.3781687915325165, + "learning_rate": 3.7139298814621054e-08, + "loss": 0.1843, + "step": 13515 + }, + { + "epoch": 3.5965939329430547, + "grad_norm": 0.44237983226776123, + "learning_rate": 3.7126165160090975e-08, + "loss": 0.1874, + "step": 13516 + }, + { + "epoch": 3.596860031931879, + "grad_norm": 0.3694855868816376, + "learning_rate": 3.711303329885039e-08, + "loss": 0.1857, + "step": 13517 + }, + { + "epoch": 3.5971261309207025, + "grad_norm": 0.2842940092086792, + "learning_rate": 3.7099903231273936e-08, + "loss": 0.1706, + "step": 13518 + }, + { + "epoch": 3.5973922299095262, + "grad_norm": 0.28671005368232727, + "learning_rate": 3.7086774957736025e-08, + "loss": 0.1894, + "step": 13519 + }, + { + "epoch": 3.5976583288983504, + "grad_norm": 0.3007357716560364, + "learning_rate": 3.707364847861112e-08, + "loss": 0.1721, + "step": 13520 + }, + { + "epoch": 3.597924427887174, + "grad_norm": 0.39561769366264343, + "learning_rate": 3.706052379427365e-08, + "loss": 0.1876, + "step": 13521 + }, + { + "epoch": 3.5981905268759977, + "grad_norm": 0.3083924949169159, + "learning_rate": 3.704740090509797e-08, + "loss": 0.1713, + "step": 13522 + }, + { + "epoch": 3.598456625864822, + "grad_norm": 0.2673228979110718, + "learning_rate": 3.7034279811458304e-08, + "loss": 0.1804, + "step": 13523 + }, + { + "epoch": 3.5987227248536455, + "grad_norm": 0.3265096843242645, + "learning_rate": 3.7021160513728966e-08, + "loss": 0.1448, + "step": 13524 + }, + { + "epoch": 3.598988823842469, + "grad_norm": 0.27479711174964905, + "learning_rate": 3.7008043012284076e-08, + "loss": 0.1929, + "step": 13525 + }, + { + "epoch": 3.5992549228312933, + "grad_norm": 0.35658085346221924, + "learning_rate": 3.6994927307497836e-08, + "loss": 0.1799, + "step": 13526 + }, + { + "epoch": 3.599521021820117, + "grad_norm": 0.2580820322036743, + "learning_rate": 3.698181339974428e-08, + "loss": 0.1651, + "step": 13527 + }, + { + "epoch": 3.5997871208089407, + "grad_norm": 0.3618791401386261, + "learning_rate": 3.6968701289397454e-08, + "loss": 0.1583, + "step": 13528 + }, + { + "epoch": 3.600053219797765, + "grad_norm": 0.38624581694602966, + "learning_rate": 3.6955590976831355e-08, + "loss": 0.1904, + "step": 13529 + }, + { + "epoch": 3.6003193187865885, + "grad_norm": 0.3872763216495514, + "learning_rate": 3.694248246241991e-08, + "loss": 0.1814, + "step": 13530 + }, + { + "epoch": 3.600585417775412, + "grad_norm": 0.3287528157234192, + "learning_rate": 3.692937574653704e-08, + "loss": 0.1747, + "step": 13531 + }, + { + "epoch": 3.6008515167642363, + "grad_norm": 0.3123336136341095, + "learning_rate": 3.6916270829556505e-08, + "loss": 0.1871, + "step": 13532 + }, + { + "epoch": 3.60111761575306, + "grad_norm": 0.28337591886520386, + "learning_rate": 3.690316771185216e-08, + "loss": 0.1474, + "step": 13533 + }, + { + "epoch": 3.601383714741884, + "grad_norm": 0.3566989004611969, + "learning_rate": 3.689006639379765e-08, + "loss": 0.1758, + "step": 13534 + }, + { + "epoch": 3.601649813730708, + "grad_norm": 0.29488447308540344, + "learning_rate": 3.687696687576669e-08, + "loss": 0.1883, + "step": 13535 + }, + { + "epoch": 3.601915912719532, + "grad_norm": 0.3428952395915985, + "learning_rate": 3.686386915813291e-08, + "loss": 0.1657, + "step": 13536 + }, + { + "epoch": 3.6021820117083556, + "grad_norm": 0.2789889872074127, + "learning_rate": 3.685077324126992e-08, + "loss": 0.1836, + "step": 13537 + }, + { + "epoch": 3.6024481106971793, + "grad_norm": 0.2754192054271698, + "learning_rate": 3.6837679125551156e-08, + "loss": 0.1658, + "step": 13538 + }, + { + "epoch": 3.6027142096860034, + "grad_norm": 0.2961655259132385, + "learning_rate": 3.6824586811350176e-08, + "loss": 0.163, + "step": 13539 + }, + { + "epoch": 3.602980308674827, + "grad_norm": 0.28226006031036377, + "learning_rate": 3.681149629904032e-08, + "loss": 0.1856, + "step": 13540 + }, + { + "epoch": 3.603246407663651, + "grad_norm": 0.2821267545223236, + "learning_rate": 3.679840758899504e-08, + "loss": 0.1769, + "step": 13541 + }, + { + "epoch": 3.603512506652475, + "grad_norm": 0.40049588680267334, + "learning_rate": 3.6785320681587565e-08, + "loss": 0.1748, + "step": 13542 + }, + { + "epoch": 3.6037786056412986, + "grad_norm": 0.30754297971725464, + "learning_rate": 3.6772235577191205e-08, + "loss": 0.1993, + "step": 13543 + }, + { + "epoch": 3.6040447046301223, + "grad_norm": 1.4723789691925049, + "learning_rate": 3.6759152276179176e-08, + "loss": 0.1909, + "step": 13544 + }, + { + "epoch": 3.6043108036189464, + "grad_norm": 0.29594770073890686, + "learning_rate": 3.674607077892466e-08, + "loss": 0.1816, + "step": 13545 + }, + { + "epoch": 3.60457690260777, + "grad_norm": 0.42160600423812866, + "learning_rate": 3.6732991085800714e-08, + "loss": 0.1944, + "step": 13546 + }, + { + "epoch": 3.6048430015965938, + "grad_norm": 0.38329392671585083, + "learning_rate": 3.671991319718043e-08, + "loss": 0.184, + "step": 13547 + }, + { + "epoch": 3.605109100585418, + "grad_norm": 0.2714586853981018, + "learning_rate": 3.670683711343684e-08, + "loss": 0.1712, + "step": 13548 + }, + { + "epoch": 3.6053751995742416, + "grad_norm": 0.33317938446998596, + "learning_rate": 3.6693762834942843e-08, + "loss": 0.1556, + "step": 13549 + }, + { + "epoch": 3.6056412985630653, + "grad_norm": 0.5530869364738464, + "learning_rate": 3.6680690362071377e-08, + "loss": 0.1783, + "step": 13550 + }, + { + "epoch": 3.6059073975518894, + "grad_norm": 0.2791384756565094, + "learning_rate": 3.6667619695195286e-08, + "loss": 0.1688, + "step": 13551 + }, + { + "epoch": 3.606173496540713, + "grad_norm": 0.27873358130455017, + "learning_rate": 3.665455083468742e-08, + "loss": 0.1783, + "step": 13552 + }, + { + "epoch": 3.6064395955295367, + "grad_norm": 0.27713078260421753, + "learning_rate": 3.6641483780920446e-08, + "loss": 0.1728, + "step": 13553 + }, + { + "epoch": 3.606705694518361, + "grad_norm": 0.355072945356369, + "learning_rate": 3.6628418534267146e-08, + "loss": 0.1817, + "step": 13554 + }, + { + "epoch": 3.6069717935071846, + "grad_norm": 0.28200986981391907, + "learning_rate": 3.66153550951001e-08, + "loss": 0.1897, + "step": 13555 + }, + { + "epoch": 3.6072378924960087, + "grad_norm": 0.2979382276535034, + "learning_rate": 3.6602293463791954e-08, + "loss": 0.1843, + "step": 13556 + }, + { + "epoch": 3.6075039914848324, + "grad_norm": 0.27582478523254395, + "learning_rate": 3.658923364071515e-08, + "loss": 0.1752, + "step": 13557 + }, + { + "epoch": 3.607770090473656, + "grad_norm": 0.2773403525352478, + "learning_rate": 3.6576175626242344e-08, + "loss": 0.1493, + "step": 13558 + }, + { + "epoch": 3.60803618946248, + "grad_norm": 0.49027320742607117, + "learning_rate": 3.656311942074586e-08, + "loss": 0.1786, + "step": 13559 + }, + { + "epoch": 3.608302288451304, + "grad_norm": 0.28140369057655334, + "learning_rate": 3.6550065024598145e-08, + "loss": 0.181, + "step": 13560 + }, + { + "epoch": 3.608568387440128, + "grad_norm": 0.4120008051395416, + "learning_rate": 3.653701243817149e-08, + "loss": 0.1863, + "step": 13561 + }, + { + "epoch": 3.6088344864289517, + "grad_norm": 0.2992574870586395, + "learning_rate": 3.652396166183822e-08, + "loss": 0.1872, + "step": 13562 + }, + { + "epoch": 3.6091005854177753, + "grad_norm": 0.35586830973625183, + "learning_rate": 3.651091269597053e-08, + "loss": 0.1967, + "step": 13563 + }, + { + "epoch": 3.6093666844065995, + "grad_norm": 0.28001514077186584, + "learning_rate": 3.649786554094062e-08, + "loss": 0.1743, + "step": 13564 + }, + { + "epoch": 3.609632783395423, + "grad_norm": 0.3210647404193878, + "learning_rate": 3.6484820197120635e-08, + "loss": 0.172, + "step": 13565 + }, + { + "epoch": 3.609898882384247, + "grad_norm": 0.28350621461868286, + "learning_rate": 3.647177666488263e-08, + "loss": 0.1641, + "step": 13566 + }, + { + "epoch": 3.610164981373071, + "grad_norm": 0.286811500787735, + "learning_rate": 3.645873494459871e-08, + "loss": 0.1576, + "step": 13567 + }, + { + "epoch": 3.6104310803618946, + "grad_norm": 0.3260350823402405, + "learning_rate": 3.6445695036640746e-08, + "loss": 0.2097, + "step": 13568 + }, + { + "epoch": 3.6106971793507183, + "grad_norm": 0.26002514362335205, + "learning_rate": 3.6432656941380733e-08, + "loss": 0.1841, + "step": 13569 + }, + { + "epoch": 3.6109632783395424, + "grad_norm": 0.4296674430370331, + "learning_rate": 3.6419620659190496e-08, + "loss": 0.1924, + "step": 13570 + }, + { + "epoch": 3.611229377328366, + "grad_norm": 0.29478588700294495, + "learning_rate": 3.640658619044188e-08, + "loss": 0.2028, + "step": 13571 + }, + { + "epoch": 3.61149547631719, + "grad_norm": 0.43197470903396606, + "learning_rate": 3.639355353550664e-08, + "loss": 0.1942, + "step": 13572 + }, + { + "epoch": 3.611761575306014, + "grad_norm": 0.38442882895469666, + "learning_rate": 3.6380522694756553e-08, + "loss": 0.1746, + "step": 13573 + }, + { + "epoch": 3.6120276742948376, + "grad_norm": 0.27473849058151245, + "learning_rate": 3.636749366856321e-08, + "loss": 0.1632, + "step": 13574 + }, + { + "epoch": 3.6122937732836613, + "grad_norm": 0.26281261444091797, + "learning_rate": 3.6354466457298284e-08, + "loss": 0.1751, + "step": 13575 + }, + { + "epoch": 3.6125598722724854, + "grad_norm": 0.4071154296398163, + "learning_rate": 3.6341441061333265e-08, + "loss": 0.1817, + "step": 13576 + }, + { + "epoch": 3.612825971261309, + "grad_norm": 0.3710656464099884, + "learning_rate": 3.6328417481039755e-08, + "loss": 0.1774, + "step": 13577 + }, + { + "epoch": 3.613092070250133, + "grad_norm": 0.30148130655288696, + "learning_rate": 3.631539571678912e-08, + "loss": 0.1666, + "step": 13578 + }, + { + "epoch": 3.613358169238957, + "grad_norm": 0.2613242268562317, + "learning_rate": 3.630237576895282e-08, + "loss": 0.1441, + "step": 13579 + }, + { + "epoch": 3.6136242682277806, + "grad_norm": 0.3536601662635803, + "learning_rate": 3.6289357637902206e-08, + "loss": 0.1793, + "step": 13580 + }, + { + "epoch": 3.6138903672166047, + "grad_norm": 0.2787293493747711, + "learning_rate": 3.627634132400861e-08, + "loss": 0.1742, + "step": 13581 + }, + { + "epoch": 3.6141564662054284, + "grad_norm": 0.3376922309398651, + "learning_rate": 3.6263326827643215e-08, + "loss": 0.1801, + "step": 13582 + }, + { + "epoch": 3.6144225651942525, + "grad_norm": 0.46193432807922363, + "learning_rate": 3.625031414917728e-08, + "loss": 0.1701, + "step": 13583 + }, + { + "epoch": 3.614688664183076, + "grad_norm": 0.2868511378765106, + "learning_rate": 3.6237303288981916e-08, + "loss": 0.1787, + "step": 13584 + }, + { + "epoch": 3.6149547631719, + "grad_norm": 0.27264735102653503, + "learning_rate": 3.622429424742823e-08, + "loss": 0.1623, + "step": 13585 + }, + { + "epoch": 3.615220862160724, + "grad_norm": 0.27312877774238586, + "learning_rate": 3.621128702488727e-08, + "loss": 0.1736, + "step": 13586 + }, + { + "epoch": 3.6154869611495477, + "grad_norm": 0.335383802652359, + "learning_rate": 3.619828162173002e-08, + "loss": 0.1659, + "step": 13587 + }, + { + "epoch": 3.6157530601383714, + "grad_norm": 0.27454251050949097, + "learning_rate": 3.618527803832747e-08, + "loss": 0.1837, + "step": 13588 + }, + { + "epoch": 3.6160191591271955, + "grad_norm": 0.2795220613479614, + "learning_rate": 3.617227627505044e-08, + "loss": 0.1783, + "step": 13589 + }, + { + "epoch": 3.616285258116019, + "grad_norm": 0.2833915054798126, + "learning_rate": 3.615927633226983e-08, + "loss": 0.1684, + "step": 13590 + }, + { + "epoch": 3.616551357104843, + "grad_norm": 0.28855231404304504, + "learning_rate": 3.614627821035634e-08, + "loss": 0.1844, + "step": 13591 + }, + { + "epoch": 3.616817456093667, + "grad_norm": 0.27259907126426697, + "learning_rate": 3.6133281909680805e-08, + "loss": 0.1846, + "step": 13592 + }, + { + "epoch": 3.6170835550824907, + "grad_norm": 0.36527785658836365, + "learning_rate": 3.6120287430613806e-08, + "loss": 0.1806, + "step": 13593 + }, + { + "epoch": 3.6173496540713144, + "grad_norm": 0.377534419298172, + "learning_rate": 3.610729477352602e-08, + "loss": 0.1904, + "step": 13594 + }, + { + "epoch": 3.6176157530601385, + "grad_norm": 0.36161866784095764, + "learning_rate": 3.609430393878803e-08, + "loss": 0.156, + "step": 13595 + }, + { + "epoch": 3.617881852048962, + "grad_norm": 0.33562952280044556, + "learning_rate": 3.608131492677038e-08, + "loss": 0.1785, + "step": 13596 + }, + { + "epoch": 3.618147951037786, + "grad_norm": 0.27278923988342285, + "learning_rate": 3.606832773784348e-08, + "loss": 0.181, + "step": 13597 + }, + { + "epoch": 3.61841405002661, + "grad_norm": 0.264975905418396, + "learning_rate": 3.6055342372377836e-08, + "loss": 0.164, + "step": 13598 + }, + { + "epoch": 3.6186801490154337, + "grad_norm": 0.37358468770980835, + "learning_rate": 3.604235883074374e-08, + "loss": 0.176, + "step": 13599 + }, + { + "epoch": 3.6189462480042573, + "grad_norm": 0.38052424788475037, + "learning_rate": 3.6029377113311524e-08, + "loss": 0.1877, + "step": 13600 + }, + { + "epoch": 3.6192123469930815, + "grad_norm": 0.27776038646698, + "learning_rate": 3.601639722045147e-08, + "loss": 0.1801, + "step": 13601 + }, + { + "epoch": 3.619478445981905, + "grad_norm": 0.27770715951919556, + "learning_rate": 3.6003419152533785e-08, + "loss": 0.1719, + "step": 13602 + }, + { + "epoch": 3.619744544970729, + "grad_norm": 0.38827750086784363, + "learning_rate": 3.599044290992867e-08, + "loss": 0.1626, + "step": 13603 + }, + { + "epoch": 3.620010643959553, + "grad_norm": 0.3863622844219208, + "learning_rate": 3.597746849300617e-08, + "loss": 0.185, + "step": 13604 + }, + { + "epoch": 3.6202767429483766, + "grad_norm": 0.3819751739501953, + "learning_rate": 3.596449590213639e-08, + "loss": 0.1763, + "step": 13605 + }, + { + "epoch": 3.6205428419372008, + "grad_norm": 0.3380923867225647, + "learning_rate": 3.595152513768929e-08, + "loss": 0.1752, + "step": 13606 + }, + { + "epoch": 3.6208089409260245, + "grad_norm": 0.2961583137512207, + "learning_rate": 3.5938556200034876e-08, + "loss": 0.1734, + "step": 13607 + }, + { + "epoch": 3.6210750399148486, + "grad_norm": 0.3794705867767334, + "learning_rate": 3.5925589089542955e-08, + "loss": 0.1866, + "step": 13608 + }, + { + "epoch": 3.6213411389036723, + "grad_norm": 0.42258429527282715, + "learning_rate": 3.59126238065835e-08, + "loss": 0.1751, + "step": 13609 + }, + { + "epoch": 3.621607237892496, + "grad_norm": 0.3804038166999817, + "learning_rate": 3.5899660351526205e-08, + "loss": 0.1701, + "step": 13610 + }, + { + "epoch": 3.62187333688132, + "grad_norm": 0.294336199760437, + "learning_rate": 3.58866987247409e-08, + "loss": 0.1797, + "step": 13611 + }, + { + "epoch": 3.6221394358701438, + "grad_norm": 0.45149949193000793, + "learning_rate": 3.58737389265972e-08, + "loss": 0.1784, + "step": 13612 + }, + { + "epoch": 3.6224055348589674, + "grad_norm": 0.47433358430862427, + "learning_rate": 3.58607809574648e-08, + "loss": 0.1823, + "step": 13613 + }, + { + "epoch": 3.6226716338477916, + "grad_norm": 0.3703540563583374, + "learning_rate": 3.5847824817713234e-08, + "loss": 0.1832, + "step": 13614 + }, + { + "epoch": 3.6229377328366152, + "grad_norm": 0.30012232065200806, + "learning_rate": 3.583487050771208e-08, + "loss": 0.184, + "step": 13615 + }, + { + "epoch": 3.623203831825439, + "grad_norm": 0.34804099798202515, + "learning_rate": 3.582191802783079e-08, + "loss": 0.18, + "step": 13616 + }, + { + "epoch": 3.623469930814263, + "grad_norm": 0.38305747509002686, + "learning_rate": 3.580896737843886e-08, + "loss": 0.1953, + "step": 13617 + }, + { + "epoch": 3.6237360298030867, + "grad_norm": 0.2681860327720642, + "learning_rate": 3.5796018559905584e-08, + "loss": 0.1592, + "step": 13618 + }, + { + "epoch": 3.6240021287919104, + "grad_norm": 0.263520210981369, + "learning_rate": 3.5783071572600363e-08, + "loss": 0.1785, + "step": 13619 + }, + { + "epoch": 3.6242682277807345, + "grad_norm": 0.26447778940200806, + "learning_rate": 3.57701264168924e-08, + "loss": 0.1573, + "step": 13620 + }, + { + "epoch": 3.6245343267695582, + "grad_norm": 0.2824147343635559, + "learning_rate": 3.5757183093150955e-08, + "loss": 0.1816, + "step": 13621 + }, + { + "epoch": 3.624800425758382, + "grad_norm": 0.2748565077781677, + "learning_rate": 3.5744241601745195e-08, + "loss": 0.1648, + "step": 13622 + }, + { + "epoch": 3.625066524747206, + "grad_norm": 0.26256826519966125, + "learning_rate": 3.573130194304425e-08, + "loss": 0.174, + "step": 13623 + }, + { + "epoch": 3.6253326237360297, + "grad_norm": 0.33063480257987976, + "learning_rate": 3.571836411741719e-08, + "loss": 0.1781, + "step": 13624 + }, + { + "epoch": 3.6255987227248534, + "grad_norm": 0.29507118463516235, + "learning_rate": 3.5705428125232986e-08, + "loss": 0.1606, + "step": 13625 + }, + { + "epoch": 3.6258648217136775, + "grad_norm": 0.27084609866142273, + "learning_rate": 3.569249396686065e-08, + "loss": 0.176, + "step": 13626 + }, + { + "epoch": 3.626130920702501, + "grad_norm": 0.3384082317352295, + "learning_rate": 3.5679561642669045e-08, + "loss": 0.1848, + "step": 13627 + }, + { + "epoch": 3.6263970196913253, + "grad_norm": 0.28197145462036133, + "learning_rate": 3.566663115302707e-08, + "loss": 0.1752, + "step": 13628 + }, + { + "epoch": 3.626663118680149, + "grad_norm": 0.28003445267677307, + "learning_rate": 3.5653702498303475e-08, + "loss": 0.1672, + "step": 13629 + }, + { + "epoch": 3.6269292176689727, + "grad_norm": 0.4415486752986908, + "learning_rate": 3.5640775678867055e-08, + "loss": 0.1961, + "step": 13630 + }, + { + "epoch": 3.627195316657797, + "grad_norm": 0.2794036567211151, + "learning_rate": 3.562785069508648e-08, + "loss": 0.1763, + "step": 13631 + }, + { + "epoch": 3.6274614156466205, + "grad_norm": 0.26792749762535095, + "learning_rate": 3.5614927547330454e-08, + "loss": 0.1818, + "step": 13632 + }, + { + "epoch": 3.6277275146354446, + "grad_norm": 0.27910998463630676, + "learning_rate": 3.5602006235967496e-08, + "loss": 0.1808, + "step": 13633 + }, + { + "epoch": 3.6279936136242683, + "grad_norm": 0.2790265381336212, + "learning_rate": 3.5589086761366215e-08, + "loss": 0.185, + "step": 13634 + }, + { + "epoch": 3.628259712613092, + "grad_norm": 0.34922105073928833, + "learning_rate": 3.557616912389504e-08, + "loss": 0.1701, + "step": 13635 + }, + { + "epoch": 3.628525811601916, + "grad_norm": 0.2491264045238495, + "learning_rate": 3.5563253323922436e-08, + "loss": 0.1532, + "step": 13636 + }, + { + "epoch": 3.62879191059074, + "grad_norm": 0.2688121497631073, + "learning_rate": 3.5550339361816783e-08, + "loss": 0.1695, + "step": 13637 + }, + { + "epoch": 3.6290580095795635, + "grad_norm": 0.2851217985153198, + "learning_rate": 3.553742723794643e-08, + "loss": 0.1746, + "step": 13638 + }, + { + "epoch": 3.6293241085683876, + "grad_norm": 0.2732401490211487, + "learning_rate": 3.5524516952679674e-08, + "loss": 0.1853, + "step": 13639 + }, + { + "epoch": 3.6295902075572113, + "grad_norm": 0.3184315860271454, + "learning_rate": 3.551160850638468e-08, + "loss": 0.1829, + "step": 13640 + }, + { + "epoch": 3.629856306546035, + "grad_norm": 0.40055447816848755, + "learning_rate": 3.54987018994297e-08, + "loss": 0.1764, + "step": 13641 + }, + { + "epoch": 3.630122405534859, + "grad_norm": 0.3746151924133301, + "learning_rate": 3.548579713218277e-08, + "loss": 0.2036, + "step": 13642 + }, + { + "epoch": 3.630388504523683, + "grad_norm": 0.3851032555103302, + "learning_rate": 3.547289420501204e-08, + "loss": 0.1926, + "step": 13643 + }, + { + "epoch": 3.6306546035125065, + "grad_norm": 0.27845045924186707, + "learning_rate": 3.5459993118285467e-08, + "loss": 0.1831, + "step": 13644 + }, + { + "epoch": 3.6309207025013306, + "grad_norm": 0.3904474377632141, + "learning_rate": 3.544709387237104e-08, + "loss": 0.1826, + "step": 13645 + }, + { + "epoch": 3.6311868014901543, + "grad_norm": 0.4013054668903351, + "learning_rate": 3.543419646763667e-08, + "loss": 0.1808, + "step": 13646 + }, + { + "epoch": 3.631452900478978, + "grad_norm": 0.37709012627601624, + "learning_rate": 3.5421300904450247e-08, + "loss": 0.1809, + "step": 13647 + }, + { + "epoch": 3.631718999467802, + "grad_norm": 0.42944350838661194, + "learning_rate": 3.540840718317953e-08, + "loss": 0.196, + "step": 13648 + }, + { + "epoch": 3.6319850984566258, + "grad_norm": 0.2720871567726135, + "learning_rate": 3.5395515304192324e-08, + "loss": 0.1767, + "step": 13649 + }, + { + "epoch": 3.6322511974454494, + "grad_norm": 0.26149851083755493, + "learning_rate": 3.538262526785628e-08, + "loss": 0.1742, + "step": 13650 + }, + { + "epoch": 3.6325172964342736, + "grad_norm": 0.33965882658958435, + "learning_rate": 3.536973707453906e-08, + "loss": 0.1708, + "step": 13651 + }, + { + "epoch": 3.6327833954230973, + "grad_norm": 0.3365662395954132, + "learning_rate": 3.535685072460828e-08, + "loss": 0.1884, + "step": 13652 + }, + { + "epoch": 3.6330494944119214, + "grad_norm": 0.40066269040107727, + "learning_rate": 3.534396621843151e-08, + "loss": 0.1669, + "step": 13653 + }, + { + "epoch": 3.633315593400745, + "grad_norm": 0.29446879029273987, + "learning_rate": 3.533108355637618e-08, + "loss": 0.1785, + "step": 13654 + }, + { + "epoch": 3.633581692389569, + "grad_norm": 0.3700595796108246, + "learning_rate": 3.53182027388098e-08, + "loss": 0.1737, + "step": 13655 + }, + { + "epoch": 3.633847791378393, + "grad_norm": 0.2671910226345062, + "learning_rate": 3.530532376609967e-08, + "loss": 0.1688, + "step": 13656 + }, + { + "epoch": 3.6341138903672165, + "grad_norm": 0.26281481981277466, + "learning_rate": 3.5292446638613184e-08, + "loss": 0.1766, + "step": 13657 + }, + { + "epoch": 3.6343799893560407, + "grad_norm": 0.41263461112976074, + "learning_rate": 3.52795713567176e-08, + "loss": 0.1946, + "step": 13658 + }, + { + "epoch": 3.6346460883448644, + "grad_norm": 0.41617441177368164, + "learning_rate": 3.526669792078016e-08, + "loss": 0.2076, + "step": 13659 + }, + { + "epoch": 3.634912187333688, + "grad_norm": 0.40302789211273193, + "learning_rate": 3.525382633116809e-08, + "loss": 0.197, + "step": 13660 + }, + { + "epoch": 3.635178286322512, + "grad_norm": 0.3295404613018036, + "learning_rate": 3.524095658824842e-08, + "loss": 0.1884, + "step": 13661 + }, + { + "epoch": 3.635444385311336, + "grad_norm": 0.33919912576675415, + "learning_rate": 3.5228088692388294e-08, + "loss": 0.1803, + "step": 13662 + }, + { + "epoch": 3.6357104843001595, + "grad_norm": 0.3405972421169281, + "learning_rate": 3.5215222643954667e-08, + "loss": 0.1885, + "step": 13663 + }, + { + "epoch": 3.6359765832889837, + "grad_norm": 0.30098727345466614, + "learning_rate": 3.5202358443314586e-08, + "loss": 0.1764, + "step": 13664 + }, + { + "epoch": 3.6362426822778073, + "grad_norm": 0.32832568883895874, + "learning_rate": 3.518949609083487e-08, + "loss": 0.1724, + "step": 13665 + }, + { + "epoch": 3.636508781266631, + "grad_norm": 0.26862528920173645, + "learning_rate": 3.517663558688243e-08, + "loss": 0.1841, + "step": 13666 + }, + { + "epoch": 3.636774880255455, + "grad_norm": 0.30150333046913147, + "learning_rate": 3.516377693182407e-08, + "loss": 0.1715, + "step": 13667 + }, + { + "epoch": 3.637040979244279, + "grad_norm": 0.3719223737716675, + "learning_rate": 3.515092012602659e-08, + "loss": 0.192, + "step": 13668 + }, + { + "epoch": 3.6373070782331025, + "grad_norm": 0.45665082335472107, + "learning_rate": 3.513806516985659e-08, + "loss": 0.2009, + "step": 13669 + }, + { + "epoch": 3.6375731772219266, + "grad_norm": 0.2697945535182953, + "learning_rate": 3.512521206368083e-08, + "loss": 0.1713, + "step": 13670 + }, + { + "epoch": 3.6378392762107503, + "grad_norm": 0.381109356880188, + "learning_rate": 3.511236080786581e-08, + "loss": 0.1753, + "step": 13671 + }, + { + "epoch": 3.638105375199574, + "grad_norm": 0.37634801864624023, + "learning_rate": 3.5099511402778126e-08, + "loss": 0.1825, + "step": 13672 + }, + { + "epoch": 3.638371474188398, + "grad_norm": 0.40824174880981445, + "learning_rate": 3.508666384878426e-08, + "loss": 0.1913, + "step": 13673 + }, + { + "epoch": 3.638637573177222, + "grad_norm": 0.272536039352417, + "learning_rate": 3.507381814625069e-08, + "loss": 0.1723, + "step": 13674 + }, + { + "epoch": 3.638903672166046, + "grad_norm": 0.35335108637809753, + "learning_rate": 3.506097429554372e-08, + "loss": 0.173, + "step": 13675 + }, + { + "epoch": 3.6391697711548696, + "grad_norm": 0.37494295835494995, + "learning_rate": 3.504813229702972e-08, + "loss": 0.1833, + "step": 13676 + }, + { + "epoch": 3.6394358701436933, + "grad_norm": 0.3109087347984314, + "learning_rate": 3.5035292151075026e-08, + "loss": 0.1792, + "step": 13677 + }, + { + "epoch": 3.6397019691325174, + "grad_norm": 0.27704814076423645, + "learning_rate": 3.502245385804576e-08, + "loss": 0.1786, + "step": 13678 + }, + { + "epoch": 3.639968068121341, + "grad_norm": 0.2904585897922516, + "learning_rate": 3.50096174183082e-08, + "loss": 0.1726, + "step": 13679 + }, + { + "epoch": 3.6402341671101652, + "grad_norm": 0.30661702156066895, + "learning_rate": 3.4996782832228375e-08, + "loss": 0.1819, + "step": 13680 + }, + { + "epoch": 3.640500266098989, + "grad_norm": 0.2716301381587982, + "learning_rate": 3.498395010017241e-08, + "loss": 0.1552, + "step": 13681 + }, + { + "epoch": 3.6407663650878126, + "grad_norm": 0.3504084646701813, + "learning_rate": 3.4971119222506295e-08, + "loss": 0.205, + "step": 13682 + }, + { + "epoch": 3.6410324640766367, + "grad_norm": 0.30229437351226807, + "learning_rate": 3.495829019959604e-08, + "loss": 0.183, + "step": 13683 + }, + { + "epoch": 3.6412985630654604, + "grad_norm": 0.3585937023162842, + "learning_rate": 3.49454630318075e-08, + "loss": 0.1886, + "step": 13684 + }, + { + "epoch": 3.641564662054284, + "grad_norm": 0.2798503637313843, + "learning_rate": 3.4932637719506564e-08, + "loss": 0.1713, + "step": 13685 + }, + { + "epoch": 3.641830761043108, + "grad_norm": 0.41858118772506714, + "learning_rate": 3.4919814263059e-08, + "loss": 0.1684, + "step": 13686 + }, + { + "epoch": 3.642096860031932, + "grad_norm": 0.33343666791915894, + "learning_rate": 3.490699266283058e-08, + "loss": 0.1814, + "step": 13687 + }, + { + "epoch": 3.6423629590207556, + "grad_norm": 0.38117775321006775, + "learning_rate": 3.4894172919187e-08, + "loss": 0.1881, + "step": 13688 + }, + { + "epoch": 3.6426290580095797, + "grad_norm": 0.2795350253582001, + "learning_rate": 3.4881355032493954e-08, + "loss": 0.1684, + "step": 13689 + }, + { + "epoch": 3.6428951569984034, + "grad_norm": 0.39094772934913635, + "learning_rate": 3.486853900311696e-08, + "loss": 0.1907, + "step": 13690 + }, + { + "epoch": 3.643161255987227, + "grad_norm": 0.2514891028404236, + "learning_rate": 3.485572483142161e-08, + "loss": 0.1674, + "step": 13691 + }, + { + "epoch": 3.643427354976051, + "grad_norm": 0.28680849075317383, + "learning_rate": 3.4842912517773335e-08, + "loss": 0.1785, + "step": 13692 + }, + { + "epoch": 3.643693453964875, + "grad_norm": 0.4174155294895172, + "learning_rate": 3.48301020625376e-08, + "loss": 0.1706, + "step": 13693 + }, + { + "epoch": 3.6439595529536986, + "grad_norm": 0.25649431347846985, + "learning_rate": 3.4817293466079824e-08, + "loss": 0.1746, + "step": 13694 + }, + { + "epoch": 3.6442256519425227, + "grad_norm": 0.2797492742538452, + "learning_rate": 3.4804486728765225e-08, + "loss": 0.1765, + "step": 13695 + }, + { + "epoch": 3.6444917509313464, + "grad_norm": 0.28008732199668884, + "learning_rate": 3.479168185095922e-08, + "loss": 0.1799, + "step": 13696 + }, + { + "epoch": 3.64475784992017, + "grad_norm": 0.28618916869163513, + "learning_rate": 3.477887883302692e-08, + "loss": 0.1773, + "step": 13697 + }, + { + "epoch": 3.645023948908994, + "grad_norm": 0.36942416429519653, + "learning_rate": 3.476607767533356e-08, + "loss": 0.1855, + "step": 13698 + }, + { + "epoch": 3.645290047897818, + "grad_norm": 0.2561250329017639, + "learning_rate": 3.47532783782442e-08, + "loss": 0.1644, + "step": 13699 + }, + { + "epoch": 3.645556146886642, + "grad_norm": 0.26931703090667725, + "learning_rate": 3.474048094212396e-08, + "loss": 0.1623, + "step": 13700 + }, + { + "epoch": 3.6458222458754657, + "grad_norm": 0.2579996883869171, + "learning_rate": 3.472768536733779e-08, + "loss": 0.1441, + "step": 13701 + }, + { + "epoch": 3.64608834486429, + "grad_norm": 0.2832227647304535, + "learning_rate": 3.471489165425067e-08, + "loss": 0.183, + "step": 13702 + }, + { + "epoch": 3.6463544438531135, + "grad_norm": 0.2921101748943329, + "learning_rate": 3.470209980322749e-08, + "loss": 0.1793, + "step": 13703 + }, + { + "epoch": 3.646620542841937, + "grad_norm": 0.3316712975502014, + "learning_rate": 3.468930981463316e-08, + "loss": 0.1717, + "step": 13704 + }, + { + "epoch": 3.6468866418307613, + "grad_norm": 0.34432685375213623, + "learning_rate": 3.467652168883241e-08, + "loss": 0.1795, + "step": 13705 + }, + { + "epoch": 3.647152740819585, + "grad_norm": 0.3757190704345703, + "learning_rate": 3.4663735426190024e-08, + "loss": 0.2007, + "step": 13706 + }, + { + "epoch": 3.6474188398084086, + "grad_norm": 0.3410428464412689, + "learning_rate": 3.465095102707064e-08, + "loss": 0.1641, + "step": 13707 + }, + { + "epoch": 3.6476849387972328, + "grad_norm": 0.2803999185562134, + "learning_rate": 3.463816849183897e-08, + "loss": 0.1752, + "step": 13708 + }, + { + "epoch": 3.6479510377860564, + "grad_norm": 0.27210676670074463, + "learning_rate": 3.4625387820859485e-08, + "loss": 0.1729, + "step": 13709 + }, + { + "epoch": 3.64821713677488, + "grad_norm": 0.2856478691101074, + "learning_rate": 3.4612609014496854e-08, + "loss": 0.1647, + "step": 13710 + }, + { + "epoch": 3.6484832357637043, + "grad_norm": 0.27198612689971924, + "learning_rate": 3.459983207311545e-08, + "loss": 0.1561, + "step": 13711 + }, + { + "epoch": 3.648749334752528, + "grad_norm": 0.2552433907985687, + "learning_rate": 3.458705699707975e-08, + "loss": 0.1631, + "step": 13712 + }, + { + "epoch": 3.6490154337413516, + "grad_norm": 0.25526222586631775, + "learning_rate": 3.457428378675412e-08, + "loss": 0.1764, + "step": 13713 + }, + { + "epoch": 3.6492815327301757, + "grad_norm": 0.32582056522369385, + "learning_rate": 3.456151244250285e-08, + "loss": 0.1833, + "step": 13714 + }, + { + "epoch": 3.6495476317189994, + "grad_norm": 0.24968495965003967, + "learning_rate": 3.454874296469026e-08, + "loss": 0.1666, + "step": 13715 + }, + { + "epoch": 3.649813730707823, + "grad_norm": 0.3639943599700928, + "learning_rate": 3.453597535368048e-08, + "loss": 0.1775, + "step": 13716 + }, + { + "epoch": 3.6500798296966472, + "grad_norm": 0.27029380202293396, + "learning_rate": 3.452320960983771e-08, + "loss": 0.1801, + "step": 13717 + }, + { + "epoch": 3.650345928685471, + "grad_norm": 0.2978138327598572, + "learning_rate": 3.451044573352606e-08, + "loss": 0.1951, + "step": 13718 + }, + { + "epoch": 3.6506120276742946, + "grad_norm": 0.285328209400177, + "learning_rate": 3.449768372510962e-08, + "loss": 0.1857, + "step": 13719 + }, + { + "epoch": 3.6508781266631187, + "grad_norm": 0.33754611015319824, + "learning_rate": 3.448492358495231e-08, + "loss": 0.1881, + "step": 13720 + }, + { + "epoch": 3.6511442256519424, + "grad_norm": 0.2777082026004791, + "learning_rate": 3.447216531341815e-08, + "loss": 0.1665, + "step": 13721 + }, + { + "epoch": 3.651410324640766, + "grad_norm": 0.36509355902671814, + "learning_rate": 3.4459408910870944e-08, + "loss": 0.1914, + "step": 13722 + }, + { + "epoch": 3.65167642362959, + "grad_norm": 0.32080215215682983, + "learning_rate": 3.44466543776746e-08, + "loss": 0.1839, + "step": 13723 + }, + { + "epoch": 3.651942522618414, + "grad_norm": 0.3083869516849518, + "learning_rate": 3.443390171419287e-08, + "loss": 0.1732, + "step": 13724 + }, + { + "epoch": 3.652208621607238, + "grad_norm": 0.30693989992141724, + "learning_rate": 3.442115092078954e-08, + "loss": 0.172, + "step": 13725 + }, + { + "epoch": 3.6524747205960617, + "grad_norm": 0.2621781527996063, + "learning_rate": 3.4408401997828216e-08, + "loss": 0.1783, + "step": 13726 + }, + { + "epoch": 3.652740819584886, + "grad_norm": 0.283721387386322, + "learning_rate": 3.439565494567258e-08, + "loss": 0.1807, + "step": 13727 + }, + { + "epoch": 3.6530069185737095, + "grad_norm": 0.2821727991104126, + "learning_rate": 3.438290976468615e-08, + "loss": 0.1805, + "step": 13728 + }, + { + "epoch": 3.653273017562533, + "grad_norm": 0.36112791299819946, + "learning_rate": 3.437016645523252e-08, + "loss": 0.1731, + "step": 13729 + }, + { + "epoch": 3.6535391165513573, + "grad_norm": 0.4406691789627075, + "learning_rate": 3.435742501767506e-08, + "loss": 0.1636, + "step": 13730 + }, + { + "epoch": 3.653805215540181, + "grad_norm": 0.3681863844394684, + "learning_rate": 3.4344685452377245e-08, + "loss": 0.1887, + "step": 13731 + }, + { + "epoch": 3.6540713145290047, + "grad_norm": 0.29094457626342773, + "learning_rate": 3.433194775970241e-08, + "loss": 0.1601, + "step": 13732 + }, + { + "epoch": 3.654337413517829, + "grad_norm": 0.40209779143333435, + "learning_rate": 3.431921194001386e-08, + "loss": 0.1869, + "step": 13733 + }, + { + "epoch": 3.6546035125066525, + "grad_norm": 0.2932327091693878, + "learning_rate": 3.43064779936749e-08, + "loss": 0.1759, + "step": 13734 + }, + { + "epoch": 3.654869611495476, + "grad_norm": 0.2657874524593353, + "learning_rate": 3.429374592104864e-08, + "loss": 0.1824, + "step": 13735 + }, + { + "epoch": 3.6551357104843003, + "grad_norm": 0.33471348881721497, + "learning_rate": 3.428101572249832e-08, + "loss": 0.1695, + "step": 13736 + }, + { + "epoch": 3.655401809473124, + "grad_norm": 0.2833622992038727, + "learning_rate": 3.4268287398386933e-08, + "loss": 0.1792, + "step": 13737 + }, + { + "epoch": 3.6556679084619477, + "grad_norm": 0.3657865524291992, + "learning_rate": 3.425556094907756e-08, + "loss": 0.1617, + "step": 13738 + }, + { + "epoch": 3.655934007450772, + "grad_norm": 0.28679269552230835, + "learning_rate": 3.424283637493319e-08, + "loss": 0.1687, + "step": 13739 + }, + { + "epoch": 3.6562001064395955, + "grad_norm": 0.2759314477443695, + "learning_rate": 3.423011367631679e-08, + "loss": 0.1694, + "step": 13740 + }, + { + "epoch": 3.656466205428419, + "grad_norm": 0.4644610285758972, + "learning_rate": 3.421739285359115e-08, + "loss": 0.193, + "step": 13741 + }, + { + "epoch": 3.6567323044172433, + "grad_norm": 0.30439379811286926, + "learning_rate": 3.4204673907119194e-08, + "loss": 0.1691, + "step": 13742 + }, + { + "epoch": 3.656998403406067, + "grad_norm": 0.3565053939819336, + "learning_rate": 3.4191956837263605e-08, + "loss": 0.1749, + "step": 13743 + }, + { + "epoch": 3.6572645023948906, + "grad_norm": 0.35210973024368286, + "learning_rate": 3.4179241644387156e-08, + "loss": 0.1919, + "step": 13744 + }, + { + "epoch": 3.6575306013837148, + "grad_norm": 0.3030959963798523, + "learning_rate": 3.4166528328852474e-08, + "loss": 0.1932, + "step": 13745 + }, + { + "epoch": 3.6577967003725385, + "grad_norm": 0.32314786314964294, + "learning_rate": 3.415381689102217e-08, + "loss": 0.1941, + "step": 13746 + }, + { + "epoch": 3.6580627993613626, + "grad_norm": 0.2813255488872528, + "learning_rate": 3.414110733125882e-08, + "loss": 0.1716, + "step": 13747 + }, + { + "epoch": 3.6583288983501863, + "grad_norm": 0.3353871703147888, + "learning_rate": 3.4128399649924924e-08, + "loss": 0.1762, + "step": 13748 + }, + { + "epoch": 3.65859499733901, + "grad_norm": 0.2899147868156433, + "learning_rate": 3.4115693847382954e-08, + "loss": 0.1831, + "step": 13749 + }, + { + "epoch": 3.658861096327834, + "grad_norm": 0.26573896408081055, + "learning_rate": 3.410298992399524e-08, + "loss": 0.1598, + "step": 13750 + }, + { + "epoch": 3.6591271953166578, + "grad_norm": 0.36432623863220215, + "learning_rate": 3.4090287880124213e-08, + "loss": 0.1844, + "step": 13751 + }, + { + "epoch": 3.659393294305482, + "grad_norm": 0.3609091341495514, + "learning_rate": 3.407758771613206e-08, + "loss": 0.1671, + "step": 13752 + }, + { + "epoch": 3.6596593932943056, + "grad_norm": 0.27579236030578613, + "learning_rate": 3.406488943238107e-08, + "loss": 0.1759, + "step": 13753 + }, + { + "epoch": 3.6599254922831292, + "grad_norm": 0.26815271377563477, + "learning_rate": 3.4052193029233434e-08, + "loss": 0.1752, + "step": 13754 + }, + { + "epoch": 3.6601915912719534, + "grad_norm": 0.3063284158706665, + "learning_rate": 3.403949850705129e-08, + "loss": 0.1791, + "step": 13755 + }, + { + "epoch": 3.660457690260777, + "grad_norm": 0.31935495138168335, + "learning_rate": 3.402680586619665e-08, + "loss": 0.1658, + "step": 13756 + }, + { + "epoch": 3.6607237892496007, + "grad_norm": 0.2713600993156433, + "learning_rate": 3.40141151070316e-08, + "loss": 0.1763, + "step": 13757 + }, + { + "epoch": 3.660989888238425, + "grad_norm": 0.267673522233963, + "learning_rate": 3.400142622991805e-08, + "loss": 0.1585, + "step": 13758 + }, + { + "epoch": 3.6612559872272485, + "grad_norm": 0.4040195643901825, + "learning_rate": 3.398873923521798e-08, + "loss": 0.1953, + "step": 13759 + }, + { + "epoch": 3.6615220862160722, + "grad_norm": 0.2972826063632965, + "learning_rate": 3.3976054123293126e-08, + "loss": 0.1736, + "step": 13760 + }, + { + "epoch": 3.6617881852048964, + "grad_norm": 0.27523481845855713, + "learning_rate": 3.396337089450545e-08, + "loss": 0.1624, + "step": 13761 + }, + { + "epoch": 3.66205428419372, + "grad_norm": 0.3278743326663971, + "learning_rate": 3.3950689549216594e-08, + "loss": 0.1752, + "step": 13762 + }, + { + "epoch": 3.6623203831825437, + "grad_norm": 0.3467637300491333, + "learning_rate": 3.393801008778833e-08, + "loss": 0.1849, + "step": 13763 + }, + { + "epoch": 3.662586482171368, + "grad_norm": 0.4776829183101654, + "learning_rate": 3.392533251058222e-08, + "loss": 0.1805, + "step": 13764 + }, + { + "epoch": 3.6628525811601915, + "grad_norm": 0.42646175622940063, + "learning_rate": 3.391265681795994e-08, + "loss": 0.1867, + "step": 13765 + }, + { + "epoch": 3.663118680149015, + "grad_norm": 0.2555093467235565, + "learning_rate": 3.389998301028294e-08, + "loss": 0.1593, + "step": 13766 + }, + { + "epoch": 3.6633847791378393, + "grad_norm": 0.2588844299316406, + "learning_rate": 3.388731108791275e-08, + "loss": 0.1714, + "step": 13767 + }, + { + "epoch": 3.663650878126663, + "grad_norm": 0.28139445185661316, + "learning_rate": 3.3874641051210797e-08, + "loss": 0.1713, + "step": 13768 + }, + { + "epoch": 3.6639169771154867, + "grad_norm": 0.2703031599521637, + "learning_rate": 3.3861972900538464e-08, + "loss": 0.176, + "step": 13769 + }, + { + "epoch": 3.664183076104311, + "grad_norm": 0.26937299966812134, + "learning_rate": 3.384930663625708e-08, + "loss": 0.1716, + "step": 13770 + }, + { + "epoch": 3.6644491750931345, + "grad_norm": 0.2837250530719757, + "learning_rate": 3.383664225872787e-08, + "loss": 0.1787, + "step": 13771 + }, + { + "epoch": 3.6647152740819586, + "grad_norm": 0.2565128803253174, + "learning_rate": 3.38239797683121e-08, + "loss": 0.1728, + "step": 13772 + }, + { + "epoch": 3.6649813730707823, + "grad_norm": 0.247393399477005, + "learning_rate": 3.381131916537088e-08, + "loss": 0.1537, + "step": 13773 + }, + { + "epoch": 3.6652474720596064, + "grad_norm": 0.40764153003692627, + "learning_rate": 3.3798660450265325e-08, + "loss": 0.1883, + "step": 13774 + }, + { + "epoch": 3.66551357104843, + "grad_norm": 0.2691659927368164, + "learning_rate": 3.378600362335652e-08, + "loss": 0.1714, + "step": 13775 + }, + { + "epoch": 3.665779670037254, + "grad_norm": 0.2775067389011383, + "learning_rate": 3.377334868500546e-08, + "loss": 0.1627, + "step": 13776 + }, + { + "epoch": 3.666045769026078, + "grad_norm": 0.30200517177581787, + "learning_rate": 3.376069563557306e-08, + "loss": 0.193, + "step": 13777 + }, + { + "epoch": 3.6663118680149016, + "grad_norm": 0.28882896900177, + "learning_rate": 3.374804447542024e-08, + "loss": 0.1851, + "step": 13778 + }, + { + "epoch": 3.6665779670037253, + "grad_norm": 0.26883506774902344, + "learning_rate": 3.373539520490779e-08, + "loss": 0.1622, + "step": 13779 + }, + { + "epoch": 3.6668440659925494, + "grad_norm": 0.2899092435836792, + "learning_rate": 3.3722747824396566e-08, + "loss": 0.1781, + "step": 13780 + }, + { + "epoch": 3.667110164981373, + "grad_norm": 0.3057938814163208, + "learning_rate": 3.371010233424722e-08, + "loss": 0.1929, + "step": 13781 + }, + { + "epoch": 3.667376263970197, + "grad_norm": 0.2837509512901306, + "learning_rate": 3.369745873482047e-08, + "loss": 0.1706, + "step": 13782 + }, + { + "epoch": 3.667642362959021, + "grad_norm": 0.38243725895881653, + "learning_rate": 3.368481702647693e-08, + "loss": 0.1828, + "step": 13783 + }, + { + "epoch": 3.6679084619478446, + "grad_norm": 0.2909061014652252, + "learning_rate": 3.367217720957719e-08, + "loss": 0.1756, + "step": 13784 + }, + { + "epoch": 3.6681745609366683, + "grad_norm": 0.2640603482723236, + "learning_rate": 3.3659539284481707e-08, + "loss": 0.1683, + "step": 13785 + }, + { + "epoch": 3.6684406599254924, + "grad_norm": 0.2627769410610199, + "learning_rate": 3.364690325155099e-08, + "loss": 0.1743, + "step": 13786 + }, + { + "epoch": 3.668706758914316, + "grad_norm": 0.2752569615840912, + "learning_rate": 3.363426911114544e-08, + "loss": 0.1646, + "step": 13787 + }, + { + "epoch": 3.6689728579031398, + "grad_norm": 0.3580959141254425, + "learning_rate": 3.362163686362538e-08, + "loss": 0.1843, + "step": 13788 + }, + { + "epoch": 3.669238956891964, + "grad_norm": 0.2789396345615387, + "learning_rate": 3.3609006509351115e-08, + "loss": 0.1909, + "step": 13789 + }, + { + "epoch": 3.6695050558807876, + "grad_norm": 0.341457724571228, + "learning_rate": 3.359637804868291e-08, + "loss": 0.1618, + "step": 13790 + }, + { + "epoch": 3.6697711548696113, + "grad_norm": 0.3147028684616089, + "learning_rate": 3.358375148198096e-08, + "loss": 0.1653, + "step": 13791 + }, + { + "epoch": 3.6700372538584354, + "grad_norm": 0.2930544912815094, + "learning_rate": 3.357112680960536e-08, + "loss": 0.157, + "step": 13792 + }, + { + "epoch": 3.670303352847259, + "grad_norm": 0.28020933270454407, + "learning_rate": 3.355850403191624e-08, + "loss": 0.1757, + "step": 13793 + }, + { + "epoch": 3.670569451836083, + "grad_norm": 0.2779923379421234, + "learning_rate": 3.3545883149273556e-08, + "loss": 0.1861, + "step": 13794 + }, + { + "epoch": 3.670835550824907, + "grad_norm": 0.28190791606903076, + "learning_rate": 3.353326416203738e-08, + "loss": 0.1835, + "step": 13795 + }, + { + "epoch": 3.6711016498137305, + "grad_norm": 0.2963780462741852, + "learning_rate": 3.352064707056753e-08, + "loss": 0.1801, + "step": 13796 + }, + { + "epoch": 3.6713677488025547, + "grad_norm": 0.3065395653247833, + "learning_rate": 3.3508031875223916e-08, + "loss": 0.2056, + "step": 13797 + }, + { + "epoch": 3.6716338477913784, + "grad_norm": 0.3641488552093506, + "learning_rate": 3.3495418576366366e-08, + "loss": 0.183, + "step": 13798 + }, + { + "epoch": 3.6718999467802025, + "grad_norm": 0.3641057312488556, + "learning_rate": 3.348280717435464e-08, + "loss": 0.1742, + "step": 13799 + }, + { + "epoch": 3.672166045769026, + "grad_norm": 0.28009650111198425, + "learning_rate": 3.3470197669548404e-08, + "loss": 0.1932, + "step": 13800 + }, + { + "epoch": 3.67243214475785, + "grad_norm": 0.3715248107910156, + "learning_rate": 3.345759006230735e-08, + "loss": 0.1755, + "step": 13801 + }, + { + "epoch": 3.672698243746674, + "grad_norm": 0.4498650133609772, + "learning_rate": 3.3444984352991016e-08, + "loss": 0.1925, + "step": 13802 + }, + { + "epoch": 3.6729643427354977, + "grad_norm": 0.26576337218284607, + "learning_rate": 3.343238054195897e-08, + "loss": 0.1629, + "step": 13803 + }, + { + "epoch": 3.6732304417243213, + "grad_norm": 0.3762500584125519, + "learning_rate": 3.3419778629570704e-08, + "loss": 0.189, + "step": 13804 + }, + { + "epoch": 3.6734965407131455, + "grad_norm": 0.2843187749385834, + "learning_rate": 3.340717861618566e-08, + "loss": 0.1688, + "step": 13805 + }, + { + "epoch": 3.673762639701969, + "grad_norm": 0.35568636655807495, + "learning_rate": 3.3394580502163225e-08, + "loss": 0.165, + "step": 13806 + }, + { + "epoch": 3.674028738690793, + "grad_norm": 0.2789321541786194, + "learning_rate": 3.3381984287862674e-08, + "loss": 0.178, + "step": 13807 + }, + { + "epoch": 3.674294837679617, + "grad_norm": 0.35812318325042725, + "learning_rate": 3.336938997364335e-08, + "loss": 0.1802, + "step": 13808 + }, + { + "epoch": 3.6745609366684406, + "grad_norm": 0.27977144718170166, + "learning_rate": 3.335679755986437e-08, + "loss": 0.19, + "step": 13809 + }, + { + "epoch": 3.6748270356572643, + "grad_norm": 0.2750104069709778, + "learning_rate": 3.3344207046884995e-08, + "loss": 0.1715, + "step": 13810 + }, + { + "epoch": 3.6750931346460884, + "grad_norm": 0.634286105632782, + "learning_rate": 3.333161843506423e-08, + "loss": 0.1667, + "step": 13811 + }, + { + "epoch": 3.675359233634912, + "grad_norm": 0.263297438621521, + "learning_rate": 3.331903172476123e-08, + "loss": 0.1632, + "step": 13812 + }, + { + "epoch": 3.675625332623736, + "grad_norm": 0.2757165729999542, + "learning_rate": 3.330644691633492e-08, + "loss": 0.175, + "step": 13813 + }, + { + "epoch": 3.67589143161256, + "grad_norm": 0.3211047649383545, + "learning_rate": 3.32938640101443e-08, + "loss": 0.1855, + "step": 13814 + }, + { + "epoch": 3.6761575306013836, + "grad_norm": 0.272495299577713, + "learning_rate": 3.328128300654821e-08, + "loss": 0.1706, + "step": 13815 + }, + { + "epoch": 3.6764236295902073, + "grad_norm": 0.2641107141971588, + "learning_rate": 3.3268703905905535e-08, + "loss": 0.1538, + "step": 13816 + }, + { + "epoch": 3.6766897285790314, + "grad_norm": 0.3199375867843628, + "learning_rate": 3.3256126708574993e-08, + "loss": 0.1679, + "step": 13817 + }, + { + "epoch": 3.676955827567855, + "grad_norm": 0.2947462201118469, + "learning_rate": 3.324355141491536e-08, + "loss": 0.1763, + "step": 13818 + }, + { + "epoch": 3.6772219265566792, + "grad_norm": 0.29866650700569153, + "learning_rate": 3.3230978025285285e-08, + "loss": 0.1945, + "step": 13819 + }, + { + "epoch": 3.677488025545503, + "grad_norm": 0.2460760623216629, + "learning_rate": 3.321840654004343e-08, + "loss": 0.1575, + "step": 13820 + }, + { + "epoch": 3.677754124534327, + "grad_norm": 2.9134905338287354, + "learning_rate": 3.32058369595483e-08, + "loss": 0.1792, + "step": 13821 + }, + { + "epoch": 3.6780202235231507, + "grad_norm": 0.28315097093582153, + "learning_rate": 3.319326928415843e-08, + "loss": 0.1678, + "step": 13822 + }, + { + "epoch": 3.6782863225119744, + "grad_norm": 0.2739027142524719, + "learning_rate": 3.318070351423231e-08, + "loss": 0.148, + "step": 13823 + }, + { + "epoch": 3.6785524215007985, + "grad_norm": 0.2830907702445984, + "learning_rate": 3.316813965012828e-08, + "loss": 0.1793, + "step": 13824 + }, + { + "epoch": 3.678818520489622, + "grad_norm": 0.3291592001914978, + "learning_rate": 3.3155577692204705e-08, + "loss": 0.1642, + "step": 13825 + }, + { + "epoch": 3.679084619478446, + "grad_norm": 0.3547549247741699, + "learning_rate": 3.31430176408199e-08, + "loss": 0.1793, + "step": 13826 + }, + { + "epoch": 3.67935071846727, + "grad_norm": 0.4083201587200165, + "learning_rate": 3.313045949633212e-08, + "loss": 0.1762, + "step": 13827 + }, + { + "epoch": 3.6796168174560937, + "grad_norm": 0.28583282232284546, + "learning_rate": 3.311790325909949e-08, + "loss": 0.1778, + "step": 13828 + }, + { + "epoch": 3.6798829164449174, + "grad_norm": 0.27501046657562256, + "learning_rate": 3.3105348929480206e-08, + "loss": 0.1715, + "step": 13829 + }, + { + "epoch": 3.6801490154337415, + "grad_norm": 0.3227713406085968, + "learning_rate": 3.309279650783226e-08, + "loss": 0.1837, + "step": 13830 + }, + { + "epoch": 3.680415114422565, + "grad_norm": 0.35254016518592834, + "learning_rate": 3.308024599451377e-08, + "loss": 0.1636, + "step": 13831 + }, + { + "epoch": 3.680681213411389, + "grad_norm": 0.3331749439239502, + "learning_rate": 3.306769738988261e-08, + "loss": 0.1765, + "step": 13832 + }, + { + "epoch": 3.680947312400213, + "grad_norm": 0.347610741853714, + "learning_rate": 3.305515069429673e-08, + "loss": 0.1841, + "step": 13833 + }, + { + "epoch": 3.6812134113890367, + "grad_norm": 0.2622349262237549, + "learning_rate": 3.3042605908114e-08, + "loss": 0.1618, + "step": 13834 + }, + { + "epoch": 3.6814795103778604, + "grad_norm": 0.26923084259033203, + "learning_rate": 3.3030063031692236e-08, + "loss": 0.168, + "step": 13835 + }, + { + "epoch": 3.6817456093666845, + "grad_norm": 0.312393456697464, + "learning_rate": 3.3017522065389135e-08, + "loss": 0.1712, + "step": 13836 + }, + { + "epoch": 3.682011708355508, + "grad_norm": 0.29102957248687744, + "learning_rate": 3.300498300956246e-08, + "loss": 0.1752, + "step": 13837 + }, + { + "epoch": 3.682277807344332, + "grad_norm": 0.27498945593833923, + "learning_rate": 3.299244586456976e-08, + "loss": 0.1788, + "step": 13838 + }, + { + "epoch": 3.682543906333156, + "grad_norm": 0.3181329071521759, + "learning_rate": 3.297991063076868e-08, + "loss": 0.1836, + "step": 13839 + }, + { + "epoch": 3.6828100053219797, + "grad_norm": 0.2779395580291748, + "learning_rate": 3.296737730851674e-08, + "loss": 0.1726, + "step": 13840 + }, + { + "epoch": 3.6830761043108033, + "grad_norm": 0.26474064588546753, + "learning_rate": 3.295484589817142e-08, + "loss": 0.1649, + "step": 13841 + }, + { + "epoch": 3.6833422032996275, + "grad_norm": 0.2665461003780365, + "learning_rate": 3.294231640009016e-08, + "loss": 0.1586, + "step": 13842 + }, + { + "epoch": 3.683608302288451, + "grad_norm": 0.4304017126560211, + "learning_rate": 3.292978881463029e-08, + "loss": 0.188, + "step": 13843 + }, + { + "epoch": 3.6838744012772753, + "grad_norm": 0.31095194816589355, + "learning_rate": 3.291726314214915e-08, + "loss": 0.1722, + "step": 13844 + }, + { + "epoch": 3.684140500266099, + "grad_norm": 0.2905414402484894, + "learning_rate": 3.290473938300397e-08, + "loss": 0.1773, + "step": 13845 + }, + { + "epoch": 3.684406599254923, + "grad_norm": 0.47161537408828735, + "learning_rate": 3.289221753755198e-08, + "loss": 0.1751, + "step": 13846 + }, + { + "epoch": 3.6846726982437468, + "grad_norm": 0.3218732476234436, + "learning_rate": 3.287969760615029e-08, + "loss": 0.1746, + "step": 13847 + }, + { + "epoch": 3.6849387972325705, + "grad_norm": 0.4576047658920288, + "learning_rate": 3.2867179589156016e-08, + "loss": 0.1878, + "step": 13848 + }, + { + "epoch": 3.6852048962213946, + "grad_norm": 0.28248971700668335, + "learning_rate": 3.28546634869262e-08, + "loss": 0.1849, + "step": 13849 + }, + { + "epoch": 3.6854709952102183, + "grad_norm": 0.2888832688331604, + "learning_rate": 3.284214929981787e-08, + "loss": 0.1792, + "step": 13850 + }, + { + "epoch": 3.685737094199042, + "grad_norm": 0.2626356780529022, + "learning_rate": 3.282963702818787e-08, + "loss": 0.1713, + "step": 13851 + }, + { + "epoch": 3.686003193187866, + "grad_norm": 0.272167444229126, + "learning_rate": 3.281712667239316e-08, + "loss": 0.1609, + "step": 13852 + }, + { + "epoch": 3.6862692921766897, + "grad_norm": 0.2632869780063629, + "learning_rate": 3.280461823279048e-08, + "loss": 0.1706, + "step": 13853 + }, + { + "epoch": 3.6865353911655134, + "grad_norm": 0.2771463394165039, + "learning_rate": 3.2792111709736635e-08, + "loss": 0.1749, + "step": 13854 + }, + { + "epoch": 3.6868014901543376, + "grad_norm": 0.33462053537368774, + "learning_rate": 3.277960710358835e-08, + "loss": 0.1565, + "step": 13855 + }, + { + "epoch": 3.6870675891431612, + "grad_norm": 0.2985271215438843, + "learning_rate": 3.276710441470229e-08, + "loss": 0.1847, + "step": 13856 + }, + { + "epoch": 3.687333688131985, + "grad_norm": 0.5194379091262817, + "learning_rate": 3.2754603643435e-08, + "loss": 0.2074, + "step": 13857 + }, + { + "epoch": 3.687599787120809, + "grad_norm": 0.3395881652832031, + "learning_rate": 3.274210479014307e-08, + "loss": 0.1886, + "step": 13858 + }, + { + "epoch": 3.6878658861096327, + "grad_norm": 0.44379669427871704, + "learning_rate": 3.272960785518302e-08, + "loss": 0.1904, + "step": 13859 + }, + { + "epoch": 3.6881319850984564, + "grad_norm": 0.4854632318019867, + "learning_rate": 3.271711283891121e-08, + "loss": 0.1663, + "step": 13860 + }, + { + "epoch": 3.6883980840872805, + "grad_norm": 0.3689243495464325, + "learning_rate": 3.270461974168411e-08, + "loss": 0.1864, + "step": 13861 + }, + { + "epoch": 3.688664183076104, + "grad_norm": 0.44890129566192627, + "learning_rate": 3.2692128563857935e-08, + "loss": 0.1804, + "step": 13862 + }, + { + "epoch": 3.688930282064928, + "grad_norm": 0.2741225063800812, + "learning_rate": 3.26796393057891e-08, + "loss": 0.168, + "step": 13863 + }, + { + "epoch": 3.689196381053752, + "grad_norm": 0.39640745520591736, + "learning_rate": 3.266715196783372e-08, + "loss": 0.1893, + "step": 13864 + }, + { + "epoch": 3.6894624800425757, + "grad_norm": 0.27826738357543945, + "learning_rate": 3.265466655034802e-08, + "loss": 0.1717, + "step": 13865 + }, + { + "epoch": 3.6897285790314, + "grad_norm": 0.30474376678466797, + "learning_rate": 3.2642183053688056e-08, + "loss": 0.2016, + "step": 13866 + }, + { + "epoch": 3.6899946780202235, + "grad_norm": 0.34083691239356995, + "learning_rate": 3.2629701478209936e-08, + "loss": 0.1742, + "step": 13867 + }, + { + "epoch": 3.690260777009047, + "grad_norm": 0.38052651286125183, + "learning_rate": 3.26172218242696e-08, + "loss": 0.1887, + "step": 13868 + }, + { + "epoch": 3.6905268759978713, + "grad_norm": 0.41531795263290405, + "learning_rate": 3.260474409222303e-08, + "loss": 0.1752, + "step": 13869 + }, + { + "epoch": 3.690792974986695, + "grad_norm": 0.2531210780143738, + "learning_rate": 3.2592268282426114e-08, + "loss": 0.1595, + "step": 13870 + }, + { + "epoch": 3.691059073975519, + "grad_norm": 0.31007662415504456, + "learning_rate": 3.257979439523472e-08, + "loss": 0.1804, + "step": 13871 + }, + { + "epoch": 3.691325172964343, + "grad_norm": 0.361001580953598, + "learning_rate": 3.256732243100456e-08, + "loss": 0.2002, + "step": 13872 + }, + { + "epoch": 3.6915912719531665, + "grad_norm": 0.4049316346645355, + "learning_rate": 3.2554852390091424e-08, + "loss": 0.193, + "step": 13873 + }, + { + "epoch": 3.6918573709419906, + "grad_norm": 0.40750354528427124, + "learning_rate": 3.254238427285092e-08, + "loss": 0.1751, + "step": 13874 + }, + { + "epoch": 3.6921234699308143, + "grad_norm": 0.44813835620880127, + "learning_rate": 3.2529918079638694e-08, + "loss": 0.1785, + "step": 13875 + }, + { + "epoch": 3.692389568919638, + "grad_norm": 0.28418123722076416, + "learning_rate": 3.251745381081031e-08, + "loss": 0.1673, + "step": 13876 + }, + { + "epoch": 3.692655667908462, + "grad_norm": 0.2918519079685211, + "learning_rate": 3.250499146672129e-08, + "loss": 0.1728, + "step": 13877 + }, + { + "epoch": 3.692921766897286, + "grad_norm": 0.27148425579071045, + "learning_rate": 3.249253104772708e-08, + "loss": 0.1766, + "step": 13878 + }, + { + "epoch": 3.6931878658861095, + "grad_norm": 0.3417574465274811, + "learning_rate": 3.2480072554183045e-08, + "loss": 0.1768, + "step": 13879 + }, + { + "epoch": 3.6934539648749336, + "grad_norm": 0.27922987937927246, + "learning_rate": 3.246761598644459e-08, + "loss": 0.1763, + "step": 13880 + }, + { + "epoch": 3.6937200638637573, + "grad_norm": 0.3857099711894989, + "learning_rate": 3.2455161344866914e-08, + "loss": 0.1857, + "step": 13881 + }, + { + "epoch": 3.693986162852581, + "grad_norm": 0.28550222516059875, + "learning_rate": 3.244270862980533e-08, + "loss": 0.1669, + "step": 13882 + }, + { + "epoch": 3.694252261841405, + "grad_norm": 0.3646811842918396, + "learning_rate": 3.243025784161494e-08, + "loss": 0.1868, + "step": 13883 + }, + { + "epoch": 3.6945183608302288, + "grad_norm": 0.3512483537197113, + "learning_rate": 3.241780898065091e-08, + "loss": 0.1744, + "step": 13884 + }, + { + "epoch": 3.6947844598190525, + "grad_norm": 0.4235031306743622, + "learning_rate": 3.2405362047268304e-08, + "loss": 0.1909, + "step": 13885 + }, + { + "epoch": 3.6950505588078766, + "grad_norm": 0.3444186747074127, + "learning_rate": 3.2392917041822153e-08, + "loss": 0.1857, + "step": 13886 + }, + { + "epoch": 3.6953166577967003, + "grad_norm": 0.36434441804885864, + "learning_rate": 3.238047396466738e-08, + "loss": 0.1828, + "step": 13887 + }, + { + "epoch": 3.695582756785524, + "grad_norm": 0.2798294126987457, + "learning_rate": 3.2368032816158907e-08, + "loss": 0.1778, + "step": 13888 + }, + { + "epoch": 3.695848855774348, + "grad_norm": 0.41941243410110474, + "learning_rate": 3.235559359665155e-08, + "loss": 0.2014, + "step": 13889 + }, + { + "epoch": 3.6961149547631718, + "grad_norm": 0.3842962682247162, + "learning_rate": 3.234315630650013e-08, + "loss": 0.1766, + "step": 13890 + }, + { + "epoch": 3.696381053751996, + "grad_norm": 0.2905454635620117, + "learning_rate": 3.2330720946059374e-08, + "loss": 0.1581, + "step": 13891 + }, + { + "epoch": 3.6966471527408196, + "grad_norm": 0.2811187207698822, + "learning_rate": 3.231828751568401e-08, + "loss": 0.1784, + "step": 13892 + }, + { + "epoch": 3.6969132517296437, + "grad_norm": 0.293364942073822, + "learning_rate": 3.2305856015728573e-08, + "loss": 0.1923, + "step": 13893 + }, + { + "epoch": 3.6971793507184674, + "grad_norm": 0.29454901814460754, + "learning_rate": 3.229342644654769e-08, + "loss": 0.1913, + "step": 13894 + }, + { + "epoch": 3.697445449707291, + "grad_norm": 0.26234716176986694, + "learning_rate": 3.2280998808495895e-08, + "loss": 0.1801, + "step": 13895 + }, + { + "epoch": 3.697711548696115, + "grad_norm": 0.3917371332645416, + "learning_rate": 3.226857310192761e-08, + "loss": 0.1973, + "step": 13896 + }, + { + "epoch": 3.697977647684939, + "grad_norm": 0.34396690130233765, + "learning_rate": 3.225614932719728e-08, + "loss": 0.1777, + "step": 13897 + }, + { + "epoch": 3.6982437466737625, + "grad_norm": 0.2997024953365326, + "learning_rate": 3.22437274846592e-08, + "loss": 0.1875, + "step": 13898 + }, + { + "epoch": 3.6985098456625867, + "grad_norm": 0.3851800560951233, + "learning_rate": 3.22313075746677e-08, + "loss": 0.1957, + "step": 13899 + }, + { + "epoch": 3.6987759446514104, + "grad_norm": 0.29333582520484924, + "learning_rate": 3.221888959757701e-08, + "loss": 0.1821, + "step": 13900 + }, + { + "epoch": 3.699042043640234, + "grad_norm": 0.27266550064086914, + "learning_rate": 3.2206473553741376e-08, + "loss": 0.1605, + "step": 13901 + }, + { + "epoch": 3.699308142629058, + "grad_norm": 0.28896787762641907, + "learning_rate": 3.219405944351484e-08, + "loss": 0.1684, + "step": 13902 + }, + { + "epoch": 3.699574241617882, + "grad_norm": 0.28215768933296204, + "learning_rate": 3.2181647267251554e-08, + "loss": 0.1765, + "step": 13903 + }, + { + "epoch": 3.6998403406067055, + "grad_norm": 0.36691445112228394, + "learning_rate": 3.216923702530546e-08, + "loss": 0.1947, + "step": 13904 + }, + { + "epoch": 3.7001064395955297, + "grad_norm": 0.36381796002388, + "learning_rate": 3.215682871803057e-08, + "loss": 0.1684, + "step": 13905 + }, + { + "epoch": 3.7003725385843533, + "grad_norm": 0.38337430357933044, + "learning_rate": 3.2144422345780795e-08, + "loss": 0.1973, + "step": 13906 + }, + { + "epoch": 3.700638637573177, + "grad_norm": 0.28088465332984924, + "learning_rate": 3.2132017908910016e-08, + "loss": 0.1591, + "step": 13907 + }, + { + "epoch": 3.700904736562001, + "grad_norm": 0.2924879491329193, + "learning_rate": 3.211961540777196e-08, + "loss": 0.1866, + "step": 13908 + }, + { + "epoch": 3.701170835550825, + "grad_norm": 0.2718762457370758, + "learning_rate": 3.2107214842720444e-08, + "loss": 0.1849, + "step": 13909 + }, + { + "epoch": 3.7014369345396485, + "grad_norm": 0.35653385519981384, + "learning_rate": 3.20948162141091e-08, + "loss": 0.1879, + "step": 13910 + }, + { + "epoch": 3.7017030335284726, + "grad_norm": 0.28878143429756165, + "learning_rate": 3.208241952229161e-08, + "loss": 0.1786, + "step": 13911 + }, + { + "epoch": 3.7019691325172963, + "grad_norm": 0.28683745861053467, + "learning_rate": 3.2070024767621474e-08, + "loss": 0.171, + "step": 13912 + }, + { + "epoch": 3.7022352315061204, + "grad_norm": 0.34216272830963135, + "learning_rate": 3.20576319504523e-08, + "loss": 0.1783, + "step": 13913 + }, + { + "epoch": 3.702501330494944, + "grad_norm": 0.2896292805671692, + "learning_rate": 3.204524107113756e-08, + "loss": 0.1775, + "step": 13914 + }, + { + "epoch": 3.702767429483768, + "grad_norm": 0.2589924931526184, + "learning_rate": 3.203285213003061e-08, + "loss": 0.1774, + "step": 13915 + }, + { + "epoch": 3.703033528472592, + "grad_norm": 0.2771718204021454, + "learning_rate": 3.202046512748486e-08, + "loss": 0.1719, + "step": 13916 + }, + { + "epoch": 3.7032996274614156, + "grad_norm": 0.311541348695755, + "learning_rate": 3.200808006385355e-08, + "loss": 0.1822, + "step": 13917 + }, + { + "epoch": 3.7035657264502397, + "grad_norm": 0.2779146730899811, + "learning_rate": 3.199569693949e-08, + "loss": 0.1876, + "step": 13918 + }, + { + "epoch": 3.7038318254390634, + "grad_norm": 0.32909440994262695, + "learning_rate": 3.198331575474733e-08, + "loss": 0.1876, + "step": 13919 + }, + { + "epoch": 3.704097924427887, + "grad_norm": 0.38363543152809143, + "learning_rate": 3.197093650997872e-08, + "loss": 0.192, + "step": 13920 + }, + { + "epoch": 3.7043640234167112, + "grad_norm": 0.3746669292449951, + "learning_rate": 3.195855920553723e-08, + "loss": 0.1943, + "step": 13921 + }, + { + "epoch": 3.704630122405535, + "grad_norm": 0.28408968448638916, + "learning_rate": 3.194618384177593e-08, + "loss": 0.1924, + "step": 13922 + }, + { + "epoch": 3.7048962213943586, + "grad_norm": 0.27743789553642273, + "learning_rate": 3.193381041904774e-08, + "loss": 0.185, + "step": 13923 + }, + { + "epoch": 3.7051623203831827, + "grad_norm": 0.28928282856941223, + "learning_rate": 3.192143893770561e-08, + "loss": 0.1913, + "step": 13924 + }, + { + "epoch": 3.7054284193720064, + "grad_norm": 0.2673056423664093, + "learning_rate": 3.190906939810236e-08, + "loss": 0.1756, + "step": 13925 + }, + { + "epoch": 3.70569451836083, + "grad_norm": 0.3205457031726837, + "learning_rate": 3.189670180059082e-08, + "loss": 0.1865, + "step": 13926 + }, + { + "epoch": 3.705960617349654, + "grad_norm": 0.2800562083721161, + "learning_rate": 3.188433614552374e-08, + "loss": 0.1735, + "step": 13927 + }, + { + "epoch": 3.706226716338478, + "grad_norm": 0.3946051299571991, + "learning_rate": 3.187197243325383e-08, + "loss": 0.1787, + "step": 13928 + }, + { + "epoch": 3.7064928153273016, + "grad_norm": 0.3755740821361542, + "learning_rate": 3.1859610664133696e-08, + "loss": 0.1873, + "step": 13929 + }, + { + "epoch": 3.7067589143161257, + "grad_norm": 0.2525579333305359, + "learning_rate": 3.1847250838515946e-08, + "loss": 0.17, + "step": 13930 + }, + { + "epoch": 3.7070250133049494, + "grad_norm": 0.3021468222141266, + "learning_rate": 3.183489295675308e-08, + "loss": 0.1844, + "step": 13931 + }, + { + "epoch": 3.707291112293773, + "grad_norm": 0.27220940589904785, + "learning_rate": 3.182253701919757e-08, + "loss": 0.1608, + "step": 13932 + }, + { + "epoch": 3.707557211282597, + "grad_norm": 0.2852937579154968, + "learning_rate": 3.1810183026201886e-08, + "loss": 0.1784, + "step": 13933 + }, + { + "epoch": 3.707823310271421, + "grad_norm": 0.268601655960083, + "learning_rate": 3.179783097811832e-08, + "loss": 0.1767, + "step": 13934 + }, + { + "epoch": 3.7080894092602446, + "grad_norm": 0.3480006456375122, + "learning_rate": 3.178548087529921e-08, + "loss": 0.1934, + "step": 13935 + }, + { + "epoch": 3.7083555082490687, + "grad_norm": 0.2589929699897766, + "learning_rate": 3.1773132718096796e-08, + "loss": 0.16, + "step": 13936 + }, + { + "epoch": 3.7086216072378924, + "grad_norm": 0.3372456431388855, + "learning_rate": 3.1760786506863325e-08, + "loss": 0.1796, + "step": 13937 + }, + { + "epoch": 3.7088877062267165, + "grad_norm": 0.29310178756713867, + "learning_rate": 3.174844224195085e-08, + "loss": 0.1711, + "step": 13938 + }, + { + "epoch": 3.70915380521554, + "grad_norm": 0.2717466354370117, + "learning_rate": 3.173609992371152e-08, + "loss": 0.1779, + "step": 13939 + }, + { + "epoch": 3.7094199042043643, + "grad_norm": 0.3647846281528473, + "learning_rate": 3.172375955249732e-08, + "loss": 0.1709, + "step": 13940 + }, + { + "epoch": 3.709686003193188, + "grad_norm": 0.35910168290138245, + "learning_rate": 3.1711421128660245e-08, + "loss": 0.184, + "step": 13941 + }, + { + "epoch": 3.7099521021820117, + "grad_norm": 0.286478191614151, + "learning_rate": 3.16990846525522e-08, + "loss": 0.1771, + "step": 13942 + }, + { + "epoch": 3.710218201170836, + "grad_norm": 0.28555941581726074, + "learning_rate": 3.1686750124525084e-08, + "loss": 0.1757, + "step": 13943 + }, + { + "epoch": 3.7104843001596595, + "grad_norm": 0.2665148377418518, + "learning_rate": 3.167441754493065e-08, + "loss": 0.1811, + "step": 13944 + }, + { + "epoch": 3.710750399148483, + "grad_norm": 0.3363332450389862, + "learning_rate": 3.16620869141207e-08, + "loss": 0.1795, + "step": 13945 + }, + { + "epoch": 3.7110164981373073, + "grad_norm": 0.29658225178718567, + "learning_rate": 3.164975823244687e-08, + "loss": 0.1694, + "step": 13946 + }, + { + "epoch": 3.711282597126131, + "grad_norm": 0.28375324606895447, + "learning_rate": 3.163743150026086e-08, + "loss": 0.1583, + "step": 13947 + }, + { + "epoch": 3.7115486961149546, + "grad_norm": 0.3057458996772766, + "learning_rate": 3.16251067179142e-08, + "loss": 0.1682, + "step": 13948 + }, + { + "epoch": 3.7118147951037788, + "grad_norm": 0.4457295835018158, + "learning_rate": 3.161278388575842e-08, + "loss": 0.1949, + "step": 13949 + }, + { + "epoch": 3.7120808940926024, + "grad_norm": 0.2943388521671295, + "learning_rate": 3.1600463004145026e-08, + "loss": 0.1605, + "step": 13950 + }, + { + "epoch": 3.712346993081426, + "grad_norm": 0.27495595812797546, + "learning_rate": 3.1588144073425426e-08, + "loss": 0.1734, + "step": 13951 + }, + { + "epoch": 3.7126130920702503, + "grad_norm": 0.29135745763778687, + "learning_rate": 3.1575827093951e-08, + "loss": 0.1659, + "step": 13952 + }, + { + "epoch": 3.712879191059074, + "grad_norm": 0.2726542055606842, + "learning_rate": 3.1563512066073e-08, + "loss": 0.1747, + "step": 13953 + }, + { + "epoch": 3.7131452900478976, + "grad_norm": 0.32778769731521606, + "learning_rate": 3.1551198990142736e-08, + "loss": 0.1861, + "step": 13954 + }, + { + "epoch": 3.7134113890367217, + "grad_norm": 0.283141165971756, + "learning_rate": 3.1538887866511346e-08, + "loss": 0.1767, + "step": 13955 + }, + { + "epoch": 3.7136774880255454, + "grad_norm": 0.2741638123989105, + "learning_rate": 3.1526578695529993e-08, + "loss": 0.1805, + "step": 13956 + }, + { + "epoch": 3.713943587014369, + "grad_norm": 0.27937427163124084, + "learning_rate": 3.151427147754975e-08, + "loss": 0.1836, + "step": 13957 + }, + { + "epoch": 3.7142096860031932, + "grad_norm": 0.35787904262542725, + "learning_rate": 3.150196621292169e-08, + "loss": 0.1713, + "step": 13958 + }, + { + "epoch": 3.714475784992017, + "grad_norm": 0.3674570620059967, + "learning_rate": 3.148966290199673e-08, + "loss": 0.1666, + "step": 13959 + }, + { + "epoch": 3.7147418839808406, + "grad_norm": 0.4228660464286804, + "learning_rate": 3.1477361545125825e-08, + "loss": 0.1662, + "step": 13960 + }, + { + "epoch": 3.7150079829696647, + "grad_norm": 0.4431575834751129, + "learning_rate": 3.146506214265977e-08, + "loss": 0.1915, + "step": 13961 + }, + { + "epoch": 3.7152740819584884, + "grad_norm": 0.2906510829925537, + "learning_rate": 3.145276469494946e-08, + "loss": 0.1756, + "step": 13962 + }, + { + "epoch": 3.7155401809473125, + "grad_norm": 0.45512136816978455, + "learning_rate": 3.144046920234553e-08, + "loss": 0.1942, + "step": 13963 + }, + { + "epoch": 3.715806279936136, + "grad_norm": 0.2721363306045532, + "learning_rate": 3.14281756651988e-08, + "loss": 0.1785, + "step": 13964 + }, + { + "epoch": 3.7160723789249603, + "grad_norm": 0.28473466634750366, + "learning_rate": 3.141588408385981e-08, + "loss": 0.1746, + "step": 13965 + }, + { + "epoch": 3.716338477913784, + "grad_norm": 0.29387718439102173, + "learning_rate": 3.1403594458679204e-08, + "loss": 0.1626, + "step": 13966 + }, + { + "epoch": 3.7166045769026077, + "grad_norm": 0.25887030363082886, + "learning_rate": 3.139130679000746e-08, + "loss": 0.1654, + "step": 13967 + }, + { + "epoch": 3.716870675891432, + "grad_norm": 0.2981325685977936, + "learning_rate": 3.1379021078195054e-08, + "loss": 0.1786, + "step": 13968 + }, + { + "epoch": 3.7171367748802555, + "grad_norm": 0.26958367228507996, + "learning_rate": 3.136673732359245e-08, + "loss": 0.1783, + "step": 13969 + }, + { + "epoch": 3.717402873869079, + "grad_norm": 0.34835702180862427, + "learning_rate": 3.1354455526549925e-08, + "loss": 0.1869, + "step": 13970 + }, + { + "epoch": 3.7176689728579033, + "grad_norm": 0.3748839199542999, + "learning_rate": 3.134217568741784e-08, + "loss": 0.1646, + "step": 13971 + }, + { + "epoch": 3.717935071846727, + "grad_norm": 0.2778840959072113, + "learning_rate": 3.1329897806546414e-08, + "loss": 0.1742, + "step": 13972 + }, + { + "epoch": 3.7182011708355507, + "grad_norm": 0.2658863961696625, + "learning_rate": 3.131762188428587e-08, + "loss": 0.1554, + "step": 13973 + }, + { + "epoch": 3.718467269824375, + "grad_norm": 0.40303391218185425, + "learning_rate": 3.1305347920986305e-08, + "loss": 0.2015, + "step": 13974 + }, + { + "epoch": 3.7187333688131985, + "grad_norm": 0.32292675971984863, + "learning_rate": 3.1293075916997824e-08, + "loss": 0.1874, + "step": 13975 + }, + { + "epoch": 3.718999467802022, + "grad_norm": 0.26287057995796204, + "learning_rate": 3.1280805872670426e-08, + "loss": 0.1525, + "step": 13976 + }, + { + "epoch": 3.7192655667908463, + "grad_norm": 0.31487977504730225, + "learning_rate": 3.126853778835408e-08, + "loss": 0.1785, + "step": 13977 + }, + { + "epoch": 3.71953166577967, + "grad_norm": 0.2672567069530487, + "learning_rate": 3.125627166439869e-08, + "loss": 0.174, + "step": 13978 + }, + { + "epoch": 3.7197977647684937, + "grad_norm": 0.2751195728778839, + "learning_rate": 3.124400750115418e-08, + "loss": 0.1735, + "step": 13979 + }, + { + "epoch": 3.720063863757318, + "grad_norm": 0.3830951750278473, + "learning_rate": 3.1231745298970244e-08, + "loss": 0.1941, + "step": 13980 + }, + { + "epoch": 3.7203299627461415, + "grad_norm": 0.2532140016555786, + "learning_rate": 3.1219485058196724e-08, + "loss": 0.159, + "step": 13981 + }, + { + "epoch": 3.720596061734965, + "grad_norm": 0.26956263184547424, + "learning_rate": 3.120722677918323e-08, + "loss": 0.1762, + "step": 13982 + }, + { + "epoch": 3.7208621607237893, + "grad_norm": 0.2505221366882324, + "learning_rate": 3.1194970462279434e-08, + "loss": 0.158, + "step": 13983 + }, + { + "epoch": 3.721128259712613, + "grad_norm": 0.29048383235931396, + "learning_rate": 3.118271610783488e-08, + "loss": 0.1766, + "step": 13984 + }, + { + "epoch": 3.721394358701437, + "grad_norm": 0.35662540793418884, + "learning_rate": 3.11704637161991e-08, + "loss": 0.1915, + "step": 13985 + }, + { + "epoch": 3.7216604576902608, + "grad_norm": 0.4815082848072052, + "learning_rate": 3.1158213287721566e-08, + "loss": 0.1772, + "step": 13986 + }, + { + "epoch": 3.721926556679085, + "grad_norm": 0.2841012179851532, + "learning_rate": 3.114596482275169e-08, + "loss": 0.1687, + "step": 13987 + }, + { + "epoch": 3.7221926556679086, + "grad_norm": 0.27086368203163147, + "learning_rate": 3.1133718321638834e-08, + "loss": 0.1672, + "step": 13988 + }, + { + "epoch": 3.7224587546567323, + "grad_norm": 0.4158882796764374, + "learning_rate": 3.112147378473226e-08, + "loss": 0.1642, + "step": 13989 + }, + { + "epoch": 3.7227248536455564, + "grad_norm": 0.2862277328968048, + "learning_rate": 3.110923121238124e-08, + "loss": 0.1779, + "step": 13990 + }, + { + "epoch": 3.72299095263438, + "grad_norm": 0.384064257144928, + "learning_rate": 3.1096990604934904e-08, + "loss": 0.1718, + "step": 13991 + }, + { + "epoch": 3.7232570516232038, + "grad_norm": 0.29202213883399963, + "learning_rate": 3.108475196274242e-08, + "loss": 0.1722, + "step": 13992 + }, + { + "epoch": 3.723523150612028, + "grad_norm": 0.2799200713634491, + "learning_rate": 3.107251528615285e-08, + "loss": 0.1632, + "step": 13993 + }, + { + "epoch": 3.7237892496008516, + "grad_norm": 0.3000749349594116, + "learning_rate": 3.1060280575515244e-08, + "loss": 0.1929, + "step": 13994 + }, + { + "epoch": 3.7240553485896752, + "grad_norm": 0.27793028950691223, + "learning_rate": 3.1048047831178495e-08, + "loss": 0.1712, + "step": 13995 + }, + { + "epoch": 3.7243214475784994, + "grad_norm": 0.35556814074516296, + "learning_rate": 3.103581705349158e-08, + "loss": 0.1727, + "step": 13996 + }, + { + "epoch": 3.724587546567323, + "grad_norm": 0.2697451412677765, + "learning_rate": 3.102358824280325e-08, + "loss": 0.1644, + "step": 13997 + }, + { + "epoch": 3.7248536455561467, + "grad_norm": 0.2792363464832306, + "learning_rate": 3.101136139946241e-08, + "loss": 0.1778, + "step": 13998 + }, + { + "epoch": 3.725119744544971, + "grad_norm": 0.3231731057167053, + "learning_rate": 3.0999136523817684e-08, + "loss": 0.1772, + "step": 13999 + }, + { + "epoch": 3.7253858435337945, + "grad_norm": 0.2763131856918335, + "learning_rate": 3.09869136162178e-08, + "loss": 0.1826, + "step": 14000 + }, + { + "epoch": 3.725651942522618, + "grad_norm": 0.258348673582077, + "learning_rate": 3.097469267701139e-08, + "loss": 0.1719, + "step": 14001 + }, + { + "epoch": 3.7259180415114423, + "grad_norm": 1.0960206985473633, + "learning_rate": 3.0962473706547045e-08, + "loss": 0.1724, + "step": 14002 + }, + { + "epoch": 3.726184140500266, + "grad_norm": 0.35852932929992676, + "learning_rate": 3.0950256705173204e-08, + "loss": 0.1761, + "step": 14003 + }, + { + "epoch": 3.7264502394890897, + "grad_norm": 0.3309471607208252, + "learning_rate": 3.0938041673238355e-08, + "loss": 0.1725, + "step": 14004 + }, + { + "epoch": 3.726716338477914, + "grad_norm": 0.3337951898574829, + "learning_rate": 3.0925828611090934e-08, + "loss": 0.1741, + "step": 14005 + }, + { + "epoch": 3.7269824374667375, + "grad_norm": 0.2860753536224365, + "learning_rate": 3.0913617519079216e-08, + "loss": 0.1664, + "step": 14006 + }, + { + "epoch": 3.727248536455561, + "grad_norm": 0.2811800539493561, + "learning_rate": 3.090140839755151e-08, + "loss": 0.1701, + "step": 14007 + }, + { + "epoch": 3.7275146354443853, + "grad_norm": 0.40445592999458313, + "learning_rate": 3.0889201246856054e-08, + "loss": 0.1884, + "step": 14008 + }, + { + "epoch": 3.727780734433209, + "grad_norm": 0.36531496047973633, + "learning_rate": 3.0876996067341056e-08, + "loss": 0.1836, + "step": 14009 + }, + { + "epoch": 3.728046833422033, + "grad_norm": 0.3220943808555603, + "learning_rate": 3.086479285935456e-08, + "loss": 0.1973, + "step": 14010 + }, + { + "epoch": 3.728312932410857, + "grad_norm": 0.27436593174934387, + "learning_rate": 3.0852591623244696e-08, + "loss": 0.1825, + "step": 14011 + }, + { + "epoch": 3.728579031399681, + "grad_norm": 0.36434435844421387, + "learning_rate": 3.084039235935939e-08, + "loss": 0.1613, + "step": 14012 + }, + { + "epoch": 3.7288451303885046, + "grad_norm": 0.3702682852745056, + "learning_rate": 3.082819506804667e-08, + "loss": 0.1663, + "step": 14013 + }, + { + "epoch": 3.7291112293773283, + "grad_norm": 0.2866601347923279, + "learning_rate": 3.081599974965433e-08, + "loss": 0.1803, + "step": 14014 + }, + { + "epoch": 3.7293773283661524, + "grad_norm": 0.2521698474884033, + "learning_rate": 3.080380640453033e-08, + "loss": 0.1518, + "step": 14015 + }, + { + "epoch": 3.729643427354976, + "grad_norm": 0.2815287709236145, + "learning_rate": 3.0791615033022355e-08, + "loss": 0.1848, + "step": 14016 + }, + { + "epoch": 3.7299095263438, + "grad_norm": 0.2793627679347992, + "learning_rate": 3.0779425635478184e-08, + "loss": 0.1813, + "step": 14017 + }, + { + "epoch": 3.730175625332624, + "grad_norm": 0.2996567487716675, + "learning_rate": 3.076723821224543e-08, + "loss": 0.1731, + "step": 14018 + }, + { + "epoch": 3.7304417243214476, + "grad_norm": 0.28373873233795166, + "learning_rate": 3.075505276367175e-08, + "loss": 0.1675, + "step": 14019 + }, + { + "epoch": 3.7307078233102713, + "grad_norm": 0.27920231223106384, + "learning_rate": 3.0742869290104656e-08, + "loss": 0.1694, + "step": 14020 + }, + { + "epoch": 3.7309739222990954, + "grad_norm": 0.25747284293174744, + "learning_rate": 3.073068779189166e-08, + "loss": 0.1709, + "step": 14021 + }, + { + "epoch": 3.731240021287919, + "grad_norm": 0.2699650824069977, + "learning_rate": 3.0718508269380215e-08, + "loss": 0.1597, + "step": 14022 + }, + { + "epoch": 3.7315061202767428, + "grad_norm": 0.4577794373035431, + "learning_rate": 3.070633072291771e-08, + "loss": 0.195, + "step": 14023 + }, + { + "epoch": 3.731772219265567, + "grad_norm": 0.38657593727111816, + "learning_rate": 3.0694155152851486e-08, + "loss": 0.1577, + "step": 14024 + }, + { + "epoch": 3.7320383182543906, + "grad_norm": 0.40866056084632874, + "learning_rate": 3.0681981559528756e-08, + "loss": 0.1867, + "step": 14025 + }, + { + "epoch": 3.7323044172432143, + "grad_norm": 0.3523465692996979, + "learning_rate": 3.066980994329681e-08, + "loss": 0.1747, + "step": 14026 + }, + { + "epoch": 3.7325705162320384, + "grad_norm": 0.2941226363182068, + "learning_rate": 3.065764030450274e-08, + "loss": 0.1852, + "step": 14027 + }, + { + "epoch": 3.732836615220862, + "grad_norm": 0.3665750026702881, + "learning_rate": 3.0645472643493675e-08, + "loss": 0.2065, + "step": 14028 + }, + { + "epoch": 3.7331027142096858, + "grad_norm": 0.3103283643722534, + "learning_rate": 3.0633306960616676e-08, + "loss": 0.167, + "step": 14029 + }, + { + "epoch": 3.73336881319851, + "grad_norm": 0.259773850440979, + "learning_rate": 3.062114325621874e-08, + "loss": 0.1752, + "step": 14030 + }, + { + "epoch": 3.7336349121873336, + "grad_norm": 0.3721464276313782, + "learning_rate": 3.060898153064676e-08, + "loss": 0.1682, + "step": 14031 + }, + { + "epoch": 3.7339010111761577, + "grad_norm": 0.2502882778644562, + "learning_rate": 3.0596821784247674e-08, + "loss": 0.1646, + "step": 14032 + }, + { + "epoch": 3.7341671101649814, + "grad_norm": 0.2585011422634125, + "learning_rate": 3.058466401736822e-08, + "loss": 0.1566, + "step": 14033 + }, + { + "epoch": 3.734433209153805, + "grad_norm": 0.3960258662700653, + "learning_rate": 3.057250823035524e-08, + "loss": 0.1972, + "step": 14034 + }, + { + "epoch": 3.734699308142629, + "grad_norm": 0.26516440510749817, + "learning_rate": 3.056035442355538e-08, + "loss": 0.1681, + "step": 14035 + }, + { + "epoch": 3.734965407131453, + "grad_norm": 0.3031652569770813, + "learning_rate": 3.054820259731532e-08, + "loss": 0.1693, + "step": 14036 + }, + { + "epoch": 3.735231506120277, + "grad_norm": 0.27056917548179626, + "learning_rate": 3.0536052751981656e-08, + "loss": 0.1729, + "step": 14037 + }, + { + "epoch": 3.7354976051091007, + "grad_norm": 0.24550321698188782, + "learning_rate": 3.0523904887900955e-08, + "loss": 0.1575, + "step": 14038 + }, + { + "epoch": 3.7357637040979244, + "grad_norm": 0.5703083276748657, + "learning_rate": 3.0511759005419636e-08, + "loss": 0.186, + "step": 14039 + }, + { + "epoch": 3.7360298030867485, + "grad_norm": 0.357181191444397, + "learning_rate": 3.049961510488416e-08, + "loss": 0.1724, + "step": 14040 + }, + { + "epoch": 3.736295902075572, + "grad_norm": 0.2967979311943054, + "learning_rate": 3.048747318664092e-08, + "loss": 0.1769, + "step": 14041 + }, + { + "epoch": 3.736562001064396, + "grad_norm": 0.3499354124069214, + "learning_rate": 3.047533325103617e-08, + "loss": 0.1718, + "step": 14042 + }, + { + "epoch": 3.73682810005322, + "grad_norm": 0.26792576909065247, + "learning_rate": 3.046319529841621e-08, + "loss": 0.173, + "step": 14043 + }, + { + "epoch": 3.7370941990420437, + "grad_norm": 0.3664613366127014, + "learning_rate": 3.045105932912721e-08, + "loss": 0.1703, + "step": 14044 + }, + { + "epoch": 3.7373602980308673, + "grad_norm": 0.35197770595550537, + "learning_rate": 3.043892534351537e-08, + "loss": 0.1803, + "step": 14045 + }, + { + "epoch": 3.7376263970196915, + "grad_norm": 0.36666950583457947, + "learning_rate": 3.0426793341926695e-08, + "loss": 0.1846, + "step": 14046 + }, + { + "epoch": 3.737892496008515, + "grad_norm": 0.27100178599357605, + "learning_rate": 3.041466332470729e-08, + "loss": 0.1761, + "step": 14047 + }, + { + "epoch": 3.738158594997339, + "grad_norm": 0.27843356132507324, + "learning_rate": 3.0402535292203045e-08, + "loss": 0.1732, + "step": 14048 + }, + { + "epoch": 3.738424693986163, + "grad_norm": 0.26291531324386597, + "learning_rate": 3.0390409244759975e-08, + "loss": 0.1614, + "step": 14049 + }, + { + "epoch": 3.7386907929749866, + "grad_norm": 0.25527292490005493, + "learning_rate": 3.037828518272384e-08, + "loss": 0.1681, + "step": 14050 + }, + { + "epoch": 3.7389568919638103, + "grad_norm": 0.26293495297431946, + "learning_rate": 3.03661631064405e-08, + "loss": 0.181, + "step": 14051 + }, + { + "epoch": 3.7392229909526344, + "grad_norm": 0.3033864498138428, + "learning_rate": 3.035404301625568e-08, + "loss": 0.182, + "step": 14052 + }, + { + "epoch": 3.739489089941458, + "grad_norm": 0.3650212287902832, + "learning_rate": 3.034192491251513e-08, + "loss": 0.2006, + "step": 14053 + }, + { + "epoch": 3.739755188930282, + "grad_norm": 0.33123037219047546, + "learning_rate": 3.032980879556438e-08, + "loss": 0.1858, + "step": 14054 + }, + { + "epoch": 3.740021287919106, + "grad_norm": 0.2728082239627838, + "learning_rate": 3.0317694665749105e-08, + "loss": 0.1742, + "step": 14055 + }, + { + "epoch": 3.7402873869079296, + "grad_norm": 0.2500610053539276, + "learning_rate": 3.030558252341474e-08, + "loss": 0.1554, + "step": 14056 + }, + { + "epoch": 3.7405534858967537, + "grad_norm": 0.3955002725124359, + "learning_rate": 3.0293472368906794e-08, + "loss": 0.169, + "step": 14057 + }, + { + "epoch": 3.7408195848855774, + "grad_norm": 0.3210693895816803, + "learning_rate": 3.028136420257066e-08, + "loss": 0.1786, + "step": 14058 + }, + { + "epoch": 3.7410856838744015, + "grad_norm": 0.2762131690979004, + "learning_rate": 3.02692580247517e-08, + "loss": 0.1691, + "step": 14059 + }, + { + "epoch": 3.7413517828632252, + "grad_norm": 0.39576566219329834, + "learning_rate": 3.025715383579523e-08, + "loss": 0.1849, + "step": 14060 + }, + { + "epoch": 3.741617881852049, + "grad_norm": 0.37469273805618286, + "learning_rate": 3.024505163604643e-08, + "loss": 0.1856, + "step": 14061 + }, + { + "epoch": 3.741883980840873, + "grad_norm": 0.30259403586387634, + "learning_rate": 3.023295142585053e-08, + "loss": 0.1719, + "step": 14062 + }, + { + "epoch": 3.7421500798296967, + "grad_norm": 0.30331146717071533, + "learning_rate": 3.02208532055526e-08, + "loss": 0.1972, + "step": 14063 + }, + { + "epoch": 3.7424161788185204, + "grad_norm": 0.2671009600162506, + "learning_rate": 3.0208756975497773e-08, + "loss": 0.1612, + "step": 14064 + }, + { + "epoch": 3.7426822778073445, + "grad_norm": 0.272807240486145, + "learning_rate": 3.019666273603094e-08, + "loss": 0.1786, + "step": 14065 + }, + { + "epoch": 3.742948376796168, + "grad_norm": 0.3826124370098114, + "learning_rate": 3.018457048749721e-08, + "loss": 0.2004, + "step": 14066 + }, + { + "epoch": 3.743214475784992, + "grad_norm": 0.29161393642425537, + "learning_rate": 3.017248023024137e-08, + "loss": 0.1767, + "step": 14067 + }, + { + "epoch": 3.743480574773816, + "grad_norm": 0.2743852734565735, + "learning_rate": 3.016039196460831e-08, + "loss": 0.1648, + "step": 14068 + }, + { + "epoch": 3.7437466737626397, + "grad_norm": 0.2682376801967621, + "learning_rate": 3.0148305690942776e-08, + "loss": 0.1792, + "step": 14069 + }, + { + "epoch": 3.7440127727514634, + "grad_norm": 0.36671048402786255, + "learning_rate": 3.013622140958954e-08, + "loss": 0.1717, + "step": 14070 + }, + { + "epoch": 3.7442788717402875, + "grad_norm": 0.35855787992477417, + "learning_rate": 3.012413912089319e-08, + "loss": 0.1859, + "step": 14071 + }, + { + "epoch": 3.744544970729111, + "grad_norm": 0.31994155049324036, + "learning_rate": 3.0112058825198394e-08, + "loss": 0.1746, + "step": 14072 + }, + { + "epoch": 3.744811069717935, + "grad_norm": 0.3646632432937622, + "learning_rate": 3.009998052284971e-08, + "loss": 0.1829, + "step": 14073 + }, + { + "epoch": 3.745077168706759, + "grad_norm": 0.2793588638305664, + "learning_rate": 3.008790421419165e-08, + "loss": 0.1734, + "step": 14074 + }, + { + "epoch": 3.7453432676955827, + "grad_norm": 0.26946133375167847, + "learning_rate": 3.0075829899568597e-08, + "loss": 0.1763, + "step": 14075 + }, + { + "epoch": 3.7456093666844064, + "grad_norm": 0.3232742249965668, + "learning_rate": 3.0063757579325e-08, + "loss": 0.1778, + "step": 14076 + }, + { + "epoch": 3.7458754656732305, + "grad_norm": 0.282351016998291, + "learning_rate": 3.0051687253805124e-08, + "loss": 0.1788, + "step": 14077 + }, + { + "epoch": 3.746141564662054, + "grad_norm": 0.30993330478668213, + "learning_rate": 3.003961892335327e-08, + "loss": 0.1745, + "step": 14078 + }, + { + "epoch": 3.746407663650878, + "grad_norm": 0.3198011517524719, + "learning_rate": 3.002755258831364e-08, + "loss": 0.2027, + "step": 14079 + }, + { + "epoch": 3.746673762639702, + "grad_norm": 0.29122641682624817, + "learning_rate": 3.001548824903042e-08, + "loss": 0.1842, + "step": 14080 + }, + { + "epoch": 3.7469398616285257, + "grad_norm": 0.26999184489250183, + "learning_rate": 3.000342590584771e-08, + "loss": 0.1681, + "step": 14081 + }, + { + "epoch": 3.74720596061735, + "grad_norm": 0.29131120443344116, + "learning_rate": 2.99913655591095e-08, + "loss": 0.1747, + "step": 14082 + }, + { + "epoch": 3.7474720596061735, + "grad_norm": 0.48308703303337097, + "learning_rate": 2.9979307209159846e-08, + "loss": 0.1775, + "step": 14083 + }, + { + "epoch": 3.7477381585949976, + "grad_norm": 0.3707083761692047, + "learning_rate": 2.996725085634261e-08, + "loss": 0.1722, + "step": 14084 + }, + { + "epoch": 3.7480042575838213, + "grad_norm": 0.6920527815818787, + "learning_rate": 2.995519650100171e-08, + "loss": 0.181, + "step": 14085 + }, + { + "epoch": 3.748270356572645, + "grad_norm": 0.2723976969718933, + "learning_rate": 2.994314414348093e-08, + "loss": 0.1602, + "step": 14086 + }, + { + "epoch": 3.748536455561469, + "grad_norm": 0.2534569501876831, + "learning_rate": 2.993109378412403e-08, + "loss": 0.1693, + "step": 14087 + }, + { + "epoch": 3.7488025545502928, + "grad_norm": 0.30639147758483887, + "learning_rate": 2.991904542327473e-08, + "loss": 0.1724, + "step": 14088 + }, + { + "epoch": 3.7490686535391164, + "grad_norm": 0.3845352828502655, + "learning_rate": 2.9906999061276693e-08, + "loss": 0.1866, + "step": 14089 + }, + { + "epoch": 3.7493347525279406, + "grad_norm": 0.28751489520072937, + "learning_rate": 2.989495469847344e-08, + "loss": 0.1787, + "step": 14090 + }, + { + "epoch": 3.7496008515167643, + "grad_norm": 0.27471140027046204, + "learning_rate": 2.988291233520858e-08, + "loss": 0.1561, + "step": 14091 + }, + { + "epoch": 3.749866950505588, + "grad_norm": 0.5417821407318115, + "learning_rate": 2.9870871971825516e-08, + "loss": 0.1726, + "step": 14092 + }, + { + "epoch": 3.750133049494412, + "grad_norm": 0.27811065316200256, + "learning_rate": 2.985883360866769e-08, + "loss": 0.1626, + "step": 14093 + }, + { + "epoch": 3.7503991484832357, + "grad_norm": 0.36093759536743164, + "learning_rate": 2.984679724607846e-08, + "loss": 0.1929, + "step": 14094 + }, + { + "epoch": 3.7506652474720594, + "grad_norm": 0.2911902964115143, + "learning_rate": 2.983476288440112e-08, + "loss": 0.169, + "step": 14095 + }, + { + "epoch": 3.7509313464608836, + "grad_norm": 0.2742476165294647, + "learning_rate": 2.982273052397898e-08, + "loss": 0.1807, + "step": 14096 + }, + { + "epoch": 3.7511974454497072, + "grad_norm": 0.4283086359500885, + "learning_rate": 2.9810700165155124e-08, + "loss": 0.1944, + "step": 14097 + }, + { + "epoch": 3.751463544438531, + "grad_norm": 0.34248149394989014, + "learning_rate": 2.9798671808272768e-08, + "loss": 0.1959, + "step": 14098 + }, + { + "epoch": 3.751729643427355, + "grad_norm": 0.35584115982055664, + "learning_rate": 2.978664545367492e-08, + "loss": 0.185, + "step": 14099 + }, + { + "epoch": 3.7519957424161787, + "grad_norm": 0.7136241793632507, + "learning_rate": 2.9774621101704644e-08, + "loss": 0.1795, + "step": 14100 + }, + { + "epoch": 3.7522618414050024, + "grad_norm": 0.2768082022666931, + "learning_rate": 2.976259875270486e-08, + "loss": 0.1896, + "step": 14101 + }, + { + "epoch": 3.7525279403938265, + "grad_norm": 0.269665002822876, + "learning_rate": 2.975057840701848e-08, + "loss": 0.1664, + "step": 14102 + }, + { + "epoch": 3.75279403938265, + "grad_norm": 0.31901487708091736, + "learning_rate": 2.973856006498836e-08, + "loss": 0.2019, + "step": 14103 + }, + { + "epoch": 3.7530601383714743, + "grad_norm": 0.28142666816711426, + "learning_rate": 2.972654372695732e-08, + "loss": 0.1673, + "step": 14104 + }, + { + "epoch": 3.753326237360298, + "grad_norm": 0.44863882660865784, + "learning_rate": 2.9714529393268018e-08, + "loss": 0.1644, + "step": 14105 + }, + { + "epoch": 3.753592336349122, + "grad_norm": 0.2892298400402069, + "learning_rate": 2.9702517064263198e-08, + "loss": 0.1774, + "step": 14106 + }, + { + "epoch": 3.753858435337946, + "grad_norm": 0.29034778475761414, + "learning_rate": 2.9690506740285414e-08, + "loss": 0.1793, + "step": 14107 + }, + { + "epoch": 3.7541245343267695, + "grad_norm": 0.28326940536499023, + "learning_rate": 2.9678498421677257e-08, + "loss": 0.1853, + "step": 14108 + }, + { + "epoch": 3.7543906333155936, + "grad_norm": 0.4177178740501404, + "learning_rate": 2.9666492108781216e-08, + "loss": 0.1851, + "step": 14109 + }, + { + "epoch": 3.7546567323044173, + "grad_norm": 0.2912156879901886, + "learning_rate": 2.9654487801939787e-08, + "loss": 0.1754, + "step": 14110 + }, + { + "epoch": 3.754922831293241, + "grad_norm": 0.29217496514320374, + "learning_rate": 2.9642485501495274e-08, + "loss": 0.1787, + "step": 14111 + }, + { + "epoch": 3.755188930282065, + "grad_norm": 0.25758394598960876, + "learning_rate": 2.963048520779008e-08, + "loss": 0.1604, + "step": 14112 + }, + { + "epoch": 3.755455029270889, + "grad_norm": 0.30357515811920166, + "learning_rate": 2.961848692116642e-08, + "loss": 0.1774, + "step": 14113 + }, + { + "epoch": 3.7557211282597125, + "grad_norm": 0.25345152616500854, + "learning_rate": 2.960649064196653e-08, + "loss": 0.1609, + "step": 14114 + }, + { + "epoch": 3.7559872272485366, + "grad_norm": 0.3295280933380127, + "learning_rate": 2.9594496370532607e-08, + "loss": 0.1943, + "step": 14115 + }, + { + "epoch": 3.7562533262373603, + "grad_norm": 0.38616490364074707, + "learning_rate": 2.9582504107206652e-08, + "loss": 0.1771, + "step": 14116 + }, + { + "epoch": 3.756519425226184, + "grad_norm": 0.30303534865379333, + "learning_rate": 2.9570513852330835e-08, + "loss": 0.1853, + "step": 14117 + }, + { + "epoch": 3.756785524215008, + "grad_norm": 0.2938328683376312, + "learning_rate": 2.9558525606247064e-08, + "loss": 0.1867, + "step": 14118 + }, + { + "epoch": 3.757051623203832, + "grad_norm": 0.2957998812198639, + "learning_rate": 2.9546539369297307e-08, + "loss": 0.1672, + "step": 14119 + }, + { + "epoch": 3.7573177221926555, + "grad_norm": 0.3988533318042755, + "learning_rate": 2.953455514182339e-08, + "loss": 0.1858, + "step": 14120 + }, + { + "epoch": 3.7575838211814796, + "grad_norm": 0.3579321801662445, + "learning_rate": 2.9522572924167177e-08, + "loss": 0.1826, + "step": 14121 + }, + { + "epoch": 3.7578499201703033, + "grad_norm": 0.3455829322338104, + "learning_rate": 2.951059271667038e-08, + "loss": 0.1878, + "step": 14122 + }, + { + "epoch": 3.758116019159127, + "grad_norm": 0.331605464220047, + "learning_rate": 2.9498614519674715e-08, + "loss": 0.1747, + "step": 14123 + }, + { + "epoch": 3.758382118147951, + "grad_norm": 0.2810536324977875, + "learning_rate": 2.948663833352183e-08, + "loss": 0.1704, + "step": 14124 + }, + { + "epoch": 3.7586482171367748, + "grad_norm": 0.2869429886341095, + "learning_rate": 2.947466415855334e-08, + "loss": 0.1781, + "step": 14125 + }, + { + "epoch": 3.7589143161255985, + "grad_norm": 0.27790313959121704, + "learning_rate": 2.9462691995110712e-08, + "loss": 0.1783, + "step": 14126 + }, + { + "epoch": 3.7591804151144226, + "grad_norm": 0.2614857852458954, + "learning_rate": 2.9450721843535476e-08, + "loss": 0.166, + "step": 14127 + }, + { + "epoch": 3.7594465141032463, + "grad_norm": 0.4603458642959595, + "learning_rate": 2.9438753704168983e-08, + "loss": 0.185, + "step": 14128 + }, + { + "epoch": 3.7597126130920704, + "grad_norm": 0.267367959022522, + "learning_rate": 2.942678757735263e-08, + "loss": 0.1864, + "step": 14129 + }, + { + "epoch": 3.759978712080894, + "grad_norm": 0.2825351655483246, + "learning_rate": 2.9414823463427705e-08, + "loss": 0.171, + "step": 14130 + }, + { + "epoch": 3.760244811069718, + "grad_norm": 0.2769171893596649, + "learning_rate": 2.9402861362735474e-08, + "loss": 0.1842, + "step": 14131 + }, + { + "epoch": 3.760510910058542, + "grad_norm": 0.2847006320953369, + "learning_rate": 2.9390901275617076e-08, + "loss": 0.1732, + "step": 14132 + }, + { + "epoch": 3.7607770090473656, + "grad_norm": 0.26067861914634705, + "learning_rate": 2.9378943202413653e-08, + "loss": 0.1718, + "step": 14133 + }, + { + "epoch": 3.7610431080361897, + "grad_norm": 0.4889424443244934, + "learning_rate": 2.9366987143466315e-08, + "loss": 0.1832, + "step": 14134 + }, + { + "epoch": 3.7613092070250134, + "grad_norm": 0.36964333057403564, + "learning_rate": 2.9355033099116e-08, + "loss": 0.1733, + "step": 14135 + }, + { + "epoch": 3.761575306013837, + "grad_norm": 0.29672113060951233, + "learning_rate": 2.9343081069703724e-08, + "loss": 0.1735, + "step": 14136 + }, + { + "epoch": 3.761841405002661, + "grad_norm": 0.24927757680416107, + "learning_rate": 2.9331131055570336e-08, + "loss": 0.1572, + "step": 14137 + }, + { + "epoch": 3.762107503991485, + "grad_norm": 0.32108721137046814, + "learning_rate": 2.9319183057056697e-08, + "loss": 0.1802, + "step": 14138 + }, + { + "epoch": 3.7623736029803085, + "grad_norm": 0.3625802993774414, + "learning_rate": 2.9307237074503577e-08, + "loss": 0.1875, + "step": 14139 + }, + { + "epoch": 3.7626397019691327, + "grad_norm": 0.29710736870765686, + "learning_rate": 2.929529310825175e-08, + "loss": 0.173, + "step": 14140 + }, + { + "epoch": 3.7629058009579563, + "grad_norm": 0.37459835410118103, + "learning_rate": 2.928335115864181e-08, + "loss": 0.196, + "step": 14141 + }, + { + "epoch": 3.76317189994678, + "grad_norm": 0.2736821174621582, + "learning_rate": 2.927141122601443e-08, + "loss": 0.1719, + "step": 14142 + }, + { + "epoch": 3.763437998935604, + "grad_norm": 0.41610339283943176, + "learning_rate": 2.925947331071009e-08, + "loss": 0.1719, + "step": 14143 + }, + { + "epoch": 3.763704097924428, + "grad_norm": 0.28591272234916687, + "learning_rate": 2.9247537413069334e-08, + "loss": 0.1797, + "step": 14144 + }, + { + "epoch": 3.7639701969132515, + "grad_norm": 0.3678387999534607, + "learning_rate": 2.9235603533432585e-08, + "loss": 0.1843, + "step": 14145 + }, + { + "epoch": 3.7642362959020756, + "grad_norm": 0.39321863651275635, + "learning_rate": 2.922367167214025e-08, + "loss": 0.1758, + "step": 14146 + }, + { + "epoch": 3.7645023948908993, + "grad_norm": 0.3005914092063904, + "learning_rate": 2.92117418295326e-08, + "loss": 0.18, + "step": 14147 + }, + { + "epoch": 3.764768493879723, + "grad_norm": 0.26447877287864685, + "learning_rate": 2.919981400594994e-08, + "loss": 0.1748, + "step": 14148 + }, + { + "epoch": 3.765034592868547, + "grad_norm": 0.32864677906036377, + "learning_rate": 2.9187888201732434e-08, + "loss": 0.1833, + "step": 14149 + }, + { + "epoch": 3.765300691857371, + "grad_norm": 0.300689697265625, + "learning_rate": 2.9175964417220257e-08, + "loss": 0.1625, + "step": 14150 + }, + { + "epoch": 3.765566790846195, + "grad_norm": 0.27183130383491516, + "learning_rate": 2.916404265275352e-08, + "loss": 0.1651, + "step": 14151 + }, + { + "epoch": 3.7658328898350186, + "grad_norm": 0.3043779730796814, + "learning_rate": 2.9152122908672204e-08, + "loss": 0.1709, + "step": 14152 + }, + { + "epoch": 3.7660989888238423, + "grad_norm": 0.2851827144622803, + "learning_rate": 2.9140205185316314e-08, + "loss": 0.1797, + "step": 14153 + }, + { + "epoch": 3.7663650878126664, + "grad_norm": 0.34544894099235535, + "learning_rate": 2.912828948302577e-08, + "loss": 0.1845, + "step": 14154 + }, + { + "epoch": 3.76663118680149, + "grad_norm": 0.26769644021987915, + "learning_rate": 2.9116375802140447e-08, + "loss": 0.1692, + "step": 14155 + }, + { + "epoch": 3.7668972857903142, + "grad_norm": 0.2692904770374298, + "learning_rate": 2.910446414300011e-08, + "loss": 0.169, + "step": 14156 + }, + { + "epoch": 3.767163384779138, + "grad_norm": 0.321700781583786, + "learning_rate": 2.9092554505944543e-08, + "loss": 0.1703, + "step": 14157 + }, + { + "epoch": 3.7674294837679616, + "grad_norm": 0.388916552066803, + "learning_rate": 2.908064689131339e-08, + "loss": 0.1704, + "step": 14158 + }, + { + "epoch": 3.7676955827567857, + "grad_norm": 0.2737692594528198, + "learning_rate": 2.9068741299446298e-08, + "loss": 0.1799, + "step": 14159 + }, + { + "epoch": 3.7679616817456094, + "grad_norm": 0.2941012680530548, + "learning_rate": 2.905683773068284e-08, + "loss": 0.1761, + "step": 14160 + }, + { + "epoch": 3.768227780734433, + "grad_norm": 0.3087330460548401, + "learning_rate": 2.904493618536257e-08, + "loss": 0.1867, + "step": 14161 + }, + { + "epoch": 3.7684938797232572, + "grad_norm": 0.25461912155151367, + "learning_rate": 2.9033036663824873e-08, + "loss": 0.1635, + "step": 14162 + }, + { + "epoch": 3.768759978712081, + "grad_norm": 0.26996999979019165, + "learning_rate": 2.9021139166409215e-08, + "loss": 0.1802, + "step": 14163 + }, + { + "epoch": 3.7690260777009046, + "grad_norm": 0.3206138610839844, + "learning_rate": 2.9009243693454876e-08, + "loss": 0.1959, + "step": 14164 + }, + { + "epoch": 3.7692921766897287, + "grad_norm": 0.3576660454273224, + "learning_rate": 2.899735024530119e-08, + "loss": 0.1817, + "step": 14165 + }, + { + "epoch": 3.7695582756785524, + "grad_norm": 0.3518717288970947, + "learning_rate": 2.89854588222873e-08, + "loss": 0.187, + "step": 14166 + }, + { + "epoch": 3.769824374667376, + "grad_norm": 0.29434341192245483, + "learning_rate": 2.8973569424752497e-08, + "loss": 0.1899, + "step": 14167 + }, + { + "epoch": 3.7700904736562, + "grad_norm": 0.27393266558647156, + "learning_rate": 2.8961682053035796e-08, + "loss": 0.1736, + "step": 14168 + }, + { + "epoch": 3.770356572645024, + "grad_norm": 0.4308512508869171, + "learning_rate": 2.894979670747628e-08, + "loss": 0.1823, + "step": 14169 + }, + { + "epoch": 3.7706226716338476, + "grad_norm": 0.279675155878067, + "learning_rate": 2.8937913388412973e-08, + "loss": 0.1724, + "step": 14170 + }, + { + "epoch": 3.7708887706226717, + "grad_norm": 0.28309452533721924, + "learning_rate": 2.892603209618475e-08, + "loss": 0.1736, + "step": 14171 + }, + { + "epoch": 3.7711548696114954, + "grad_norm": 0.27526652812957764, + "learning_rate": 2.8914152831130558e-08, + "loss": 0.1755, + "step": 14172 + }, + { + "epoch": 3.771420968600319, + "grad_norm": 0.33753183484077454, + "learning_rate": 2.8902275593589143e-08, + "loss": 0.1773, + "step": 14173 + }, + { + "epoch": 3.771687067589143, + "grad_norm": 0.3854389786720276, + "learning_rate": 2.8890400383899304e-08, + "loss": 0.1729, + "step": 14174 + }, + { + "epoch": 3.771953166577967, + "grad_norm": 0.2640742063522339, + "learning_rate": 2.887852720239975e-08, + "loss": 0.1834, + "step": 14175 + }, + { + "epoch": 3.772219265566791, + "grad_norm": 0.25705111026763916, + "learning_rate": 2.8866656049429163e-08, + "loss": 0.1635, + "step": 14176 + }, + { + "epoch": 3.7724853645556147, + "grad_norm": 0.3025122880935669, + "learning_rate": 2.8854786925326046e-08, + "loss": 0.1828, + "step": 14177 + }, + { + "epoch": 3.772751463544439, + "grad_norm": 0.37558552622795105, + "learning_rate": 2.8842919830429023e-08, + "loss": 0.1818, + "step": 14178 + }, + { + "epoch": 3.7730175625332625, + "grad_norm": 0.26798295974731445, + "learning_rate": 2.88310547650765e-08, + "loss": 0.1659, + "step": 14179 + }, + { + "epoch": 3.773283661522086, + "grad_norm": 0.37741371989250183, + "learning_rate": 2.8819191729606906e-08, + "loss": 0.1931, + "step": 14180 + }, + { + "epoch": 3.7735497605109103, + "grad_norm": 0.30101919174194336, + "learning_rate": 2.8807330724358625e-08, + "loss": 0.1723, + "step": 14181 + }, + { + "epoch": 3.773815859499734, + "grad_norm": 0.28756019473075867, + "learning_rate": 2.8795471749669953e-08, + "loss": 0.198, + "step": 14182 + }, + { + "epoch": 3.7740819584885577, + "grad_norm": 0.2868359386920929, + "learning_rate": 2.8783614805879108e-08, + "loss": 0.1604, + "step": 14183 + }, + { + "epoch": 3.774348057477382, + "grad_norm": 0.28079932928085327, + "learning_rate": 2.877175989332431e-08, + "loss": 0.1893, + "step": 14184 + }, + { + "epoch": 3.7746141564662055, + "grad_norm": 0.506921648979187, + "learning_rate": 2.875990701234363e-08, + "loss": 0.1967, + "step": 14185 + }, + { + "epoch": 3.774880255455029, + "grad_norm": 0.26388856768608093, + "learning_rate": 2.8748056163275204e-08, + "loss": 0.1639, + "step": 14186 + }, + { + "epoch": 3.7751463544438533, + "grad_norm": 0.3380530774593353, + "learning_rate": 2.8736207346456976e-08, + "loss": 0.1679, + "step": 14187 + }, + { + "epoch": 3.775412453432677, + "grad_norm": 0.39627036452293396, + "learning_rate": 2.8724360562226925e-08, + "loss": 0.1786, + "step": 14188 + }, + { + "epoch": 3.7756785524215006, + "grad_norm": 0.27491286396980286, + "learning_rate": 2.8712515810922955e-08, + "loss": 0.1691, + "step": 14189 + }, + { + "epoch": 3.7759446514103248, + "grad_norm": 0.25736698508262634, + "learning_rate": 2.870067309288289e-08, + "loss": 0.17, + "step": 14190 + }, + { + "epoch": 3.7762107503991484, + "grad_norm": 0.2678090035915375, + "learning_rate": 2.8688832408444554e-08, + "loss": 0.1671, + "step": 14191 + }, + { + "epoch": 3.776476849387972, + "grad_norm": 0.296122670173645, + "learning_rate": 2.86769937579456e-08, + "loss": 0.1851, + "step": 14192 + }, + { + "epoch": 3.7767429483767962, + "grad_norm": 0.286933958530426, + "learning_rate": 2.866515714172374e-08, + "loss": 0.1727, + "step": 14193 + }, + { + "epoch": 3.77700904736562, + "grad_norm": 0.3059804439544678, + "learning_rate": 2.8653322560116533e-08, + "loss": 0.1777, + "step": 14194 + }, + { + "epoch": 3.7772751463544436, + "grad_norm": 0.25406414270401, + "learning_rate": 2.8641490013461546e-08, + "loss": 0.149, + "step": 14195 + }, + { + "epoch": 3.7775412453432677, + "grad_norm": 0.2752766013145447, + "learning_rate": 2.8629659502096282e-08, + "loss": 0.1898, + "step": 14196 + }, + { + "epoch": 3.7778073443320914, + "grad_norm": 0.2643532156944275, + "learning_rate": 2.8617831026358174e-08, + "loss": 0.1586, + "step": 14197 + }, + { + "epoch": 3.778073443320915, + "grad_norm": 0.288319855928421, + "learning_rate": 2.8606004586584566e-08, + "loss": 0.179, + "step": 14198 + }, + { + "epoch": 3.7783395423097392, + "grad_norm": 0.4870533347129822, + "learning_rate": 2.8594180183112816e-08, + "loss": 0.1989, + "step": 14199 + }, + { + "epoch": 3.778605641298563, + "grad_norm": 0.2864154577255249, + "learning_rate": 2.858235781628011e-08, + "loss": 0.1861, + "step": 14200 + }, + { + "epoch": 3.778871740287387, + "grad_norm": 0.36206841468811035, + "learning_rate": 2.8570537486423718e-08, + "loss": 0.1768, + "step": 14201 + }, + { + "epoch": 3.7791378392762107, + "grad_norm": 0.276959627866745, + "learning_rate": 2.855871919388072e-08, + "loss": 0.1671, + "step": 14202 + }, + { + "epoch": 3.779403938265035, + "grad_norm": 0.3035353720188141, + "learning_rate": 2.8546902938988236e-08, + "loss": 0.1672, + "step": 14203 + }, + { + "epoch": 3.7796700372538585, + "grad_norm": 0.2604016363620758, + "learning_rate": 2.8535088722083268e-08, + "loss": 0.1682, + "step": 14204 + }, + { + "epoch": 3.779936136242682, + "grad_norm": 0.4494568705558777, + "learning_rate": 2.8523276543502795e-08, + "loss": 0.1906, + "step": 14205 + }, + { + "epoch": 3.7802022352315063, + "grad_norm": 0.2824482023715973, + "learning_rate": 2.8511466403583763e-08, + "loss": 0.1808, + "step": 14206 + }, + { + "epoch": 3.78046833422033, + "grad_norm": 0.35634171962738037, + "learning_rate": 2.8499658302662945e-08, + "loss": 0.2011, + "step": 14207 + }, + { + "epoch": 3.7807344332091537, + "grad_norm": 0.385709285736084, + "learning_rate": 2.8487852241077193e-08, + "loss": 0.1771, + "step": 14208 + }, + { + "epoch": 3.781000532197978, + "grad_norm": 0.2561333179473877, + "learning_rate": 2.8476048219163186e-08, + "loss": 0.1614, + "step": 14209 + }, + { + "epoch": 3.7812666311868015, + "grad_norm": 0.36952468752861023, + "learning_rate": 2.8464246237257626e-08, + "loss": 0.2035, + "step": 14210 + }, + { + "epoch": 3.781532730175625, + "grad_norm": 0.3363436758518219, + "learning_rate": 2.8452446295697143e-08, + "loss": 0.1793, + "step": 14211 + }, + { + "epoch": 3.7817988291644493, + "grad_norm": 0.2865673899650574, + "learning_rate": 2.84406483948183e-08, + "loss": 0.1925, + "step": 14212 + }, + { + "epoch": 3.782064928153273, + "grad_norm": 0.2748170793056488, + "learning_rate": 2.842885253495755e-08, + "loss": 0.1651, + "step": 14213 + }, + { + "epoch": 3.7823310271420967, + "grad_norm": 0.33785951137542725, + "learning_rate": 2.8417058716451404e-08, + "loss": 0.169, + "step": 14214 + }, + { + "epoch": 3.782597126130921, + "grad_norm": 0.26565638184547424, + "learning_rate": 2.8405266939636175e-08, + "loss": 0.1832, + "step": 14215 + }, + { + "epoch": 3.7828632251197445, + "grad_norm": 0.4366054832935333, + "learning_rate": 2.839347720484825e-08, + "loss": 0.1919, + "step": 14216 + }, + { + "epoch": 3.783129324108568, + "grad_norm": 0.3228394687175751, + "learning_rate": 2.83816895124238e-08, + "loss": 0.1931, + "step": 14217 + }, + { + "epoch": 3.7833954230973923, + "grad_norm": 0.36043694615364075, + "learning_rate": 2.8369903862699163e-08, + "loss": 0.196, + "step": 14218 + }, + { + "epoch": 3.783661522086216, + "grad_norm": 0.2742183208465576, + "learning_rate": 2.8358120256010398e-08, + "loss": 0.1667, + "step": 14219 + }, + { + "epoch": 3.7839276210750397, + "grad_norm": 0.26446616649627686, + "learning_rate": 2.834633869269366e-08, + "loss": 0.1592, + "step": 14220 + }, + { + "epoch": 3.784193720063864, + "grad_norm": 0.2923186123371124, + "learning_rate": 2.8334559173084916e-08, + "loss": 0.1866, + "step": 14221 + }, + { + "epoch": 3.7844598190526875, + "grad_norm": 0.2821591794490814, + "learning_rate": 2.8322781697520215e-08, + "loss": 0.1799, + "step": 14222 + }, + { + "epoch": 3.7847259180415116, + "grad_norm": 0.6320244669914246, + "learning_rate": 2.8311006266335392e-08, + "loss": 0.1785, + "step": 14223 + }, + { + "epoch": 3.7849920170303353, + "grad_norm": 0.3120080828666687, + "learning_rate": 2.8299232879866352e-08, + "loss": 0.1821, + "step": 14224 + }, + { + "epoch": 3.7852581160191594, + "grad_norm": 0.35138705372810364, + "learning_rate": 2.8287461538448897e-08, + "loss": 0.1787, + "step": 14225 + }, + { + "epoch": 3.785524215007983, + "grad_norm": 0.28736498951911926, + "learning_rate": 2.827569224241877e-08, + "loss": 0.1788, + "step": 14226 + }, + { + "epoch": 3.7857903139968068, + "grad_norm": 0.33972805738449097, + "learning_rate": 2.8263924992111675e-08, + "loss": 0.1624, + "step": 14227 + }, + { + "epoch": 3.786056412985631, + "grad_norm": 0.3247544765472412, + "learning_rate": 2.8252159787863184e-08, + "loss": 0.19, + "step": 14228 + }, + { + "epoch": 3.7863225119744546, + "grad_norm": 0.3712080717086792, + "learning_rate": 2.8240396630008922e-08, + "loss": 0.1859, + "step": 14229 + }, + { + "epoch": 3.7865886109632783, + "grad_norm": 0.3736477494239807, + "learning_rate": 2.8228635518884347e-08, + "loss": 0.1808, + "step": 14230 + }, + { + "epoch": 3.7868547099521024, + "grad_norm": 0.25882574915885925, + "learning_rate": 2.8216876454824923e-08, + "loss": 0.1478, + "step": 14231 + }, + { + "epoch": 3.787120808940926, + "grad_norm": 0.3595260977745056, + "learning_rate": 2.820511943816606e-08, + "loss": 0.1867, + "step": 14232 + }, + { + "epoch": 3.7873869079297497, + "grad_norm": 0.4247523546218872, + "learning_rate": 2.819336446924312e-08, + "loss": 0.1679, + "step": 14233 + }, + { + "epoch": 3.787653006918574, + "grad_norm": 0.35507968068122864, + "learning_rate": 2.8181611548391304e-08, + "loss": 0.1833, + "step": 14234 + }, + { + "epoch": 3.7879191059073976, + "grad_norm": 0.2932634651660919, + "learning_rate": 2.81698606759459e-08, + "loss": 0.1703, + "step": 14235 + }, + { + "epoch": 3.7881852048962212, + "grad_norm": 0.33037152886390686, + "learning_rate": 2.8158111852242017e-08, + "loss": 0.1587, + "step": 14236 + }, + { + "epoch": 3.7884513038850454, + "grad_norm": 0.3791564404964447, + "learning_rate": 2.8146365077614787e-08, + "loss": 0.1688, + "step": 14237 + }, + { + "epoch": 3.788717402873869, + "grad_norm": 0.4256933331489563, + "learning_rate": 2.8134620352399218e-08, + "loss": 0.182, + "step": 14238 + }, + { + "epoch": 3.7889835018626927, + "grad_norm": 0.3863627314567566, + "learning_rate": 2.8122877676930312e-08, + "loss": 0.1853, + "step": 14239 + }, + { + "epoch": 3.789249600851517, + "grad_norm": 0.27439403533935547, + "learning_rate": 2.8111137051543e-08, + "loss": 0.181, + "step": 14240 + }, + { + "epoch": 3.7895156998403405, + "grad_norm": 0.28191789984703064, + "learning_rate": 2.8099398476572166e-08, + "loss": 0.1788, + "step": 14241 + }, + { + "epoch": 3.789781798829164, + "grad_norm": 0.2754057049751282, + "learning_rate": 2.808766195235257e-08, + "loss": 0.1722, + "step": 14242 + }, + { + "epoch": 3.7900478978179883, + "grad_norm": 0.35005325078964233, + "learning_rate": 2.8075927479218987e-08, + "loss": 0.1834, + "step": 14243 + }, + { + "epoch": 3.790313996806812, + "grad_norm": 0.31876206398010254, + "learning_rate": 2.8064195057506147e-08, + "loss": 0.1718, + "step": 14244 + }, + { + "epoch": 3.7905800957956357, + "grad_norm": 0.335448682308197, + "learning_rate": 2.8052464687548605e-08, + "loss": 0.1668, + "step": 14245 + }, + { + "epoch": 3.79084619478446, + "grad_norm": 0.2761988639831543, + "learning_rate": 2.8040736369680973e-08, + "loss": 0.1634, + "step": 14246 + }, + { + "epoch": 3.7911122937732835, + "grad_norm": 0.24124263226985931, + "learning_rate": 2.8029010104237783e-08, + "loss": 0.1448, + "step": 14247 + }, + { + "epoch": 3.7913783927621076, + "grad_norm": 0.2831011712551117, + "learning_rate": 2.801728589155349e-08, + "loss": 0.1765, + "step": 14248 + }, + { + "epoch": 3.7916444917509313, + "grad_norm": 0.3644675016403198, + "learning_rate": 2.8005563731962455e-08, + "loss": 0.18, + "step": 14249 + }, + { + "epoch": 3.7919105907397554, + "grad_norm": 0.26973050832748413, + "learning_rate": 2.7993843625799074e-08, + "loss": 0.1748, + "step": 14250 + }, + { + "epoch": 3.792176689728579, + "grad_norm": 0.33500927686691284, + "learning_rate": 2.7982125573397563e-08, + "loss": 0.1797, + "step": 14251 + }, + { + "epoch": 3.792442788717403, + "grad_norm": 0.2889283001422882, + "learning_rate": 2.7970409575092212e-08, + "loss": 0.173, + "step": 14252 + }, + { + "epoch": 3.792708887706227, + "grad_norm": 0.39351871609687805, + "learning_rate": 2.7958695631217123e-08, + "loss": 0.1717, + "step": 14253 + }, + { + "epoch": 3.7929749866950506, + "grad_norm": 0.36460474133491516, + "learning_rate": 2.7946983742106423e-08, + "loss": 0.1661, + "step": 14254 + }, + { + "epoch": 3.7932410856838743, + "grad_norm": 0.2902750074863434, + "learning_rate": 2.793527390809417e-08, + "loss": 0.1872, + "step": 14255 + }, + { + "epoch": 3.7935071846726984, + "grad_norm": 0.281682550907135, + "learning_rate": 2.7923566129514375e-08, + "loss": 0.1634, + "step": 14256 + }, + { + "epoch": 3.793773283661522, + "grad_norm": 0.44364768266677856, + "learning_rate": 2.7911860406700915e-08, + "loss": 0.1918, + "step": 14257 + }, + { + "epoch": 3.794039382650346, + "grad_norm": 0.34579306840896606, + "learning_rate": 2.790015673998771e-08, + "loss": 0.1803, + "step": 14258 + }, + { + "epoch": 3.79430548163917, + "grad_norm": 0.3662475645542145, + "learning_rate": 2.7888455129708522e-08, + "loss": 0.1733, + "step": 14259 + }, + { + "epoch": 3.7945715806279936, + "grad_norm": 0.3422587513923645, + "learning_rate": 2.787675557619713e-08, + "loss": 0.1717, + "step": 14260 + }, + { + "epoch": 3.7948376796168173, + "grad_norm": 0.2664075195789337, + "learning_rate": 2.7865058079787228e-08, + "loss": 0.1601, + "step": 14261 + }, + { + "epoch": 3.7951037786056414, + "grad_norm": 0.28835874795913696, + "learning_rate": 2.785336264081246e-08, + "loss": 0.1867, + "step": 14262 + }, + { + "epoch": 3.795369877594465, + "grad_norm": 0.28420206904411316, + "learning_rate": 2.7841669259606426e-08, + "loss": 0.1871, + "step": 14263 + }, + { + "epoch": 3.7956359765832888, + "grad_norm": 0.32249459624290466, + "learning_rate": 2.782997793650259e-08, + "loss": 0.194, + "step": 14264 + }, + { + "epoch": 3.795902075572113, + "grad_norm": 0.35034796595573425, + "learning_rate": 2.7818288671834466e-08, + "loss": 0.1849, + "step": 14265 + }, + { + "epoch": 3.7961681745609366, + "grad_norm": 0.2648886740207672, + "learning_rate": 2.7806601465935398e-08, + "loss": 0.1684, + "step": 14266 + }, + { + "epoch": 3.7964342735497603, + "grad_norm": 0.25828343629837036, + "learning_rate": 2.7794916319138795e-08, + "loss": 0.1805, + "step": 14267 + }, + { + "epoch": 3.7967003725385844, + "grad_norm": 0.3396303951740265, + "learning_rate": 2.7783233231777847e-08, + "loss": 0.1808, + "step": 14268 + }, + { + "epoch": 3.796966471527408, + "grad_norm": 0.291789710521698, + "learning_rate": 2.777155220418589e-08, + "loss": 0.1812, + "step": 14269 + }, + { + "epoch": 3.797232570516232, + "grad_norm": 0.4013032019138336, + "learning_rate": 2.775987323669602e-08, + "loss": 0.1572, + "step": 14270 + }, + { + "epoch": 3.797498669505056, + "grad_norm": 0.3579927384853363, + "learning_rate": 2.774819632964138e-08, + "loss": 0.1844, + "step": 14271 + }, + { + "epoch": 3.7977647684938796, + "grad_norm": 0.35520389676094055, + "learning_rate": 2.7736521483354992e-08, + "loss": 0.1857, + "step": 14272 + }, + { + "epoch": 3.7980308674827037, + "grad_norm": 0.6234651803970337, + "learning_rate": 2.7724848698169878e-08, + "loss": 0.1849, + "step": 14273 + }, + { + "epoch": 3.7982969664715274, + "grad_norm": 0.35257819294929504, + "learning_rate": 2.7713177974418922e-08, + "loss": 0.1776, + "step": 14274 + }, + { + "epoch": 3.7985630654603515, + "grad_norm": 0.31222638487815857, + "learning_rate": 2.7701509312435022e-08, + "loss": 0.1973, + "step": 14275 + }, + { + "epoch": 3.798829164449175, + "grad_norm": 0.39012718200683594, + "learning_rate": 2.7689842712550993e-08, + "loss": 0.1798, + "step": 14276 + }, + { + "epoch": 3.799095263437999, + "grad_norm": 0.37936916947364807, + "learning_rate": 2.767817817509962e-08, + "loss": 0.1928, + "step": 14277 + }, + { + "epoch": 3.799361362426823, + "grad_norm": 0.27059662342071533, + "learning_rate": 2.766651570041354e-08, + "loss": 0.1847, + "step": 14278 + }, + { + "epoch": 3.7996274614156467, + "grad_norm": 0.48001018166542053, + "learning_rate": 2.765485528882543e-08, + "loss": 0.2063, + "step": 14279 + }, + { + "epoch": 3.7998935604044703, + "grad_norm": 0.5210527181625366, + "learning_rate": 2.7643196940667878e-08, + "loss": 0.1813, + "step": 14280 + }, + { + "epoch": 3.8001596593932945, + "grad_norm": 0.28353047370910645, + "learning_rate": 2.7631540656273356e-08, + "loss": 0.1816, + "step": 14281 + }, + { + "epoch": 3.800425758382118, + "grad_norm": 0.2646796405315399, + "learning_rate": 2.761988643597436e-08, + "loss": 0.1669, + "step": 14282 + }, + { + "epoch": 3.800691857370942, + "grad_norm": 0.2788282632827759, + "learning_rate": 2.7608234280103294e-08, + "loss": 0.1725, + "step": 14283 + }, + { + "epoch": 3.800957956359766, + "grad_norm": 0.2560083866119385, + "learning_rate": 2.759658418899251e-08, + "loss": 0.1616, + "step": 14284 + }, + { + "epoch": 3.8012240553485896, + "grad_norm": 0.3923298120498657, + "learning_rate": 2.758493616297425e-08, + "loss": 0.1872, + "step": 14285 + }, + { + "epoch": 3.8014901543374133, + "grad_norm": 0.27148088812828064, + "learning_rate": 2.7573290202380796e-08, + "loss": 0.1739, + "step": 14286 + }, + { + "epoch": 3.8017562533262375, + "grad_norm": 0.28193551301956177, + "learning_rate": 2.756164630754425e-08, + "loss": 0.1603, + "step": 14287 + }, + { + "epoch": 3.802022352315061, + "grad_norm": 0.36214199662208557, + "learning_rate": 2.7550004478796785e-08, + "loss": 0.183, + "step": 14288 + }, + { + "epoch": 3.802288451303885, + "grad_norm": 0.4157964885234833, + "learning_rate": 2.7538364716470387e-08, + "loss": 0.1928, + "step": 14289 + }, + { + "epoch": 3.802554550292709, + "grad_norm": 0.3933499753475189, + "learning_rate": 2.7526727020897078e-08, + "loss": 0.1996, + "step": 14290 + }, + { + "epoch": 3.8028206492815326, + "grad_norm": 0.512622594833374, + "learning_rate": 2.7515091392408786e-08, + "loss": 0.2001, + "step": 14291 + }, + { + "epoch": 3.8030867482703563, + "grad_norm": 0.2741031050682068, + "learning_rate": 2.750345783133742e-08, + "loss": 0.1821, + "step": 14292 + }, + { + "epoch": 3.8033528472591804, + "grad_norm": 0.3764447867870331, + "learning_rate": 2.749182633801471e-08, + "loss": 0.201, + "step": 14293 + }, + { + "epoch": 3.803618946248004, + "grad_norm": 0.31687983870506287, + "learning_rate": 2.7480196912772503e-08, + "loss": 0.1579, + "step": 14294 + }, + { + "epoch": 3.8038850452368282, + "grad_norm": 0.27523136138916016, + "learning_rate": 2.7468569555942413e-08, + "loss": 0.1771, + "step": 14295 + }, + { + "epoch": 3.804151144225652, + "grad_norm": 0.2795974314212799, + "learning_rate": 2.7456944267856108e-08, + "loss": 0.1678, + "step": 14296 + }, + { + "epoch": 3.804417243214476, + "grad_norm": 0.3686608374118805, + "learning_rate": 2.7445321048845172e-08, + "loss": 0.1769, + "step": 14297 + }, + { + "epoch": 3.8046833422032997, + "grad_norm": 0.9595808982849121, + "learning_rate": 2.7433699899241113e-08, + "loss": 0.1978, + "step": 14298 + }, + { + "epoch": 3.8049494411921234, + "grad_norm": 0.2992953360080719, + "learning_rate": 2.7422080819375427e-08, + "loss": 0.1695, + "step": 14299 + }, + { + "epoch": 3.8052155401809475, + "grad_norm": 1.1154283285140991, + "learning_rate": 2.7410463809579454e-08, + "loss": 0.1665, + "step": 14300 + }, + { + "epoch": 3.8054816391697712, + "grad_norm": 0.2935495674610138, + "learning_rate": 2.7398848870184588e-08, + "loss": 0.1759, + "step": 14301 + }, + { + "epoch": 3.805747738158595, + "grad_norm": 0.5680873990058899, + "learning_rate": 2.738723600152205e-08, + "loss": 0.165, + "step": 14302 + }, + { + "epoch": 3.806013837147419, + "grad_norm": 0.3785777986049652, + "learning_rate": 2.7375625203923135e-08, + "loss": 0.194, + "step": 14303 + }, + { + "epoch": 3.8062799361362427, + "grad_norm": 0.30649420619010925, + "learning_rate": 2.736401647771893e-08, + "loss": 0.1842, + "step": 14304 + }, + { + "epoch": 3.8065460351250664, + "grad_norm": 0.2694842219352722, + "learning_rate": 2.7352409823240586e-08, + "loss": 0.1668, + "step": 14305 + }, + { + "epoch": 3.8068121341138905, + "grad_norm": 0.26906993985176086, + "learning_rate": 2.7340805240819144e-08, + "loss": 0.187, + "step": 14306 + }, + { + "epoch": 3.807078233102714, + "grad_norm": 0.3813103437423706, + "learning_rate": 2.7329202730785617e-08, + "loss": 0.1694, + "step": 14307 + }, + { + "epoch": 3.807344332091538, + "grad_norm": 0.26238784193992615, + "learning_rate": 2.7317602293470866e-08, + "loss": 0.1645, + "step": 14308 + }, + { + "epoch": 3.807610431080362, + "grad_norm": 0.28934746980667114, + "learning_rate": 2.7306003929205824e-08, + "loss": 0.1647, + "step": 14309 + }, + { + "epoch": 3.8078765300691857, + "grad_norm": 0.2574230134487152, + "learning_rate": 2.729440763832125e-08, + "loss": 0.1701, + "step": 14310 + }, + { + "epoch": 3.8081426290580094, + "grad_norm": 0.2674959897994995, + "learning_rate": 2.728281342114791e-08, + "loss": 0.1619, + "step": 14311 + }, + { + "epoch": 3.8084087280468335, + "grad_norm": 0.2834394574165344, + "learning_rate": 2.72712212780165e-08, + "loss": 0.1782, + "step": 14312 + }, + { + "epoch": 3.808674827035657, + "grad_norm": 0.2772866487503052, + "learning_rate": 2.7259631209257672e-08, + "loss": 0.1856, + "step": 14313 + }, + { + "epoch": 3.808940926024481, + "grad_norm": 0.3607513904571533, + "learning_rate": 2.7248043215201956e-08, + "loss": 0.1718, + "step": 14314 + }, + { + "epoch": 3.809207025013305, + "grad_norm": 0.3888155221939087, + "learning_rate": 2.7236457296179893e-08, + "loss": 0.1819, + "step": 14315 + }, + { + "epoch": 3.8094731240021287, + "grad_norm": 0.31931740045547485, + "learning_rate": 2.7224873452521945e-08, + "loss": 0.1755, + "step": 14316 + }, + { + "epoch": 3.809739222990953, + "grad_norm": 0.34817126393318176, + "learning_rate": 2.7213291684558482e-08, + "loss": 0.1853, + "step": 14317 + }, + { + "epoch": 3.8100053219797765, + "grad_norm": 0.35399916768074036, + "learning_rate": 2.720171199261987e-08, + "loss": 0.1746, + "step": 14318 + }, + { + "epoch": 3.8102714209686, + "grad_norm": 0.2703126072883606, + "learning_rate": 2.71901343770363e-08, + "loss": 0.1689, + "step": 14319 + }, + { + "epoch": 3.8105375199574243, + "grad_norm": 0.4877060055732727, + "learning_rate": 2.717855883813812e-08, + "loss": 0.1824, + "step": 14320 + }, + { + "epoch": 3.810803618946248, + "grad_norm": 0.2760414183139801, + "learning_rate": 2.716698537625539e-08, + "loss": 0.1625, + "step": 14321 + }, + { + "epoch": 3.811069717935072, + "grad_norm": 0.34847432374954224, + "learning_rate": 2.7155413991718268e-08, + "loss": 0.1742, + "step": 14322 + }, + { + "epoch": 3.811335816923896, + "grad_norm": 0.2737325131893158, + "learning_rate": 2.714384468485674e-08, + "loss": 0.1643, + "step": 14323 + }, + { + "epoch": 3.8116019159127195, + "grad_norm": 0.28464043140411377, + "learning_rate": 2.713227745600084e-08, + "loss": 0.1845, + "step": 14324 + }, + { + "epoch": 3.8118680149015436, + "grad_norm": 0.27516913414001465, + "learning_rate": 2.712071230548043e-08, + "loss": 0.1738, + "step": 14325 + }, + { + "epoch": 3.8121341138903673, + "grad_norm": 0.2846717834472656, + "learning_rate": 2.7109149233625417e-08, + "loss": 0.1712, + "step": 14326 + }, + { + "epoch": 3.812400212879191, + "grad_norm": 0.2804820239543915, + "learning_rate": 2.7097588240765568e-08, + "loss": 0.1731, + "step": 14327 + }, + { + "epoch": 3.812666311868015, + "grad_norm": 0.34139347076416016, + "learning_rate": 2.7086029327230696e-08, + "loss": 0.1869, + "step": 14328 + }, + { + "epoch": 3.8129324108568388, + "grad_norm": 0.29146236181259155, + "learning_rate": 2.70744724933504e-08, + "loss": 0.1683, + "step": 14329 + }, + { + "epoch": 3.8131985098456624, + "grad_norm": 0.358633816242218, + "learning_rate": 2.7062917739454362e-08, + "loss": 0.2074, + "step": 14330 + }, + { + "epoch": 3.8134646088344866, + "grad_norm": 0.2628372311592102, + "learning_rate": 2.70513650658721e-08, + "loss": 0.1594, + "step": 14331 + }, + { + "epoch": 3.8137307078233103, + "grad_norm": 0.2946184277534485, + "learning_rate": 2.7039814472933143e-08, + "loss": 0.1786, + "step": 14332 + }, + { + "epoch": 3.813996806812134, + "grad_norm": 0.481552392244339, + "learning_rate": 2.7028265960966946e-08, + "loss": 0.186, + "step": 14333 + }, + { + "epoch": 3.814262905800958, + "grad_norm": 0.6860707402229309, + "learning_rate": 2.701671953030289e-08, + "loss": 0.18, + "step": 14334 + }, + { + "epoch": 3.8145290047897817, + "grad_norm": 0.29260531067848206, + "learning_rate": 2.7005175181270324e-08, + "loss": 0.1723, + "step": 14335 + }, + { + "epoch": 3.8147951037786054, + "grad_norm": 0.3284158408641815, + "learning_rate": 2.6993632914198473e-08, + "loss": 0.192, + "step": 14336 + }, + { + "epoch": 3.8150612027674295, + "grad_norm": 0.3431093394756317, + "learning_rate": 2.6982092729416582e-08, + "loss": 0.1731, + "step": 14337 + }, + { + "epoch": 3.8153273017562532, + "grad_norm": 0.2780545949935913, + "learning_rate": 2.6970554627253772e-08, + "loss": 0.1624, + "step": 14338 + }, + { + "epoch": 3.815593400745077, + "grad_norm": 0.28230783343315125, + "learning_rate": 2.6959018608039176e-08, + "loss": 0.1916, + "step": 14339 + }, + { + "epoch": 3.815859499733901, + "grad_norm": 0.27826985716819763, + "learning_rate": 2.6947484672101763e-08, + "loss": 0.1737, + "step": 14340 + }, + { + "epoch": 3.8161255987227247, + "grad_norm": 0.27851444482803345, + "learning_rate": 2.6935952819770525e-08, + "loss": 0.1801, + "step": 14341 + }, + { + "epoch": 3.816391697711549, + "grad_norm": 0.26580318808555603, + "learning_rate": 2.69244230513744e-08, + "loss": 0.1761, + "step": 14342 + }, + { + "epoch": 3.8166577967003725, + "grad_norm": 0.2934257984161377, + "learning_rate": 2.691289536724225e-08, + "loss": 0.1781, + "step": 14343 + }, + { + "epoch": 3.8169238956891967, + "grad_norm": 0.26829928159713745, + "learning_rate": 2.6901369767702818e-08, + "loss": 0.1728, + "step": 14344 + }, + { + "epoch": 3.8171899946780203, + "grad_norm": 0.2518172860145569, + "learning_rate": 2.688984625308489e-08, + "loss": 0.1676, + "step": 14345 + }, + { + "epoch": 3.817456093666844, + "grad_norm": 0.3019217848777771, + "learning_rate": 2.687832482371709e-08, + "loss": 0.1985, + "step": 14346 + }, + { + "epoch": 3.817722192655668, + "grad_norm": 0.3563200533390045, + "learning_rate": 2.6866805479928046e-08, + "loss": 0.182, + "step": 14347 + }, + { + "epoch": 3.817988291644492, + "grad_norm": 0.4409105181694031, + "learning_rate": 2.6855288222046336e-08, + "loss": 0.1871, + "step": 14348 + }, + { + "epoch": 3.8182543906333155, + "grad_norm": 0.30122414231300354, + "learning_rate": 2.6843773050400465e-08, + "loss": 0.1962, + "step": 14349 + }, + { + "epoch": 3.8185204896221396, + "grad_norm": 0.2715294659137726, + "learning_rate": 2.6832259965318817e-08, + "loss": 0.1732, + "step": 14350 + }, + { + "epoch": 3.8187865886109633, + "grad_norm": 0.29111143946647644, + "learning_rate": 2.6820748967129803e-08, + "loss": 0.1841, + "step": 14351 + }, + { + "epoch": 3.819052687599787, + "grad_norm": 0.3020817041397095, + "learning_rate": 2.6809240056161764e-08, + "loss": 0.1598, + "step": 14352 + }, + { + "epoch": 3.819318786588611, + "grad_norm": 0.43484237790107727, + "learning_rate": 2.6797733232742903e-08, + "loss": 0.1825, + "step": 14353 + }, + { + "epoch": 3.819584885577435, + "grad_norm": 0.40445849299430847, + "learning_rate": 2.678622849720147e-08, + "loss": 0.1947, + "step": 14354 + }, + { + "epoch": 3.8198509845662585, + "grad_norm": 0.274053156375885, + "learning_rate": 2.6774725849865543e-08, + "loss": 0.1687, + "step": 14355 + }, + { + "epoch": 3.8201170835550826, + "grad_norm": 0.2841004729270935, + "learning_rate": 2.676322529106324e-08, + "loss": 0.1773, + "step": 14356 + }, + { + "epoch": 3.8203831825439063, + "grad_norm": 0.38358885049819946, + "learning_rate": 2.675172682112258e-08, + "loss": 0.1763, + "step": 14357 + }, + { + "epoch": 3.82064928153273, + "grad_norm": 0.3452725112438202, + "learning_rate": 2.6740230440371546e-08, + "loss": 0.1607, + "step": 14358 + }, + { + "epoch": 3.820915380521554, + "grad_norm": 0.3478056490421295, + "learning_rate": 2.6728736149137977e-08, + "loss": 0.185, + "step": 14359 + }, + { + "epoch": 3.821181479510378, + "grad_norm": 0.3516692519187927, + "learning_rate": 2.671724394774978e-08, + "loss": 0.1874, + "step": 14360 + }, + { + "epoch": 3.8214475784992015, + "grad_norm": 0.2937411367893219, + "learning_rate": 2.6705753836534674e-08, + "loss": 0.1922, + "step": 14361 + }, + { + "epoch": 3.8217136774880256, + "grad_norm": 0.28313156962394714, + "learning_rate": 2.6694265815820404e-08, + "loss": 0.1572, + "step": 14362 + }, + { + "epoch": 3.8219797764768493, + "grad_norm": 0.38082993030548096, + "learning_rate": 2.6682779885934646e-08, + "loss": 0.188, + "step": 14363 + }, + { + "epoch": 3.822245875465673, + "grad_norm": 0.2830566465854645, + "learning_rate": 2.6671296047205026e-08, + "loss": 0.1634, + "step": 14364 + }, + { + "epoch": 3.822511974454497, + "grad_norm": 0.3443983197212219, + "learning_rate": 2.6659814299959015e-08, + "loss": 0.1728, + "step": 14365 + }, + { + "epoch": 3.8227780734433208, + "grad_norm": 0.28783154487609863, + "learning_rate": 2.6648334644524163e-08, + "loss": 0.1702, + "step": 14366 + }, + { + "epoch": 3.823044172432145, + "grad_norm": 0.27720171213150024, + "learning_rate": 2.663685708122785e-08, + "loss": 0.1811, + "step": 14367 + }, + { + "epoch": 3.8233102714209686, + "grad_norm": 0.29655492305755615, + "learning_rate": 2.662538161039747e-08, + "loss": 0.1824, + "step": 14368 + }, + { + "epoch": 3.8235763704097927, + "grad_norm": 0.2758360505104065, + "learning_rate": 2.661390823236028e-08, + "loss": 0.1852, + "step": 14369 + }, + { + "epoch": 3.8238424693986164, + "grad_norm": 0.5992519855499268, + "learning_rate": 2.660243694744353e-08, + "loss": 0.1756, + "step": 14370 + }, + { + "epoch": 3.82410856838744, + "grad_norm": 0.2693013846874237, + "learning_rate": 2.6590967755974492e-08, + "loss": 0.1729, + "step": 14371 + }, + { + "epoch": 3.824374667376264, + "grad_norm": 0.3793376088142395, + "learning_rate": 2.6579500658280185e-08, + "loss": 0.1925, + "step": 14372 + }, + { + "epoch": 3.824640766365088, + "grad_norm": 0.36851051449775696, + "learning_rate": 2.656803565468776e-08, + "loss": 0.1835, + "step": 14373 + }, + { + "epoch": 3.8249068653539116, + "grad_norm": 0.2955033779144287, + "learning_rate": 2.6556572745524142e-08, + "loss": 0.182, + "step": 14374 + }, + { + "epoch": 3.8251729643427357, + "grad_norm": 0.25774818658828735, + "learning_rate": 2.654511193111635e-08, + "loss": 0.18, + "step": 14375 + }, + { + "epoch": 3.8254390633315594, + "grad_norm": 0.2757955491542816, + "learning_rate": 2.6533653211791195e-08, + "loss": 0.1771, + "step": 14376 + }, + { + "epoch": 3.825705162320383, + "grad_norm": 0.892677903175354, + "learning_rate": 2.6522196587875545e-08, + "loss": 0.1924, + "step": 14377 + }, + { + "epoch": 3.825971261309207, + "grad_norm": 0.335078626871109, + "learning_rate": 2.6510742059696168e-08, + "loss": 0.1648, + "step": 14378 + }, + { + "epoch": 3.826237360298031, + "grad_norm": 0.3245508372783661, + "learning_rate": 2.649928962757979e-08, + "loss": 0.1851, + "step": 14379 + }, + { + "epoch": 3.8265034592868545, + "grad_norm": 0.326860636472702, + "learning_rate": 2.6487839291852998e-08, + "loss": 0.1901, + "step": 14380 + }, + { + "epoch": 3.8267695582756787, + "grad_norm": 0.32772544026374817, + "learning_rate": 2.6476391052842452e-08, + "loss": 0.1887, + "step": 14381 + }, + { + "epoch": 3.8270356572645023, + "grad_norm": 0.26524674892425537, + "learning_rate": 2.646494491087461e-08, + "loss": 0.1626, + "step": 14382 + }, + { + "epoch": 3.827301756253326, + "grad_norm": 0.32964155077934265, + "learning_rate": 2.6453500866275957e-08, + "loss": 0.1906, + "step": 14383 + }, + { + "epoch": 3.82756785524215, + "grad_norm": 0.2606751322746277, + "learning_rate": 2.6442058919372924e-08, + "loss": 0.1804, + "step": 14384 + }, + { + "epoch": 3.827833954230974, + "grad_norm": 0.30864235758781433, + "learning_rate": 2.6430619070491876e-08, + "loss": 0.183, + "step": 14385 + }, + { + "epoch": 3.8281000532197975, + "grad_norm": 0.37797224521636963, + "learning_rate": 2.6419181319959037e-08, + "loss": 0.1865, + "step": 14386 + }, + { + "epoch": 3.8283661522086216, + "grad_norm": 0.541749894618988, + "learning_rate": 2.640774566810071e-08, + "loss": 0.1888, + "step": 14387 + }, + { + "epoch": 3.8286322511974453, + "grad_norm": 0.3610127568244934, + "learning_rate": 2.639631211524298e-08, + "loss": 0.1645, + "step": 14388 + }, + { + "epoch": 3.8288983501862695, + "grad_norm": 0.3793638050556183, + "learning_rate": 2.6384880661712004e-08, + "loss": 0.1626, + "step": 14389 + }, + { + "epoch": 3.829164449175093, + "grad_norm": 0.3115416467189789, + "learning_rate": 2.637345130783386e-08, + "loss": 0.1777, + "step": 14390 + }, + { + "epoch": 3.829430548163917, + "grad_norm": 0.2685205936431885, + "learning_rate": 2.636202405393447e-08, + "loss": 0.1671, + "step": 14391 + }, + { + "epoch": 3.829696647152741, + "grad_norm": 0.2569943964481354, + "learning_rate": 2.6350598900339794e-08, + "loss": 0.1592, + "step": 14392 + }, + { + "epoch": 3.8299627461415646, + "grad_norm": 0.44210416078567505, + "learning_rate": 2.6339175847375696e-08, + "loss": 0.1859, + "step": 14393 + }, + { + "epoch": 3.8302288451303887, + "grad_norm": 0.2937765419483185, + "learning_rate": 2.6327754895368016e-08, + "loss": 0.1789, + "step": 14394 + }, + { + "epoch": 3.8304949441192124, + "grad_norm": 0.33021777868270874, + "learning_rate": 2.631633604464245e-08, + "loss": 0.1981, + "step": 14395 + }, + { + "epoch": 3.830761043108036, + "grad_norm": 0.359943687915802, + "learning_rate": 2.630491929552474e-08, + "loss": 0.1799, + "step": 14396 + }, + { + "epoch": 3.8310271420968602, + "grad_norm": 0.2780478894710541, + "learning_rate": 2.629350464834046e-08, + "loss": 0.1786, + "step": 14397 + }, + { + "epoch": 3.831293241085684, + "grad_norm": 0.3742665946483612, + "learning_rate": 2.62820921034152e-08, + "loss": 0.1761, + "step": 14398 + }, + { + "epoch": 3.8315593400745076, + "grad_norm": 0.2828143835067749, + "learning_rate": 2.627068166107448e-08, + "loss": 0.1753, + "step": 14399 + }, + { + "epoch": 3.8318254390633317, + "grad_norm": 0.36868199706077576, + "learning_rate": 2.6259273321643783e-08, + "loss": 0.1886, + "step": 14400 + }, + { + "epoch": 3.8320915380521554, + "grad_norm": 0.2503144145011902, + "learning_rate": 2.6247867085448418e-08, + "loss": 0.1587, + "step": 14401 + }, + { + "epoch": 3.832357637040979, + "grad_norm": 0.29210010170936584, + "learning_rate": 2.6236462952813786e-08, + "loss": 0.174, + "step": 14402 + }, + { + "epoch": 3.832623736029803, + "grad_norm": 0.34149351716041565, + "learning_rate": 2.6225060924065102e-08, + "loss": 0.1943, + "step": 14403 + }, + { + "epoch": 3.832889835018627, + "grad_norm": 0.272414892911911, + "learning_rate": 2.621366099952762e-08, + "loss": 0.1849, + "step": 14404 + }, + { + "epoch": 3.8331559340074506, + "grad_norm": 0.25964346528053284, + "learning_rate": 2.620226317952644e-08, + "loss": 0.1749, + "step": 14405 + }, + { + "epoch": 3.8334220329962747, + "grad_norm": 0.25095134973526, + "learning_rate": 2.619086746438668e-08, + "loss": 0.1565, + "step": 14406 + }, + { + "epoch": 3.8336881319850984, + "grad_norm": 0.28657427430152893, + "learning_rate": 2.6179473854433366e-08, + "loss": 0.1686, + "step": 14407 + }, + { + "epoch": 3.833954230973922, + "grad_norm": 0.36236658692359924, + "learning_rate": 2.616808234999146e-08, + "loss": 0.1768, + "step": 14408 + }, + { + "epoch": 3.834220329962746, + "grad_norm": 0.2813038229942322, + "learning_rate": 2.615669295138593e-08, + "loss": 0.1891, + "step": 14409 + }, + { + "epoch": 3.83448642895157, + "grad_norm": 0.29320529103279114, + "learning_rate": 2.6145305658941517e-08, + "loss": 0.1655, + "step": 14410 + }, + { + "epoch": 3.8347525279403936, + "grad_norm": 0.3407110869884491, + "learning_rate": 2.6133920472983117e-08, + "loss": 0.1785, + "step": 14411 + }, + { + "epoch": 3.8350186269292177, + "grad_norm": 0.4261418581008911, + "learning_rate": 2.6122537393835377e-08, + "loss": 0.1942, + "step": 14412 + }, + { + "epoch": 3.8352847259180414, + "grad_norm": 0.3294903039932251, + "learning_rate": 2.6111156421822988e-08, + "loss": 0.17, + "step": 14413 + }, + { + "epoch": 3.8355508249068655, + "grad_norm": 0.3452624976634979, + "learning_rate": 2.6099777557270573e-08, + "loss": 0.1857, + "step": 14414 + }, + { + "epoch": 3.835816923895689, + "grad_norm": 0.28405478596687317, + "learning_rate": 2.6088400800502698e-08, + "loss": 0.1729, + "step": 14415 + }, + { + "epoch": 3.8360830228845133, + "grad_norm": 0.2634807527065277, + "learning_rate": 2.6077026151843806e-08, + "loss": 0.1769, + "step": 14416 + }, + { + "epoch": 3.836349121873337, + "grad_norm": 0.24023090302944183, + "learning_rate": 2.6065653611618366e-08, + "loss": 0.1495, + "step": 14417 + }, + { + "epoch": 3.8366152208621607, + "grad_norm": 0.2739808261394501, + "learning_rate": 2.6054283180150703e-08, + "loss": 0.1662, + "step": 14418 + }, + { + "epoch": 3.836881319850985, + "grad_norm": 0.25979653000831604, + "learning_rate": 2.6042914857765185e-08, + "loss": 0.1584, + "step": 14419 + }, + { + "epoch": 3.8371474188398085, + "grad_norm": 0.42245832085609436, + "learning_rate": 2.603154864478596e-08, + "loss": 0.1812, + "step": 14420 + }, + { + "epoch": 3.837413517828632, + "grad_norm": 0.2371295988559723, + "learning_rate": 2.6020184541537326e-08, + "loss": 0.1575, + "step": 14421 + }, + { + "epoch": 3.8376796168174563, + "grad_norm": 0.3048984110355377, + "learning_rate": 2.6008822548343346e-08, + "loss": 0.1897, + "step": 14422 + }, + { + "epoch": 3.83794571580628, + "grad_norm": 0.377187043428421, + "learning_rate": 2.5997462665528136e-08, + "loss": 0.1906, + "step": 14423 + }, + { + "epoch": 3.8382118147951036, + "grad_norm": 0.7663853764533997, + "learning_rate": 2.598610489341564e-08, + "loss": 0.1939, + "step": 14424 + }, + { + "epoch": 3.8384779137839278, + "grad_norm": 0.33875560760498047, + "learning_rate": 2.597474923232983e-08, + "loss": 0.2053, + "step": 14425 + }, + { + "epoch": 3.8387440127727515, + "grad_norm": 0.25636768341064453, + "learning_rate": 2.5963395682594625e-08, + "loss": 0.1793, + "step": 14426 + }, + { + "epoch": 3.839010111761575, + "grad_norm": 0.27916428446769714, + "learning_rate": 2.59520442445338e-08, + "loss": 0.1655, + "step": 14427 + }, + { + "epoch": 3.8392762107503993, + "grad_norm": 0.2811829447746277, + "learning_rate": 2.5940694918471152e-08, + "loss": 0.1725, + "step": 14428 + }, + { + "epoch": 3.839542309739223, + "grad_norm": 0.30343249440193176, + "learning_rate": 2.5929347704730375e-08, + "loss": 0.1726, + "step": 14429 + }, + { + "epoch": 3.8398084087280466, + "grad_norm": 0.3257485628128052, + "learning_rate": 2.5918002603635146e-08, + "loss": 0.1621, + "step": 14430 + }, + { + "epoch": 3.8400745077168708, + "grad_norm": 0.272657185792923, + "learning_rate": 2.5906659615508997e-08, + "loss": 0.1651, + "step": 14431 + }, + { + "epoch": 3.8403406067056944, + "grad_norm": 0.36646443605422974, + "learning_rate": 2.589531874067552e-08, + "loss": 0.1738, + "step": 14432 + }, + { + "epoch": 3.840606705694518, + "grad_norm": 0.28719210624694824, + "learning_rate": 2.5883979979458093e-08, + "loss": 0.1584, + "step": 14433 + }, + { + "epoch": 3.8408728046833422, + "grad_norm": 0.30497443675994873, + "learning_rate": 2.5872643332180178e-08, + "loss": 0.1803, + "step": 14434 + }, + { + "epoch": 3.841138903672166, + "grad_norm": 0.2672058045864105, + "learning_rate": 2.5861308799165115e-08, + "loss": 0.1661, + "step": 14435 + }, + { + "epoch": 3.84140500266099, + "grad_norm": 0.25556665658950806, + "learning_rate": 2.5849976380736203e-08, + "loss": 0.155, + "step": 14436 + }, + { + "epoch": 3.8416711016498137, + "grad_norm": 0.2566050887107849, + "learning_rate": 2.583864607721662e-08, + "loss": 0.1622, + "step": 14437 + }, + { + "epoch": 3.8419372006386374, + "grad_norm": 0.37906894087791443, + "learning_rate": 2.5827317888929578e-08, + "loss": 0.1822, + "step": 14438 + }, + { + "epoch": 3.8422032996274615, + "grad_norm": 0.2910173237323761, + "learning_rate": 2.5815991816198134e-08, + "loss": 0.1865, + "step": 14439 + }, + { + "epoch": 3.8424693986162852, + "grad_norm": 0.34296974539756775, + "learning_rate": 2.5804667859345373e-08, + "loss": 0.1719, + "step": 14440 + }, + { + "epoch": 3.8427354976051094, + "grad_norm": 0.26524075865745544, + "learning_rate": 2.5793346018694228e-08, + "loss": 0.175, + "step": 14441 + }, + { + "epoch": 3.843001596593933, + "grad_norm": 0.5206151008605957, + "learning_rate": 2.578202629456766e-08, + "loss": 0.1883, + "step": 14442 + }, + { + "epoch": 3.8432676955827567, + "grad_norm": 0.4464775323867798, + "learning_rate": 2.577070868728851e-08, + "loss": 0.1928, + "step": 14443 + }, + { + "epoch": 3.843533794571581, + "grad_norm": 0.4200495481491089, + "learning_rate": 2.57593931971796e-08, + "loss": 0.2016, + "step": 14444 + }, + { + "epoch": 3.8437998935604045, + "grad_norm": 0.2922672629356384, + "learning_rate": 2.5748079824563683e-08, + "loss": 0.1805, + "step": 14445 + }, + { + "epoch": 3.844065992549228, + "grad_norm": 0.27867943048477173, + "learning_rate": 2.573676856976338e-08, + "loss": 0.183, + "step": 14446 + }, + { + "epoch": 3.8443320915380523, + "grad_norm": 0.28912216424942017, + "learning_rate": 2.572545943310138e-08, + "loss": 0.165, + "step": 14447 + }, + { + "epoch": 3.844598190526876, + "grad_norm": 0.2880892753601074, + "learning_rate": 2.5714152414900193e-08, + "loss": 0.1692, + "step": 14448 + }, + { + "epoch": 3.8448642895156997, + "grad_norm": 0.4934036135673523, + "learning_rate": 2.5702847515482328e-08, + "loss": 0.1927, + "step": 14449 + }, + { + "epoch": 3.845130388504524, + "grad_norm": 0.27584049105644226, + "learning_rate": 2.5691544735170222e-08, + "loss": 0.1653, + "step": 14450 + }, + { + "epoch": 3.8453964874933475, + "grad_norm": 0.3682119846343994, + "learning_rate": 2.5680244074286306e-08, + "loss": 0.1945, + "step": 14451 + }, + { + "epoch": 3.845662586482171, + "grad_norm": 0.29605910181999207, + "learning_rate": 2.566894553315282e-08, + "loss": 0.1846, + "step": 14452 + }, + { + "epoch": 3.8459286854709953, + "grad_norm": 0.26024314761161804, + "learning_rate": 2.5657649112092073e-08, + "loss": 0.1701, + "step": 14453 + }, + { + "epoch": 3.846194784459819, + "grad_norm": 0.4487203359603882, + "learning_rate": 2.564635481142623e-08, + "loss": 0.1977, + "step": 14454 + }, + { + "epoch": 3.8464608834486427, + "grad_norm": 0.28015950322151184, + "learning_rate": 2.5635062631477466e-08, + "loss": 0.1718, + "step": 14455 + }, + { + "epoch": 3.846726982437467, + "grad_norm": 0.3335394263267517, + "learning_rate": 2.5623772572567804e-08, + "loss": 0.1734, + "step": 14456 + }, + { + "epoch": 3.8469930814262905, + "grad_norm": 0.2806416153907776, + "learning_rate": 2.5612484635019284e-08, + "loss": 0.1647, + "step": 14457 + }, + { + "epoch": 3.847259180415114, + "grad_norm": 0.3452981114387512, + "learning_rate": 2.560119881915387e-08, + "loss": 0.1693, + "step": 14458 + }, + { + "epoch": 3.8475252794039383, + "grad_norm": 0.33527567982673645, + "learning_rate": 2.558991512529347e-08, + "loss": 0.1809, + "step": 14459 + }, + { + "epoch": 3.847791378392762, + "grad_norm": 0.2743940055370331, + "learning_rate": 2.5578633553759875e-08, + "loss": 0.1798, + "step": 14460 + }, + { + "epoch": 3.848057477381586, + "grad_norm": 0.4053594470024109, + "learning_rate": 2.5567354104874895e-08, + "loss": 0.2035, + "step": 14461 + }, + { + "epoch": 3.84832357637041, + "grad_norm": 0.3049611449241638, + "learning_rate": 2.5556076778960246e-08, + "loss": 0.1745, + "step": 14462 + }, + { + "epoch": 3.848589675359234, + "grad_norm": 0.3116534352302551, + "learning_rate": 2.5544801576337537e-08, + "loss": 0.1845, + "step": 14463 + }, + { + "epoch": 3.8488557743480576, + "grad_norm": 0.2607836425304413, + "learning_rate": 2.5533528497328394e-08, + "loss": 0.1833, + "step": 14464 + }, + { + "epoch": 3.8491218733368813, + "grad_norm": 0.2889181971549988, + "learning_rate": 2.5522257542254334e-08, + "loss": 0.174, + "step": 14465 + }, + { + "epoch": 3.8493879723257054, + "grad_norm": 0.32589495182037354, + "learning_rate": 2.5510988711436864e-08, + "loss": 0.1842, + "step": 14466 + }, + { + "epoch": 3.849654071314529, + "grad_norm": 0.315393328666687, + "learning_rate": 2.5499722005197345e-08, + "loss": 0.1901, + "step": 14467 + }, + { + "epoch": 3.8499201703033528, + "grad_norm": 0.2876092195510864, + "learning_rate": 2.548845742385717e-08, + "loss": 0.1677, + "step": 14468 + }, + { + "epoch": 3.850186269292177, + "grad_norm": 0.28555819392204285, + "learning_rate": 2.547719496773757e-08, + "loss": 0.1841, + "step": 14469 + }, + { + "epoch": 3.8504523682810006, + "grad_norm": 0.3856573700904846, + "learning_rate": 2.5465934637159835e-08, + "loss": 0.1882, + "step": 14470 + }, + { + "epoch": 3.8507184672698243, + "grad_norm": 0.36923331022262573, + "learning_rate": 2.5454676432445056e-08, + "loss": 0.183, + "step": 14471 + }, + { + "epoch": 3.8509845662586484, + "grad_norm": 0.28248876333236694, + "learning_rate": 2.5443420353914436e-08, + "loss": 0.1829, + "step": 14472 + }, + { + "epoch": 3.851250665247472, + "grad_norm": 0.3648589551448822, + "learning_rate": 2.5432166401888955e-08, + "loss": 0.1729, + "step": 14473 + }, + { + "epoch": 3.8515167642362957, + "grad_norm": 0.5000047087669373, + "learning_rate": 2.5420914576689646e-08, + "loss": 0.1926, + "step": 14474 + }, + { + "epoch": 3.85178286322512, + "grad_norm": 0.28053003549575806, + "learning_rate": 2.540966487863737e-08, + "loss": 0.1686, + "step": 14475 + }, + { + "epoch": 3.8520489622139436, + "grad_norm": 0.35710984468460083, + "learning_rate": 2.5398417308053056e-08, + "loss": 0.1734, + "step": 14476 + }, + { + "epoch": 3.8523150612027672, + "grad_norm": 0.2861597239971161, + "learning_rate": 2.538717186525745e-08, + "loss": 0.178, + "step": 14477 + }, + { + "epoch": 3.8525811601915914, + "grad_norm": 0.31701359152793884, + "learning_rate": 2.5375928550571325e-08, + "loss": 0.1674, + "step": 14478 + }, + { + "epoch": 3.852847259180415, + "grad_norm": 0.2514830529689789, + "learning_rate": 2.536468736431536e-08, + "loss": 0.1587, + "step": 14479 + }, + { + "epoch": 3.8531133581692387, + "grad_norm": 0.3175685405731201, + "learning_rate": 2.5353448306810176e-08, + "loss": 0.1983, + "step": 14480 + }, + { + "epoch": 3.853379457158063, + "grad_norm": 0.37230896949768066, + "learning_rate": 2.5342211378376365e-08, + "loss": 0.1588, + "step": 14481 + }, + { + "epoch": 3.8536455561468865, + "grad_norm": 0.35518431663513184, + "learning_rate": 2.5330976579334374e-08, + "loss": 0.1859, + "step": 14482 + }, + { + "epoch": 3.85391165513571, + "grad_norm": 0.2546428442001343, + "learning_rate": 2.5319743910004687e-08, + "loss": 0.1642, + "step": 14483 + }, + { + "epoch": 3.8541777541245343, + "grad_norm": 0.2903353273868561, + "learning_rate": 2.530851337070763e-08, + "loss": 0.1734, + "step": 14484 + }, + { + "epoch": 3.854443853113358, + "grad_norm": 0.2724947929382324, + "learning_rate": 2.5297284961763553e-08, + "loss": 0.1796, + "step": 14485 + }, + { + "epoch": 3.854709952102182, + "grad_norm": 0.3521503210067749, + "learning_rate": 2.5286058683492717e-08, + "loss": 0.1786, + "step": 14486 + }, + { + "epoch": 3.854976051091006, + "grad_norm": 0.2550899386405945, + "learning_rate": 2.5274834536215352e-08, + "loss": 0.1659, + "step": 14487 + }, + { + "epoch": 3.85524215007983, + "grad_norm": 0.3300192952156067, + "learning_rate": 2.526361252025151e-08, + "loss": 0.1996, + "step": 14488 + }, + { + "epoch": 3.8555082490686536, + "grad_norm": 0.31249260902404785, + "learning_rate": 2.525239263592135e-08, + "loss": 0.1603, + "step": 14489 + }, + { + "epoch": 3.8557743480574773, + "grad_norm": 0.34628400206565857, + "learning_rate": 2.5241174883544802e-08, + "loss": 0.1718, + "step": 14490 + }, + { + "epoch": 3.8560404470463014, + "grad_norm": 0.28585579991340637, + "learning_rate": 2.5229959263441903e-08, + "loss": 0.1698, + "step": 14491 + }, + { + "epoch": 3.856306546035125, + "grad_norm": 0.39315664768218994, + "learning_rate": 2.5218745775932483e-08, + "loss": 0.192, + "step": 14492 + }, + { + "epoch": 3.856572645023949, + "grad_norm": 0.28703561425209045, + "learning_rate": 2.5207534421336386e-08, + "loss": 0.1733, + "step": 14493 + }, + { + "epoch": 3.856838744012773, + "grad_norm": 0.2640586793422699, + "learning_rate": 2.5196325199973402e-08, + "loss": 0.1669, + "step": 14494 + }, + { + "epoch": 3.8571048430015966, + "grad_norm": 0.30401095747947693, + "learning_rate": 2.518511811216325e-08, + "loss": 0.1778, + "step": 14495 + }, + { + "epoch": 3.8573709419904203, + "grad_norm": 0.2632143497467041, + "learning_rate": 2.5173913158225536e-08, + "loss": 0.1771, + "step": 14496 + }, + { + "epoch": 3.8576370409792444, + "grad_norm": 0.3512868285179138, + "learning_rate": 2.5162710338479877e-08, + "loss": 0.1919, + "step": 14497 + }, + { + "epoch": 3.857903139968068, + "grad_norm": 0.30254122614860535, + "learning_rate": 2.5151509653245816e-08, + "loss": 0.1816, + "step": 14498 + }, + { + "epoch": 3.858169238956892, + "grad_norm": 0.41476237773895264, + "learning_rate": 2.5140311102842782e-08, + "loss": 0.2067, + "step": 14499 + }, + { + "epoch": 3.858435337945716, + "grad_norm": 0.36777636408805847, + "learning_rate": 2.512911468759019e-08, + "loss": 0.1877, + "step": 14500 + }, + { + "epoch": 3.8587014369345396, + "grad_norm": 0.28448233008384705, + "learning_rate": 2.5117920407807392e-08, + "loss": 0.1801, + "step": 14501 + }, + { + "epoch": 3.8589675359233633, + "grad_norm": 0.3815152049064636, + "learning_rate": 2.5106728263813702e-08, + "loss": 0.1902, + "step": 14502 + }, + { + "epoch": 3.8592336349121874, + "grad_norm": 0.2642795741558075, + "learning_rate": 2.5095538255928282e-08, + "loss": 0.1761, + "step": 14503 + }, + { + "epoch": 3.859499733901011, + "grad_norm": 0.5617486834526062, + "learning_rate": 2.5084350384470353e-08, + "loss": 0.1797, + "step": 14504 + }, + { + "epoch": 3.8597658328898348, + "grad_norm": 0.284811794757843, + "learning_rate": 2.507316464975896e-08, + "loss": 0.1744, + "step": 14505 + }, + { + "epoch": 3.860031931878659, + "grad_norm": 0.35227808356285095, + "learning_rate": 2.506198105211319e-08, + "loss": 0.1812, + "step": 14506 + }, + { + "epoch": 3.8602980308674826, + "grad_norm": 0.36978262662887573, + "learning_rate": 2.505079959185199e-08, + "loss": 0.1767, + "step": 14507 + }, + { + "epoch": 3.8605641298563067, + "grad_norm": 0.3550005257129669, + "learning_rate": 2.503962026929427e-08, + "loss": 0.1727, + "step": 14508 + }, + { + "epoch": 3.8608302288451304, + "grad_norm": 0.304611474275589, + "learning_rate": 2.502844308475892e-08, + "loss": 0.1721, + "step": 14509 + }, + { + "epoch": 3.861096327833954, + "grad_norm": 0.38014790415763855, + "learning_rate": 2.5017268038564753e-08, + "loss": 0.2029, + "step": 14510 + }, + { + "epoch": 3.861362426822778, + "grad_norm": 0.2913782298564911, + "learning_rate": 2.5006095131030435e-08, + "loss": 0.1678, + "step": 14511 + }, + { + "epoch": 3.861628525811602, + "grad_norm": 0.42345544695854187, + "learning_rate": 2.4994924362474712e-08, + "loss": 0.1853, + "step": 14512 + }, + { + "epoch": 3.861894624800426, + "grad_norm": 0.2704525589942932, + "learning_rate": 2.498375573321614e-08, + "loss": 0.1823, + "step": 14513 + }, + { + "epoch": 3.8621607237892497, + "grad_norm": 0.3252672255039215, + "learning_rate": 2.4972589243573293e-08, + "loss": 0.1865, + "step": 14514 + }, + { + "epoch": 3.8624268227780734, + "grad_norm": 0.2970760464668274, + "learning_rate": 2.4961424893864658e-08, + "loss": 0.1703, + "step": 14515 + }, + { + "epoch": 3.8626929217668975, + "grad_norm": 0.2715403735637665, + "learning_rate": 2.4950262684408672e-08, + "loss": 0.18, + "step": 14516 + }, + { + "epoch": 3.862959020755721, + "grad_norm": 0.3183780908584595, + "learning_rate": 2.4939102615523734e-08, + "loss": 0.1706, + "step": 14517 + }, + { + "epoch": 3.863225119744545, + "grad_norm": 0.5113644003868103, + "learning_rate": 2.4927944687528092e-08, + "loss": 0.1804, + "step": 14518 + }, + { + "epoch": 3.863491218733369, + "grad_norm": 0.2476474642753601, + "learning_rate": 2.491678890074005e-08, + "loss": 0.1703, + "step": 14519 + }, + { + "epoch": 3.8637573177221927, + "grad_norm": 0.2731420695781708, + "learning_rate": 2.490563525547773e-08, + "loss": 0.1813, + "step": 14520 + }, + { + "epoch": 3.8640234167110163, + "grad_norm": 0.27031010389328003, + "learning_rate": 2.4894483752059325e-08, + "loss": 0.1754, + "step": 14521 + }, + { + "epoch": 3.8642895156998405, + "grad_norm": 0.28519779443740845, + "learning_rate": 2.4883334390802813e-08, + "loss": 0.1886, + "step": 14522 + }, + { + "epoch": 3.864555614688664, + "grad_norm": 0.3149905204772949, + "learning_rate": 2.4872187172026303e-08, + "loss": 0.1898, + "step": 14523 + }, + { + "epoch": 3.864821713677488, + "grad_norm": 0.27004575729370117, + "learning_rate": 2.4861042096047647e-08, + "loss": 0.1772, + "step": 14524 + }, + { + "epoch": 3.865087812666312, + "grad_norm": 0.34485477209091187, + "learning_rate": 2.4849899163184797e-08, + "loss": 0.1824, + "step": 14525 + }, + { + "epoch": 3.8653539116551356, + "grad_norm": 0.27869391441345215, + "learning_rate": 2.48387583737555e-08, + "loss": 0.1675, + "step": 14526 + }, + { + "epoch": 3.8656200106439593, + "grad_norm": 0.25620219111442566, + "learning_rate": 2.4827619728077586e-08, + "loss": 0.162, + "step": 14527 + }, + { + "epoch": 3.8658861096327835, + "grad_norm": 0.3888768255710602, + "learning_rate": 2.4816483226468675e-08, + "loss": 0.1807, + "step": 14528 + }, + { + "epoch": 3.866152208621607, + "grad_norm": 0.25822892785072327, + "learning_rate": 2.4805348869246444e-08, + "loss": 0.1741, + "step": 14529 + }, + { + "epoch": 3.866418307610431, + "grad_norm": 0.2710884213447571, + "learning_rate": 2.4794216656728473e-08, + "loss": 0.1652, + "step": 14530 + }, + { + "epoch": 3.866684406599255, + "grad_norm": 0.2693013548851013, + "learning_rate": 2.4783086589232294e-08, + "loss": 0.1906, + "step": 14531 + }, + { + "epoch": 3.8669505055880786, + "grad_norm": 0.28281810879707336, + "learning_rate": 2.477195866707531e-08, + "loss": 0.1602, + "step": 14532 + }, + { + "epoch": 3.8672166045769027, + "grad_norm": 0.3880026936531067, + "learning_rate": 2.4760832890574956e-08, + "loss": 0.1676, + "step": 14533 + }, + { + "epoch": 3.8674827035657264, + "grad_norm": 0.36052629351615906, + "learning_rate": 2.4749709260048525e-08, + "loss": 0.1807, + "step": 14534 + }, + { + "epoch": 3.8677488025545506, + "grad_norm": 0.3261762261390686, + "learning_rate": 2.473858777581329e-08, + "loss": 0.1821, + "step": 14535 + }, + { + "epoch": 3.8680149015433742, + "grad_norm": 0.3567051887512207, + "learning_rate": 2.4727468438186473e-08, + "loss": 0.1741, + "step": 14536 + }, + { + "epoch": 3.868281000532198, + "grad_norm": 0.34442034363746643, + "learning_rate": 2.4716351247485214e-08, + "loss": 0.1711, + "step": 14537 + }, + { + "epoch": 3.868547099521022, + "grad_norm": 0.290361225605011, + "learning_rate": 2.4705236204026637e-08, + "loss": 0.1851, + "step": 14538 + }, + { + "epoch": 3.8688131985098457, + "grad_norm": 0.3580440580844879, + "learning_rate": 2.469412330812769e-08, + "loss": 0.1781, + "step": 14539 + }, + { + "epoch": 3.8690792974986694, + "grad_norm": 0.2763495147228241, + "learning_rate": 2.468301256010541e-08, + "loss": 0.1661, + "step": 14540 + }, + { + "epoch": 3.8693453964874935, + "grad_norm": 0.38077983260154724, + "learning_rate": 2.467190396027664e-08, + "loss": 0.1795, + "step": 14541 + }, + { + "epoch": 3.869611495476317, + "grad_norm": 0.4828135371208191, + "learning_rate": 2.4660797508958265e-08, + "loss": 0.1706, + "step": 14542 + }, + { + "epoch": 3.869877594465141, + "grad_norm": 0.2709850072860718, + "learning_rate": 2.4649693206467014e-08, + "loss": 0.1685, + "step": 14543 + }, + { + "epoch": 3.870143693453965, + "grad_norm": 0.29797741770744324, + "learning_rate": 2.4638591053119638e-08, + "loss": 0.1654, + "step": 14544 + }, + { + "epoch": 3.8704097924427887, + "grad_norm": 0.2779149115085602, + "learning_rate": 2.4627491049232785e-08, + "loss": 0.1655, + "step": 14545 + }, + { + "epoch": 3.8706758914316124, + "grad_norm": 0.25922706723213196, + "learning_rate": 2.4616393195123075e-08, + "loss": 0.164, + "step": 14546 + }, + { + "epoch": 3.8709419904204365, + "grad_norm": 0.39267656207084656, + "learning_rate": 2.4605297491106993e-08, + "loss": 0.1875, + "step": 14547 + }, + { + "epoch": 3.87120808940926, + "grad_norm": 0.27921512722969055, + "learning_rate": 2.4594203937501056e-08, + "loss": 0.1826, + "step": 14548 + }, + { + "epoch": 3.871474188398084, + "grad_norm": 0.2915983498096466, + "learning_rate": 2.458311253462163e-08, + "loss": 0.1685, + "step": 14549 + }, + { + "epoch": 3.871740287386908, + "grad_norm": 0.2762225568294525, + "learning_rate": 2.4572023282785093e-08, + "loss": 0.1564, + "step": 14550 + }, + { + "epoch": 3.8720063863757317, + "grad_norm": 0.35390472412109375, + "learning_rate": 2.4560936182307735e-08, + "loss": 0.1724, + "step": 14551 + }, + { + "epoch": 3.8722724853645554, + "grad_norm": 0.2584737241268158, + "learning_rate": 2.4549851233505768e-08, + "loss": 0.1565, + "step": 14552 + }, + { + "epoch": 3.8725385843533795, + "grad_norm": 0.2637515664100647, + "learning_rate": 2.453876843669539e-08, + "loss": 0.1658, + "step": 14553 + }, + { + "epoch": 3.872804683342203, + "grad_norm": 0.40187543630599976, + "learning_rate": 2.4527687792192665e-08, + "loss": 0.1909, + "step": 14554 + }, + { + "epoch": 3.8730707823310273, + "grad_norm": 0.37061426043510437, + "learning_rate": 2.4516609300313673e-08, + "loss": 0.1596, + "step": 14555 + }, + { + "epoch": 3.873336881319851, + "grad_norm": 0.2706436812877655, + "learning_rate": 2.4505532961374343e-08, + "loss": 0.1659, + "step": 14556 + }, + { + "epoch": 3.8736029803086747, + "grad_norm": 0.31907379627227783, + "learning_rate": 2.4494458775690652e-08, + "loss": 0.1622, + "step": 14557 + }, + { + "epoch": 3.873869079297499, + "grad_norm": 0.27376991510391235, + "learning_rate": 2.4483386743578403e-08, + "loss": 0.1794, + "step": 14558 + }, + { + "epoch": 3.8741351782863225, + "grad_norm": 0.30187860131263733, + "learning_rate": 2.4472316865353427e-08, + "loss": 0.1868, + "step": 14559 + }, + { + "epoch": 3.8744012772751466, + "grad_norm": 0.2862912714481354, + "learning_rate": 2.4461249141331454e-08, + "loss": 0.1768, + "step": 14560 + }, + { + "epoch": 3.8746673762639703, + "grad_norm": 0.24184934794902802, + "learning_rate": 2.4450183571828187e-08, + "loss": 0.1585, + "step": 14561 + }, + { + "epoch": 3.874933475252794, + "grad_norm": 0.28403350710868835, + "learning_rate": 2.4439120157159178e-08, + "loss": 0.1803, + "step": 14562 + }, + { + "epoch": 3.875199574241618, + "grad_norm": 0.3113029897212982, + "learning_rate": 2.4428058897640023e-08, + "loss": 0.1761, + "step": 14563 + }, + { + "epoch": 3.8754656732304418, + "grad_norm": 0.3951405882835388, + "learning_rate": 2.441699979358619e-08, + "loss": 0.1743, + "step": 14564 + }, + { + "epoch": 3.8757317722192655, + "grad_norm": 0.3383125960826874, + "learning_rate": 2.440594284531311e-08, + "loss": 0.1821, + "step": 14565 + }, + { + "epoch": 3.8759978712080896, + "grad_norm": 0.2701992392539978, + "learning_rate": 2.4394888053136152e-08, + "loss": 0.174, + "step": 14566 + }, + { + "epoch": 3.8762639701969133, + "grad_norm": 0.29595300555229187, + "learning_rate": 2.4383835417370646e-08, + "loss": 0.1801, + "step": 14567 + }, + { + "epoch": 3.876530069185737, + "grad_norm": 0.2686913311481476, + "learning_rate": 2.4372784938331793e-08, + "loss": 0.1707, + "step": 14568 + }, + { + "epoch": 3.876796168174561, + "grad_norm": 0.27413877844810486, + "learning_rate": 2.4361736616334827e-08, + "loss": 0.1621, + "step": 14569 + }, + { + "epoch": 3.8770622671633848, + "grad_norm": 0.2746196389198303, + "learning_rate": 2.43506904516948e-08, + "loss": 0.1708, + "step": 14570 + }, + { + "epoch": 3.8773283661522084, + "grad_norm": 0.3312995731830597, + "learning_rate": 2.4339646444726815e-08, + "loss": 0.1792, + "step": 14571 + }, + { + "epoch": 3.8775944651410326, + "grad_norm": 0.2905474007129669, + "learning_rate": 2.432860459574588e-08, + "loss": 0.1858, + "step": 14572 + }, + { + "epoch": 3.8778605641298562, + "grad_norm": 0.29854148626327515, + "learning_rate": 2.4317564905066867e-08, + "loss": 0.1648, + "step": 14573 + }, + { + "epoch": 3.87812666311868, + "grad_norm": 0.26945680379867554, + "learning_rate": 2.4306527373004748e-08, + "loss": 0.171, + "step": 14574 + }, + { + "epoch": 3.878392762107504, + "grad_norm": 0.3823561668395996, + "learning_rate": 2.4295491999874263e-08, + "loss": 0.1773, + "step": 14575 + }, + { + "epoch": 3.8786588610963277, + "grad_norm": 0.30219766497612, + "learning_rate": 2.4284458785990202e-08, + "loss": 0.1884, + "step": 14576 + }, + { + "epoch": 3.8789249600851514, + "grad_norm": 0.374944806098938, + "learning_rate": 2.4273427731667217e-08, + "loss": 0.1789, + "step": 14577 + }, + { + "epoch": 3.8791910590739755, + "grad_norm": 0.382962703704834, + "learning_rate": 2.4262398837219988e-08, + "loss": 0.2006, + "step": 14578 + }, + { + "epoch": 3.8794571580627992, + "grad_norm": 0.2536269724369049, + "learning_rate": 2.4251372102963018e-08, + "loss": 0.1742, + "step": 14579 + }, + { + "epoch": 3.8797232570516234, + "grad_norm": 0.3750319182872772, + "learning_rate": 2.4240347529210848e-08, + "loss": 0.1755, + "step": 14580 + }, + { + "epoch": 3.879989356040447, + "grad_norm": 0.2836934030056, + "learning_rate": 2.4229325116277923e-08, + "loss": 0.1833, + "step": 14581 + }, + { + "epoch": 3.880255455029271, + "grad_norm": 0.26859432458877563, + "learning_rate": 2.4218304864478633e-08, + "loss": 0.1698, + "step": 14582 + }, + { + "epoch": 3.880521554018095, + "grad_norm": 0.2615481913089752, + "learning_rate": 2.4207286774127268e-08, + "loss": 0.1634, + "step": 14583 + }, + { + "epoch": 3.8807876530069185, + "grad_norm": 0.29065555334091187, + "learning_rate": 2.4196270845538126e-08, + "loss": 0.1822, + "step": 14584 + }, + { + "epoch": 3.8810537519957427, + "grad_norm": 0.29070305824279785, + "learning_rate": 2.4185257079025345e-08, + "loss": 0.1843, + "step": 14585 + }, + { + "epoch": 3.8813198509845663, + "grad_norm": 0.29286471009254456, + "learning_rate": 2.417424547490313e-08, + "loss": 0.1601, + "step": 14586 + }, + { + "epoch": 3.88158594997339, + "grad_norm": 0.32664522528648376, + "learning_rate": 2.4163236033485458e-08, + "loss": 0.1745, + "step": 14587 + }, + { + "epoch": 3.881852048962214, + "grad_norm": 0.27620720863342285, + "learning_rate": 2.415222875508647e-08, + "loss": 0.1691, + "step": 14588 + }, + { + "epoch": 3.882118147951038, + "grad_norm": 0.35198843479156494, + "learning_rate": 2.4141223640020003e-08, + "loss": 0.1964, + "step": 14589 + }, + { + "epoch": 3.8823842469398615, + "grad_norm": 0.2467498928308487, + "learning_rate": 2.41302206886e-08, + "loss": 0.1592, + "step": 14590 + }, + { + "epoch": 3.8826503459286856, + "grad_norm": 0.36062175035476685, + "learning_rate": 2.411921990114031e-08, + "loss": 0.184, + "step": 14591 + }, + { + "epoch": 3.8829164449175093, + "grad_norm": 0.28532832860946655, + "learning_rate": 2.4108221277954633e-08, + "loss": 0.1761, + "step": 14592 + }, + { + "epoch": 3.883182543906333, + "grad_norm": 0.36569270491600037, + "learning_rate": 2.409722481935673e-08, + "loss": 0.175, + "step": 14593 + }, + { + "epoch": 3.883448642895157, + "grad_norm": 0.32903987169265747, + "learning_rate": 2.408623052566019e-08, + "loss": 0.1768, + "step": 14594 + }, + { + "epoch": 3.883714741883981, + "grad_norm": 0.2680298984050751, + "learning_rate": 2.407523839717862e-08, + "loss": 0.1766, + "step": 14595 + }, + { + "epoch": 3.8839808408728045, + "grad_norm": 0.2698526680469513, + "learning_rate": 2.406424843422553e-08, + "loss": 0.1705, + "step": 14596 + }, + { + "epoch": 3.8842469398616286, + "grad_norm": 0.2578449845314026, + "learning_rate": 2.4053260637114427e-08, + "loss": 0.1662, + "step": 14597 + }, + { + "epoch": 3.8845130388504523, + "grad_norm": 0.3724183440208435, + "learning_rate": 2.404227500615863e-08, + "loss": 0.1866, + "step": 14598 + }, + { + "epoch": 3.884779137839276, + "grad_norm": 0.3930353820323944, + "learning_rate": 2.4031291541671527e-08, + "loss": 0.1964, + "step": 14599 + }, + { + "epoch": 3.8850452368281, + "grad_norm": 0.41965675354003906, + "learning_rate": 2.4020310243966336e-08, + "loss": 0.1862, + "step": 14600 + }, + { + "epoch": 3.885311335816924, + "grad_norm": 0.6732192039489746, + "learning_rate": 2.400933111335629e-08, + "loss": 0.1768, + "step": 14601 + }, + { + "epoch": 3.8855774348057475, + "grad_norm": 0.2942206561565399, + "learning_rate": 2.3998354150154554e-08, + "loss": 0.179, + "step": 14602 + }, + { + "epoch": 3.8858435337945716, + "grad_norm": 0.30065858364105225, + "learning_rate": 2.3987379354674218e-08, + "loss": 0.1839, + "step": 14603 + }, + { + "epoch": 3.8861096327833953, + "grad_norm": 0.273633748292923, + "learning_rate": 2.397640672722826e-08, + "loss": 0.1629, + "step": 14604 + }, + { + "epoch": 3.8863757317722194, + "grad_norm": 0.28919392824172974, + "learning_rate": 2.3965436268129712e-08, + "loss": 0.187, + "step": 14605 + }, + { + "epoch": 3.886641830761043, + "grad_norm": 0.33417633175849915, + "learning_rate": 2.3954467977691383e-08, + "loss": 0.1768, + "step": 14606 + }, + { + "epoch": 3.886907929749867, + "grad_norm": 0.2704364061355591, + "learning_rate": 2.394350185622618e-08, + "loss": 0.1676, + "step": 14607 + }, + { + "epoch": 3.887174028738691, + "grad_norm": 0.3441488742828369, + "learning_rate": 2.393253790404687e-08, + "loss": 0.1991, + "step": 14608 + }, + { + "epoch": 3.8874401277275146, + "grad_norm": 0.3407983183860779, + "learning_rate": 2.3921576121466137e-08, + "loss": 0.191, + "step": 14609 + }, + { + "epoch": 3.8877062267163387, + "grad_norm": 0.4154006242752075, + "learning_rate": 2.3910616508796642e-08, + "loss": 0.1903, + "step": 14610 + }, + { + "epoch": 3.8879723257051624, + "grad_norm": 0.2755690813064575, + "learning_rate": 2.3899659066350985e-08, + "loss": 0.1716, + "step": 14611 + }, + { + "epoch": 3.888238424693986, + "grad_norm": 0.272670179605484, + "learning_rate": 2.3888703794441735e-08, + "loss": 0.156, + "step": 14612 + }, + { + "epoch": 3.88850452368281, + "grad_norm": 0.3332381248474121, + "learning_rate": 2.3877750693381283e-08, + "loss": 0.1843, + "step": 14613 + }, + { + "epoch": 3.888770622671634, + "grad_norm": 0.43466538190841675, + "learning_rate": 2.3866799763482094e-08, + "loss": 0.1824, + "step": 14614 + }, + { + "epoch": 3.8890367216604576, + "grad_norm": 0.3943907916545868, + "learning_rate": 2.385585100505646e-08, + "loss": 0.1949, + "step": 14615 + }, + { + "epoch": 3.8893028206492817, + "grad_norm": 0.3553108274936676, + "learning_rate": 2.384490441841669e-08, + "loss": 0.1904, + "step": 14616 + }, + { + "epoch": 3.8895689196381054, + "grad_norm": 0.3103790283203125, + "learning_rate": 2.3833960003874998e-08, + "loss": 0.1866, + "step": 14617 + }, + { + "epoch": 3.889835018626929, + "grad_norm": 0.27802160382270813, + "learning_rate": 2.382301776174358e-08, + "loss": 0.1759, + "step": 14618 + }, + { + "epoch": 3.890101117615753, + "grad_norm": 0.29072991013526917, + "learning_rate": 2.3812077692334453e-08, + "loss": 0.1953, + "step": 14619 + }, + { + "epoch": 3.890367216604577, + "grad_norm": 0.28262150287628174, + "learning_rate": 2.3801139795959723e-08, + "loss": 0.1699, + "step": 14620 + }, + { + "epoch": 3.8906333155934005, + "grad_norm": 0.27291053533554077, + "learning_rate": 2.379020407293131e-08, + "loss": 0.1791, + "step": 14621 + }, + { + "epoch": 3.8908994145822247, + "grad_norm": 0.27607405185699463, + "learning_rate": 2.3779270523561167e-08, + "loss": 0.169, + "step": 14622 + }, + { + "epoch": 3.8911655135710483, + "grad_norm": 0.3426445424556732, + "learning_rate": 2.3768339148161088e-08, + "loss": 0.1766, + "step": 14623 + }, + { + "epoch": 3.891431612559872, + "grad_norm": 0.2526792287826538, + "learning_rate": 2.3757409947042882e-08, + "loss": 0.1639, + "step": 14624 + }, + { + "epoch": 3.891697711548696, + "grad_norm": 0.3060077130794525, + "learning_rate": 2.3746482920518284e-08, + "loss": 0.1805, + "step": 14625 + }, + { + "epoch": 3.89196381053752, + "grad_norm": 0.28541436791419983, + "learning_rate": 2.3735558068898943e-08, + "loss": 0.1675, + "step": 14626 + }, + { + "epoch": 3.892229909526344, + "grad_norm": 0.2851518988609314, + "learning_rate": 2.3724635392496484e-08, + "loss": 0.1721, + "step": 14627 + }, + { + "epoch": 3.8924960085151676, + "grad_norm": 0.24980628490447998, + "learning_rate": 2.3713714891622404e-08, + "loss": 0.1674, + "step": 14628 + }, + { + "epoch": 3.8927621075039913, + "grad_norm": 0.3184691369533539, + "learning_rate": 2.3702796566588225e-08, + "loss": 0.1645, + "step": 14629 + }, + { + "epoch": 3.8930282064928154, + "grad_norm": 0.3528648018836975, + "learning_rate": 2.36918804177053e-08, + "loss": 0.1747, + "step": 14630 + }, + { + "epoch": 3.893294305481639, + "grad_norm": 3.457082748413086, + "learning_rate": 2.368096644528501e-08, + "loss": 0.1729, + "step": 14631 + }, + { + "epoch": 3.8935604044704633, + "grad_norm": 0.6878167390823364, + "learning_rate": 2.367005464963865e-08, + "loss": 0.1628, + "step": 14632 + }, + { + "epoch": 3.893826503459287, + "grad_norm": 0.2803846001625061, + "learning_rate": 2.3659145031077467e-08, + "loss": 0.174, + "step": 14633 + }, + { + "epoch": 3.8940926024481106, + "grad_norm": 0.4489145576953888, + "learning_rate": 2.3648237589912566e-08, + "loss": 0.1837, + "step": 14634 + }, + { + "epoch": 3.8943587014369347, + "grad_norm": 0.37145575881004333, + "learning_rate": 2.363733232645512e-08, + "loss": 0.1856, + "step": 14635 + }, + { + "epoch": 3.8946248004257584, + "grad_norm": 0.3974435329437256, + "learning_rate": 2.36264292410161e-08, + "loss": 0.1759, + "step": 14636 + }, + { + "epoch": 3.894890899414582, + "grad_norm": 0.2800564467906952, + "learning_rate": 2.361552833390651e-08, + "loss": 0.1743, + "step": 14637 + }, + { + "epoch": 3.8951569984034062, + "grad_norm": 0.2836388349533081, + "learning_rate": 2.3604629605437276e-08, + "loss": 0.1615, + "step": 14638 + }, + { + "epoch": 3.89542309739223, + "grad_norm": 0.26574376225471497, + "learning_rate": 2.3593733055919286e-08, + "loss": 0.1757, + "step": 14639 + }, + { + "epoch": 3.8956891963810536, + "grad_norm": 0.28276926279067993, + "learning_rate": 2.3582838685663253e-08, + "loss": 0.151, + "step": 14640 + }, + { + "epoch": 3.8959552953698777, + "grad_norm": 0.3701944351196289, + "learning_rate": 2.357194649497999e-08, + "loss": 0.1786, + "step": 14641 + }, + { + "epoch": 3.8962213943587014, + "grad_norm": 0.2627828121185303, + "learning_rate": 2.3561056484180086e-08, + "loss": 0.1664, + "step": 14642 + }, + { + "epoch": 3.896487493347525, + "grad_norm": 0.3796771764755249, + "learning_rate": 2.355016865357421e-08, + "loss": 0.1656, + "step": 14643 + }, + { + "epoch": 3.896753592336349, + "grad_norm": 0.3598959147930145, + "learning_rate": 2.3539283003472842e-08, + "loss": 0.1809, + "step": 14644 + }, + { + "epoch": 3.897019691325173, + "grad_norm": 0.28867307305336, + "learning_rate": 2.3528399534186515e-08, + "loss": 0.1639, + "step": 14645 + }, + { + "epoch": 3.8972857903139966, + "grad_norm": 0.3508453965187073, + "learning_rate": 2.3517518246025624e-08, + "loss": 0.1798, + "step": 14646 + }, + { + "epoch": 3.8975518893028207, + "grad_norm": 0.26873770356178284, + "learning_rate": 2.350663913930053e-08, + "loss": 0.1744, + "step": 14647 + }, + { + "epoch": 3.8978179882916444, + "grad_norm": 0.27305126190185547, + "learning_rate": 2.3495762214321558e-08, + "loss": 0.1625, + "step": 14648 + }, + { + "epoch": 3.898084087280468, + "grad_norm": 0.3034113347530365, + "learning_rate": 2.3484887471398884e-08, + "loss": 0.1726, + "step": 14649 + }, + { + "epoch": 3.898350186269292, + "grad_norm": 0.2839042544364929, + "learning_rate": 2.3474014910842732e-08, + "loss": 0.1787, + "step": 14650 + }, + { + "epoch": 3.898616285258116, + "grad_norm": 0.25825488567352295, + "learning_rate": 2.3463144532963165e-08, + "loss": 0.1613, + "step": 14651 + }, + { + "epoch": 3.89888238424694, + "grad_norm": 0.27008509635925293, + "learning_rate": 2.345227633807023e-08, + "loss": 0.1742, + "step": 14652 + }, + { + "epoch": 3.8991484832357637, + "grad_norm": 0.35897311568260193, + "learning_rate": 2.344141032647393e-08, + "loss": 0.1747, + "step": 14653 + }, + { + "epoch": 3.899414582224588, + "grad_norm": 0.4226161241531372, + "learning_rate": 2.3430546498484227e-08, + "loss": 0.1794, + "step": 14654 + }, + { + "epoch": 3.8996806812134115, + "grad_norm": 0.41461536288261414, + "learning_rate": 2.341968485441089e-08, + "loss": 0.2066, + "step": 14655 + }, + { + "epoch": 3.899946780202235, + "grad_norm": 0.2815679907798767, + "learning_rate": 2.340882539456379e-08, + "loss": 0.1616, + "step": 14656 + }, + { + "epoch": 3.9002128791910593, + "grad_norm": 0.4073818624019623, + "learning_rate": 2.339796811925261e-08, + "loss": 0.1689, + "step": 14657 + }, + { + "epoch": 3.900478978179883, + "grad_norm": 0.3892214298248291, + "learning_rate": 2.3387113028787066e-08, + "loss": 0.1982, + "step": 14658 + }, + { + "epoch": 3.9007450771687067, + "grad_norm": 0.31165459752082825, + "learning_rate": 2.337626012347671e-08, + "loss": 0.1826, + "step": 14659 + }, + { + "epoch": 3.901011176157531, + "grad_norm": 0.27951157093048096, + "learning_rate": 2.336540940363112e-08, + "loss": 0.1868, + "step": 14660 + }, + { + "epoch": 3.9012772751463545, + "grad_norm": 0.2985718548297882, + "learning_rate": 2.335456086955979e-08, + "loss": 0.1799, + "step": 14661 + }, + { + "epoch": 3.901543374135178, + "grad_norm": 0.2614765465259552, + "learning_rate": 2.3343714521572133e-08, + "loss": 0.1684, + "step": 14662 + }, + { + "epoch": 3.9018094731240023, + "grad_norm": 0.2682479918003082, + "learning_rate": 2.333287035997754e-08, + "loss": 0.1676, + "step": 14663 + }, + { + "epoch": 3.902075572112826, + "grad_norm": 0.2787902355194092, + "learning_rate": 2.3322028385085246e-08, + "loss": 0.1572, + "step": 14664 + }, + { + "epoch": 3.9023416711016496, + "grad_norm": 0.28064030408859253, + "learning_rate": 2.3311188597204556e-08, + "loss": 0.1636, + "step": 14665 + }, + { + "epoch": 3.9026077700904738, + "grad_norm": 0.27353519201278687, + "learning_rate": 2.330035099664458e-08, + "loss": 0.1713, + "step": 14666 + }, + { + "epoch": 3.9028738690792975, + "grad_norm": 0.2744218111038208, + "learning_rate": 2.3289515583714448e-08, + "loss": 0.1725, + "step": 14667 + }, + { + "epoch": 3.903139968068121, + "grad_norm": 0.3196781873703003, + "learning_rate": 2.3278682358723212e-08, + "loss": 0.171, + "step": 14668 + }, + { + "epoch": 3.9034060670569453, + "grad_norm": 0.5620816349983215, + "learning_rate": 2.3267851321979894e-08, + "loss": 0.1774, + "step": 14669 + }, + { + "epoch": 3.903672166045769, + "grad_norm": 0.2691856920719147, + "learning_rate": 2.3257022473793353e-08, + "loss": 0.1855, + "step": 14670 + }, + { + "epoch": 3.9039382650345926, + "grad_norm": 0.553806722164154, + "learning_rate": 2.3246195814472502e-08, + "loss": 0.1606, + "step": 14671 + }, + { + "epoch": 3.9042043640234168, + "grad_norm": 0.24813897907733917, + "learning_rate": 2.3235371344326105e-08, + "loss": 0.1606, + "step": 14672 + }, + { + "epoch": 3.9044704630122404, + "grad_norm": 0.34373053908348083, + "learning_rate": 2.3224549063662923e-08, + "loss": 0.1835, + "step": 14673 + }, + { + "epoch": 3.9047365620010646, + "grad_norm": 0.29160478711128235, + "learning_rate": 2.321372897279157e-08, + "loss": 0.1807, + "step": 14674 + }, + { + "epoch": 3.9050026609898882, + "grad_norm": 0.2802359461784363, + "learning_rate": 2.3202911072020757e-08, + "loss": 0.1743, + "step": 14675 + }, + { + "epoch": 3.905268759978712, + "grad_norm": 0.3740413784980774, + "learning_rate": 2.319209536165896e-08, + "loss": 0.1831, + "step": 14676 + }, + { + "epoch": 3.905534858967536, + "grad_norm": 0.2800848186016083, + "learning_rate": 2.31812818420147e-08, + "loss": 0.1964, + "step": 14677 + }, + { + "epoch": 3.9058009579563597, + "grad_norm": 0.27822577953338623, + "learning_rate": 2.3170470513396357e-08, + "loss": 0.168, + "step": 14678 + }, + { + "epoch": 3.906067056945184, + "grad_norm": 0.2758423984050751, + "learning_rate": 2.315966137611236e-08, + "loss": 0.1933, + "step": 14679 + }, + { + "epoch": 3.9063331559340075, + "grad_norm": 0.2572270929813385, + "learning_rate": 2.3148854430470933e-08, + "loss": 0.1601, + "step": 14680 + }, + { + "epoch": 3.906599254922831, + "grad_norm": 0.4589514434337616, + "learning_rate": 2.3138049676780335e-08, + "loss": 0.1791, + "step": 14681 + }, + { + "epoch": 3.9068653539116553, + "grad_norm": 0.26057204604148865, + "learning_rate": 2.312724711534877e-08, + "loss": 0.1659, + "step": 14682 + }, + { + "epoch": 3.907131452900479, + "grad_norm": 0.2601912319660187, + "learning_rate": 2.311644674648432e-08, + "loss": 0.154, + "step": 14683 + }, + { + "epoch": 3.9073975518893027, + "grad_norm": 0.28323599696159363, + "learning_rate": 2.3105648570495072e-08, + "loss": 0.1707, + "step": 14684 + }, + { + "epoch": 3.907663650878127, + "grad_norm": 0.28256383538246155, + "learning_rate": 2.3094852587688964e-08, + "loss": 0.1641, + "step": 14685 + }, + { + "epoch": 3.9079297498669505, + "grad_norm": 0.4293936789035797, + "learning_rate": 2.308405879837395e-08, + "loss": 0.1911, + "step": 14686 + }, + { + "epoch": 3.908195848855774, + "grad_norm": 0.31550610065460205, + "learning_rate": 2.3073267202857873e-08, + "loss": 0.1795, + "step": 14687 + }, + { + "epoch": 3.9084619478445983, + "grad_norm": 0.28927478194236755, + "learning_rate": 2.306247780144852e-08, + "loss": 0.1744, + "step": 14688 + }, + { + "epoch": 3.908728046833422, + "grad_norm": 0.4430968761444092, + "learning_rate": 2.3051690594453656e-08, + "loss": 0.1801, + "step": 14689 + }, + { + "epoch": 3.9089941458222457, + "grad_norm": 0.4326363801956177, + "learning_rate": 2.304090558218096e-08, + "loss": 0.1782, + "step": 14690 + }, + { + "epoch": 3.90926024481107, + "grad_norm": 0.31397005915641785, + "learning_rate": 2.303012276493801e-08, + "loss": 0.1844, + "step": 14691 + }, + { + "epoch": 3.9095263437998935, + "grad_norm": 0.2812488079071045, + "learning_rate": 2.3019342143032385e-08, + "loss": 0.1834, + "step": 14692 + }, + { + "epoch": 3.909792442788717, + "grad_norm": 0.30698272585868835, + "learning_rate": 2.3008563716771533e-08, + "loss": 0.1821, + "step": 14693 + }, + { + "epoch": 3.9100585417775413, + "grad_norm": 0.3262800872325897, + "learning_rate": 2.2997787486462927e-08, + "loss": 0.1863, + "step": 14694 + }, + { + "epoch": 3.910324640766365, + "grad_norm": 0.5107713937759399, + "learning_rate": 2.2987013452413863e-08, + "loss": 0.1823, + "step": 14695 + }, + { + "epoch": 3.9105907397551887, + "grad_norm": 0.27913495898246765, + "learning_rate": 2.297624161493168e-08, + "loss": 0.1765, + "step": 14696 + }, + { + "epoch": 3.910856838744013, + "grad_norm": 0.28660544753074646, + "learning_rate": 2.29654719743236e-08, + "loss": 0.1665, + "step": 14697 + }, + { + "epoch": 3.9111229377328365, + "grad_norm": 0.2976163923740387, + "learning_rate": 2.2954704530896807e-08, + "loss": 0.1835, + "step": 14698 + }, + { + "epoch": 3.9113890367216606, + "grad_norm": 0.2870473861694336, + "learning_rate": 2.2943939284958424e-08, + "loss": 0.178, + "step": 14699 + }, + { + "epoch": 3.9116551357104843, + "grad_norm": 0.34332770109176636, + "learning_rate": 2.2933176236815467e-08, + "loss": 0.1792, + "step": 14700 + }, + { + "epoch": 3.9119212346993084, + "grad_norm": 0.26593655347824097, + "learning_rate": 2.292241538677494e-08, + "loss": 0.1732, + "step": 14701 + }, + { + "epoch": 3.912187333688132, + "grad_norm": 0.26683491468429565, + "learning_rate": 2.2911656735143736e-08, + "loss": 0.1845, + "step": 14702 + }, + { + "epoch": 3.9124534326769558, + "grad_norm": 0.30006521940231323, + "learning_rate": 2.290090028222873e-08, + "loss": 0.1778, + "step": 14703 + }, + { + "epoch": 3.91271953166578, + "grad_norm": 0.38130486011505127, + "learning_rate": 2.2890146028336722e-08, + "loss": 0.2018, + "step": 14704 + }, + { + "epoch": 3.9129856306546036, + "grad_norm": 0.2811397612094879, + "learning_rate": 2.2879393973774476e-08, + "loss": 0.1822, + "step": 14705 + }, + { + "epoch": 3.9132517296434273, + "grad_norm": 0.26930058002471924, + "learning_rate": 2.2868644118848603e-08, + "loss": 0.1579, + "step": 14706 + }, + { + "epoch": 3.9135178286322514, + "grad_norm": 0.29681333899497986, + "learning_rate": 2.2857896463865757e-08, + "loss": 0.167, + "step": 14707 + }, + { + "epoch": 3.913783927621075, + "grad_norm": 0.29317378997802734, + "learning_rate": 2.284715100913245e-08, + "loss": 0.1635, + "step": 14708 + }, + { + "epoch": 3.9140500266098988, + "grad_norm": 0.3086821138858795, + "learning_rate": 2.283640775495519e-08, + "loss": 0.1748, + "step": 14709 + }, + { + "epoch": 3.914316125598723, + "grad_norm": 0.38235312700271606, + "learning_rate": 2.282566670164037e-08, + "loss": 0.1834, + "step": 14710 + }, + { + "epoch": 3.9145822245875466, + "grad_norm": 0.27348291873931885, + "learning_rate": 2.2814927849494348e-08, + "loss": 0.1613, + "step": 14711 + }, + { + "epoch": 3.9148483235763702, + "grad_norm": 0.3747629225254059, + "learning_rate": 2.2804191198823443e-08, + "loss": 0.1704, + "step": 14712 + }, + { + "epoch": 3.9151144225651944, + "grad_norm": 0.3070122003555298, + "learning_rate": 2.2793456749933905e-08, + "loss": 0.1864, + "step": 14713 + }, + { + "epoch": 3.915380521554018, + "grad_norm": 0.3202458620071411, + "learning_rate": 2.2782724503131834e-08, + "loss": 0.1855, + "step": 14714 + }, + { + "epoch": 3.9156466205428417, + "grad_norm": 0.28726547956466675, + "learning_rate": 2.2771994458723397e-08, + "loss": 0.189, + "step": 14715 + }, + { + "epoch": 3.915912719531666, + "grad_norm": 0.5738477110862732, + "learning_rate": 2.2761266617014596e-08, + "loss": 0.1873, + "step": 14716 + }, + { + "epoch": 3.9161788185204895, + "grad_norm": 0.263658344745636, + "learning_rate": 2.275054097831143e-08, + "loss": 0.1693, + "step": 14717 + }, + { + "epoch": 3.9164449175093132, + "grad_norm": 0.2958405315876007, + "learning_rate": 2.2739817542919805e-08, + "loss": 0.1665, + "step": 14718 + }, + { + "epoch": 3.9167110164981374, + "grad_norm": 0.36199596524238586, + "learning_rate": 2.272909631114559e-08, + "loss": 0.1846, + "step": 14719 + }, + { + "epoch": 3.916977115486961, + "grad_norm": 0.27866795659065247, + "learning_rate": 2.2718377283294586e-08, + "loss": 0.1713, + "step": 14720 + }, + { + "epoch": 3.9172432144757847, + "grad_norm": 0.2862803339958191, + "learning_rate": 2.2707660459672497e-08, + "loss": 0.1768, + "step": 14721 + }, + { + "epoch": 3.917509313464609, + "grad_norm": 0.3305469751358032, + "learning_rate": 2.2696945840585012e-08, + "loss": 0.1796, + "step": 14722 + }, + { + "epoch": 3.9177754124534325, + "grad_norm": 0.3244760036468506, + "learning_rate": 2.2686233426337686e-08, + "loss": 0.1696, + "step": 14723 + }, + { + "epoch": 3.9180415114422567, + "grad_norm": 0.27276530861854553, + "learning_rate": 2.2675523217236135e-08, + "loss": 0.1628, + "step": 14724 + }, + { + "epoch": 3.9183076104310803, + "grad_norm": 0.29016613960266113, + "learning_rate": 2.2664815213585732e-08, + "loss": 0.1844, + "step": 14725 + }, + { + "epoch": 3.9185737094199045, + "grad_norm": 0.2662605345249176, + "learning_rate": 2.2654109415692e-08, + "loss": 0.1652, + "step": 14726 + }, + { + "epoch": 3.918839808408728, + "grad_norm": 0.36807504296302795, + "learning_rate": 2.2643405823860217e-08, + "loss": 0.1843, + "step": 14727 + }, + { + "epoch": 3.919105907397552, + "grad_norm": 0.2522839605808258, + "learning_rate": 2.2632704438395722e-08, + "loss": 0.162, + "step": 14728 + }, + { + "epoch": 3.919372006386376, + "grad_norm": 0.31604668498039246, + "learning_rate": 2.2622005259603694e-08, + "loss": 0.1673, + "step": 14729 + }, + { + "epoch": 3.9196381053751996, + "grad_norm": 0.2932634949684143, + "learning_rate": 2.2611308287789342e-08, + "loss": 0.1879, + "step": 14730 + }, + { + "epoch": 3.9199042043640233, + "grad_norm": 0.2867797613143921, + "learning_rate": 2.2600613523257704e-08, + "loss": 0.1699, + "step": 14731 + }, + { + "epoch": 3.9201703033528474, + "grad_norm": 0.3390856683254242, + "learning_rate": 2.2589920966313857e-08, + "loss": 0.1943, + "step": 14732 + }, + { + "epoch": 3.920436402341671, + "grad_norm": 0.27038270235061646, + "learning_rate": 2.2579230617262767e-08, + "loss": 0.1626, + "step": 14733 + }, + { + "epoch": 3.920702501330495, + "grad_norm": 0.3120211958885193, + "learning_rate": 2.256854247640938e-08, + "loss": 0.1769, + "step": 14734 + }, + { + "epoch": 3.920968600319319, + "grad_norm": 0.5074047446250916, + "learning_rate": 2.2557856544058473e-08, + "loss": 0.1861, + "step": 14735 + }, + { + "epoch": 3.9212346993081426, + "grad_norm": 0.36173805594444275, + "learning_rate": 2.254717282051487e-08, + "loss": 0.1712, + "step": 14736 + }, + { + "epoch": 3.9215007982969663, + "grad_norm": 0.46569013595581055, + "learning_rate": 2.2536491306083328e-08, + "loss": 0.1874, + "step": 14737 + }, + { + "epoch": 3.9217668972857904, + "grad_norm": 0.4222605526447296, + "learning_rate": 2.2525812001068422e-08, + "loss": 0.1692, + "step": 14738 + }, + { + "epoch": 3.922032996274614, + "grad_norm": 0.3150661587715149, + "learning_rate": 2.25151349057748e-08, + "loss": 0.1819, + "step": 14739 + }, + { + "epoch": 3.922299095263438, + "grad_norm": 0.26965436339378357, + "learning_rate": 2.2504460020507e-08, + "loss": 0.1668, + "step": 14740 + }, + { + "epoch": 3.922565194252262, + "grad_norm": 0.2842465937137604, + "learning_rate": 2.2493787345569492e-08, + "loss": 0.1821, + "step": 14741 + }, + { + "epoch": 3.9228312932410856, + "grad_norm": 0.4051007628440857, + "learning_rate": 2.2483116881266638e-08, + "loss": 0.1729, + "step": 14742 + }, + { + "epoch": 3.9230973922299093, + "grad_norm": 0.2695007026195526, + "learning_rate": 2.2472448627902852e-08, + "loss": 0.1531, + "step": 14743 + }, + { + "epoch": 3.9233634912187334, + "grad_norm": 0.2638270854949951, + "learning_rate": 2.246178258578234e-08, + "loss": 0.1774, + "step": 14744 + }, + { + "epoch": 3.923629590207557, + "grad_norm": 0.268259733915329, + "learning_rate": 2.2451118755209386e-08, + "loss": 0.1669, + "step": 14745 + }, + { + "epoch": 3.923895689196381, + "grad_norm": 0.283708393573761, + "learning_rate": 2.2440457136488078e-08, + "loss": 0.1708, + "step": 14746 + }, + { + "epoch": 3.924161788185205, + "grad_norm": 0.40897780656814575, + "learning_rate": 2.242979772992254e-08, + "loss": 0.1945, + "step": 14747 + }, + { + "epoch": 3.9244278871740286, + "grad_norm": 0.28215575218200684, + "learning_rate": 2.2419140535816805e-08, + "loss": 0.1622, + "step": 14748 + }, + { + "epoch": 3.9246939861628527, + "grad_norm": 0.2706936299800873, + "learning_rate": 2.2408485554474865e-08, + "loss": 0.1685, + "step": 14749 + }, + { + "epoch": 3.9249600851516764, + "grad_norm": 0.2745538055896759, + "learning_rate": 2.2397832786200565e-08, + "loss": 0.1759, + "step": 14750 + }, + { + "epoch": 3.9252261841405005, + "grad_norm": 0.2770071029663086, + "learning_rate": 2.2387182231297784e-08, + "loss": 0.185, + "step": 14751 + }, + { + "epoch": 3.925492283129324, + "grad_norm": 0.27675214409828186, + "learning_rate": 2.237653389007027e-08, + "loss": 0.1648, + "step": 14752 + }, + { + "epoch": 3.925758382118148, + "grad_norm": 0.281362920999527, + "learning_rate": 2.2365887762821733e-08, + "loss": 0.1971, + "step": 14753 + }, + { + "epoch": 3.926024481106972, + "grad_norm": 0.39213043451309204, + "learning_rate": 2.2355243849855842e-08, + "loss": 0.1755, + "step": 14754 + }, + { + "epoch": 3.9262905800957957, + "grad_norm": 0.2790509760379791, + "learning_rate": 2.2344602151476187e-08, + "loss": 0.1636, + "step": 14755 + }, + { + "epoch": 3.9265566790846194, + "grad_norm": 0.4180935025215149, + "learning_rate": 2.2333962667986307e-08, + "loss": 0.1908, + "step": 14756 + }, + { + "epoch": 3.9268227780734435, + "grad_norm": 0.32007908821105957, + "learning_rate": 2.2323325399689597e-08, + "loss": 0.1639, + "step": 14757 + }, + { + "epoch": 3.927088877062267, + "grad_norm": 0.30141082406044006, + "learning_rate": 2.231269034688953e-08, + "loss": 0.209, + "step": 14758 + }, + { + "epoch": 3.927354976051091, + "grad_norm": 0.28015002608299255, + "learning_rate": 2.230205750988937e-08, + "loss": 0.1769, + "step": 14759 + }, + { + "epoch": 3.927621075039915, + "grad_norm": 0.2941567003726959, + "learning_rate": 2.229142688899246e-08, + "loss": 0.1752, + "step": 14760 + }, + { + "epoch": 3.9278871740287387, + "grad_norm": 0.27424806356430054, + "learning_rate": 2.2280798484501927e-08, + "loss": 0.1775, + "step": 14761 + }, + { + "epoch": 3.9281532730175623, + "grad_norm": 0.2770819664001465, + "learning_rate": 2.2270172296720957e-08, + "loss": 0.176, + "step": 14762 + }, + { + "epoch": 3.9284193720063865, + "grad_norm": 0.31597423553466797, + "learning_rate": 2.2259548325952625e-08, + "loss": 0.1814, + "step": 14763 + }, + { + "epoch": 3.92868547099521, + "grad_norm": 0.2775544822216034, + "learning_rate": 2.2248926572499983e-08, + "loss": 0.1811, + "step": 14764 + }, + { + "epoch": 3.928951569984034, + "grad_norm": 0.3296121060848236, + "learning_rate": 2.2238307036665914e-08, + "loss": 0.1728, + "step": 14765 + }, + { + "epoch": 3.929217668972858, + "grad_norm": 0.28575390577316284, + "learning_rate": 2.2227689718753396e-08, + "loss": 0.1633, + "step": 14766 + }, + { + "epoch": 3.9294837679616816, + "grad_norm": 0.27885735034942627, + "learning_rate": 2.2217074619065167e-08, + "loss": 0.1796, + "step": 14767 + }, + { + "epoch": 3.9297498669505053, + "grad_norm": 0.29244744777679443, + "learning_rate": 2.2206461737904037e-08, + "loss": 0.1859, + "step": 14768 + }, + { + "epoch": 3.9300159659393294, + "grad_norm": 0.2803609073162079, + "learning_rate": 2.219585107557269e-08, + "loss": 0.1962, + "step": 14769 + }, + { + "epoch": 3.930282064928153, + "grad_norm": 0.26240500807762146, + "learning_rate": 2.2185242632373812e-08, + "loss": 0.1675, + "step": 14770 + }, + { + "epoch": 3.9305481639169773, + "grad_norm": 0.27724504470825195, + "learning_rate": 2.2174636408609916e-08, + "loss": 0.1674, + "step": 14771 + }, + { + "epoch": 3.930814262905801, + "grad_norm": 0.28316763043403625, + "learning_rate": 2.2164032404583543e-08, + "loss": 0.1818, + "step": 14772 + }, + { + "epoch": 3.931080361894625, + "grad_norm": 0.3858935832977295, + "learning_rate": 2.2153430620597156e-08, + "loss": 0.2026, + "step": 14773 + }, + { + "epoch": 3.9313464608834487, + "grad_norm": 0.2904353737831116, + "learning_rate": 2.2142831056953093e-08, + "loss": 0.1898, + "step": 14774 + }, + { + "epoch": 3.9316125598722724, + "grad_norm": 0.27899396419525146, + "learning_rate": 2.213223371395373e-08, + "loss": 0.1858, + "step": 14775 + }, + { + "epoch": 3.9318786588610966, + "grad_norm": 0.40210244059562683, + "learning_rate": 2.212163859190124e-08, + "loss": 0.1876, + "step": 14776 + }, + { + "epoch": 3.9321447578499202, + "grad_norm": 0.2814883589744568, + "learning_rate": 2.211104569109793e-08, + "loss": 0.1725, + "step": 14777 + }, + { + "epoch": 3.932410856838744, + "grad_norm": 0.34619179368019104, + "learning_rate": 2.2100455011845854e-08, + "loss": 0.1845, + "step": 14778 + }, + { + "epoch": 3.932676955827568, + "grad_norm": 0.36934247612953186, + "learning_rate": 2.2089866554447112e-08, + "loss": 0.2009, + "step": 14779 + }, + { + "epoch": 3.9329430548163917, + "grad_norm": 0.25966954231262207, + "learning_rate": 2.2079280319203687e-08, + "loss": 0.1737, + "step": 14780 + }, + { + "epoch": 3.9332091538052154, + "grad_norm": 0.30990251898765564, + "learning_rate": 2.2068696306417543e-08, + "loss": 0.1882, + "step": 14781 + }, + { + "epoch": 3.9334752527940395, + "grad_norm": 0.27346840500831604, + "learning_rate": 2.2058114516390524e-08, + "loss": 0.1666, + "step": 14782 + }, + { + "epoch": 3.933741351782863, + "grad_norm": 0.2659624516963959, + "learning_rate": 2.204753494942446e-08, + "loss": 0.1627, + "step": 14783 + }, + { + "epoch": 3.934007450771687, + "grad_norm": 0.38109514117240906, + "learning_rate": 2.2036957605821115e-08, + "loss": 0.1848, + "step": 14784 + }, + { + "epoch": 3.934273549760511, + "grad_norm": 0.287750244140625, + "learning_rate": 2.202638248588219e-08, + "loss": 0.1565, + "step": 14785 + }, + { + "epoch": 3.9345396487493347, + "grad_norm": 0.27685508131980896, + "learning_rate": 2.2015809589909252e-08, + "loss": 0.1747, + "step": 14786 + }, + { + "epoch": 3.9348057477381584, + "grad_norm": 0.2873600125312805, + "learning_rate": 2.200523891820393e-08, + "loss": 0.177, + "step": 14787 + }, + { + "epoch": 3.9350718467269825, + "grad_norm": 0.30700433254241943, + "learning_rate": 2.1994670471067666e-08, + "loss": 0.1769, + "step": 14788 + }, + { + "epoch": 3.935337945715806, + "grad_norm": 0.28384101390838623, + "learning_rate": 2.1984104248801926e-08, + "loss": 0.1787, + "step": 14789 + }, + { + "epoch": 3.93560404470463, + "grad_norm": 0.2555658221244812, + "learning_rate": 2.1973540251708034e-08, + "loss": 0.1671, + "step": 14790 + }, + { + "epoch": 3.935870143693454, + "grad_norm": 0.3567236363887787, + "learning_rate": 2.1962978480087345e-08, + "loss": 0.1854, + "step": 14791 + }, + { + "epoch": 3.9361362426822777, + "grad_norm": 0.3421589136123657, + "learning_rate": 2.195241893424111e-08, + "loss": 0.1776, + "step": 14792 + }, + { + "epoch": 3.936402341671102, + "grad_norm": 0.3324519991874695, + "learning_rate": 2.194186161447048e-08, + "loss": 0.189, + "step": 14793 + }, + { + "epoch": 3.9366684406599255, + "grad_norm": 0.33860230445861816, + "learning_rate": 2.1931306521076588e-08, + "loss": 0.1724, + "step": 14794 + }, + { + "epoch": 3.936934539648749, + "grad_norm": 0.2720561921596527, + "learning_rate": 2.1920753654360456e-08, + "loss": 0.1614, + "step": 14795 + }, + { + "epoch": 3.9372006386375733, + "grad_norm": 0.2751784324645996, + "learning_rate": 2.191020301462313e-08, + "loss": 0.1708, + "step": 14796 + }, + { + "epoch": 3.937466737626397, + "grad_norm": 0.29427531361579895, + "learning_rate": 2.1899654602165463e-08, + "loss": 0.1929, + "step": 14797 + }, + { + "epoch": 3.937732836615221, + "grad_norm": 0.2823130190372467, + "learning_rate": 2.188910841728835e-08, + "loss": 0.161, + "step": 14798 + }, + { + "epoch": 3.937998935604045, + "grad_norm": 0.29385480284690857, + "learning_rate": 2.1878564460292603e-08, + "loss": 0.1794, + "step": 14799 + }, + { + "epoch": 3.9382650345928685, + "grad_norm": 0.36711952090263367, + "learning_rate": 2.1868022731478974e-08, + "loss": 0.1909, + "step": 14800 + }, + { + "epoch": 3.9385311335816926, + "grad_norm": 0.25843968987464905, + "learning_rate": 2.185748323114808e-08, + "loss": 0.1712, + "step": 14801 + }, + { + "epoch": 3.9387972325705163, + "grad_norm": 0.24256955087184906, + "learning_rate": 2.184694595960058e-08, + "loss": 0.1542, + "step": 14802 + }, + { + "epoch": 3.93906333155934, + "grad_norm": 0.41697943210601807, + "learning_rate": 2.183641091713697e-08, + "loss": 0.1732, + "step": 14803 + }, + { + "epoch": 3.939329430548164, + "grad_norm": 0.2960912585258484, + "learning_rate": 2.182587810405777e-08, + "loss": 0.1913, + "step": 14804 + }, + { + "epoch": 3.9395955295369878, + "grad_norm": 0.29293984174728394, + "learning_rate": 2.1815347520663376e-08, + "loss": 0.1824, + "step": 14805 + }, + { + "epoch": 3.9398616285258115, + "grad_norm": 0.2797451913356781, + "learning_rate": 2.1804819167254164e-08, + "loss": 0.1698, + "step": 14806 + }, + { + "epoch": 3.9401277275146356, + "grad_norm": 0.28781744837760925, + "learning_rate": 2.1794293044130397e-08, + "loss": 0.1817, + "step": 14807 + }, + { + "epoch": 3.9403938265034593, + "grad_norm": 0.2976709306240082, + "learning_rate": 2.1783769151592313e-08, + "loss": 0.1727, + "step": 14808 + }, + { + "epoch": 3.940659925492283, + "grad_norm": 0.2808366119861603, + "learning_rate": 2.1773247489940094e-08, + "loss": 0.1744, + "step": 14809 + }, + { + "epoch": 3.940926024481107, + "grad_norm": 0.34413203597068787, + "learning_rate": 2.17627280594738e-08, + "loss": 0.1928, + "step": 14810 + }, + { + "epoch": 3.9411921234699308, + "grad_norm": 0.2666862905025482, + "learning_rate": 2.1752210860493514e-08, + "loss": 0.1777, + "step": 14811 + }, + { + "epoch": 3.9414582224587544, + "grad_norm": 0.26314419507980347, + "learning_rate": 2.174169589329915e-08, + "loss": 0.1684, + "step": 14812 + }, + { + "epoch": 3.9417243214475786, + "grad_norm": 0.3132100999355316, + "learning_rate": 2.173118315819066e-08, + "loss": 0.1731, + "step": 14813 + }, + { + "epoch": 3.9419904204364022, + "grad_norm": 0.30247005820274353, + "learning_rate": 2.1720672655467864e-08, + "loss": 0.1902, + "step": 14814 + }, + { + "epoch": 3.942256519425226, + "grad_norm": 0.2695516347885132, + "learning_rate": 2.1710164385430583e-08, + "loss": 0.1619, + "step": 14815 + }, + { + "epoch": 3.94252261841405, + "grad_norm": 0.3758884370326996, + "learning_rate": 2.169965834837849e-08, + "loss": 0.1841, + "step": 14816 + }, + { + "epoch": 3.9427887174028737, + "grad_norm": 0.2986931800842285, + "learning_rate": 2.1689154544611264e-08, + "loss": 0.1843, + "step": 14817 + }, + { + "epoch": 3.943054816391698, + "grad_norm": 0.30180439352989197, + "learning_rate": 2.167865297442848e-08, + "loss": 0.1753, + "step": 14818 + }, + { + "epoch": 3.9433209153805215, + "grad_norm": 0.34948694705963135, + "learning_rate": 2.166815363812965e-08, + "loss": 0.1836, + "step": 14819 + }, + { + "epoch": 3.9435870143693457, + "grad_norm": 0.36394068598747253, + "learning_rate": 2.165765653601427e-08, + "loss": 0.1802, + "step": 14820 + }, + { + "epoch": 3.9438531133581693, + "grad_norm": 0.3109833300113678, + "learning_rate": 2.164716166838174e-08, + "loss": 0.1737, + "step": 14821 + }, + { + "epoch": 3.944119212346993, + "grad_norm": 0.2663150727748871, + "learning_rate": 2.1636669035531362e-08, + "loss": 0.16, + "step": 14822 + }, + { + "epoch": 3.944385311335817, + "grad_norm": 0.28995421528816223, + "learning_rate": 2.1626178637762448e-08, + "loss": 0.1803, + "step": 14823 + }, + { + "epoch": 3.944651410324641, + "grad_norm": 0.32286179065704346, + "learning_rate": 2.161569047537416e-08, + "loss": 0.1727, + "step": 14824 + }, + { + "epoch": 3.9449175093134645, + "grad_norm": 0.2993650436401367, + "learning_rate": 2.160520454866569e-08, + "loss": 0.1785, + "step": 14825 + }, + { + "epoch": 3.9451836083022886, + "grad_norm": 0.3518449068069458, + "learning_rate": 2.1594720857936067e-08, + "loss": 0.2042, + "step": 14826 + }, + { + "epoch": 3.9454497072911123, + "grad_norm": 0.2799135148525238, + "learning_rate": 2.1584239403484293e-08, + "loss": 0.178, + "step": 14827 + }, + { + "epoch": 3.945715806279936, + "grad_norm": 0.25978344678878784, + "learning_rate": 2.1573760185609425e-08, + "loss": 0.1641, + "step": 14828 + }, + { + "epoch": 3.94598190526876, + "grad_norm": 0.30109280347824097, + "learning_rate": 2.156328320461026e-08, + "loss": 0.1795, + "step": 14829 + }, + { + "epoch": 3.946248004257584, + "grad_norm": 0.275801420211792, + "learning_rate": 2.1552808460785665e-08, + "loss": 0.1674, + "step": 14830 + }, + { + "epoch": 3.9465141032464075, + "grad_norm": 0.27007028460502625, + "learning_rate": 2.1542335954434365e-08, + "loss": 0.1768, + "step": 14831 + }, + { + "epoch": 3.9467802022352316, + "grad_norm": 0.26801779866218567, + "learning_rate": 2.153186568585511e-08, + "loss": 0.1706, + "step": 14832 + }, + { + "epoch": 3.9470463012240553, + "grad_norm": 0.2923491895198822, + "learning_rate": 2.1521397655346462e-08, + "loss": 0.186, + "step": 14833 + }, + { + "epoch": 3.947312400212879, + "grad_norm": 0.4101177752017975, + "learning_rate": 2.151093186320704e-08, + "loss": 0.1915, + "step": 14834 + }, + { + "epoch": 3.947578499201703, + "grad_norm": 0.2794971466064453, + "learning_rate": 2.1500468309735333e-08, + "loss": 0.1796, + "step": 14835 + }, + { + "epoch": 3.947844598190527, + "grad_norm": 0.37438687682151794, + "learning_rate": 2.1490006995229816e-08, + "loss": 0.1924, + "step": 14836 + }, + { + "epoch": 3.9481106971793505, + "grad_norm": 0.2845936417579651, + "learning_rate": 2.1479547919988817e-08, + "loss": 0.1853, + "step": 14837 + }, + { + "epoch": 3.9483767961681746, + "grad_norm": 0.28328418731689453, + "learning_rate": 2.1469091084310686e-08, + "loss": 0.178, + "step": 14838 + }, + { + "epoch": 3.9486428951569983, + "grad_norm": 0.385530948638916, + "learning_rate": 2.1458636488493642e-08, + "loss": 0.1785, + "step": 14839 + }, + { + "epoch": 3.948908994145822, + "grad_norm": 0.2679336369037628, + "learning_rate": 2.1448184132835922e-08, + "loss": 0.1588, + "step": 14840 + }, + { + "epoch": 3.949175093134646, + "grad_norm": 0.273971825838089, + "learning_rate": 2.1437734017635555e-08, + "loss": 0.1825, + "step": 14841 + }, + { + "epoch": 3.94944119212347, + "grad_norm": 0.35056743025779724, + "learning_rate": 2.1427286143190714e-08, + "loss": 0.1551, + "step": 14842 + }, + { + "epoch": 3.949707291112294, + "grad_norm": 0.3194335699081421, + "learning_rate": 2.1416840509799317e-08, + "loss": 0.1854, + "step": 14843 + }, + { + "epoch": 3.9499733901011176, + "grad_norm": 0.29101675748825073, + "learning_rate": 2.1406397117759335e-08, + "loss": 0.1858, + "step": 14844 + }, + { + "epoch": 3.9502394890899417, + "grad_norm": 0.6012185215950012, + "learning_rate": 2.1395955967368596e-08, + "loss": 0.1834, + "step": 14845 + }, + { + "epoch": 3.9505055880787654, + "grad_norm": 0.305202454328537, + "learning_rate": 2.1385517058924928e-08, + "loss": 0.1905, + "step": 14846 + }, + { + "epoch": 3.950771687067589, + "grad_norm": 0.4792887568473816, + "learning_rate": 2.1375080392726085e-08, + "loss": 0.1688, + "step": 14847 + }, + { + "epoch": 3.951037786056413, + "grad_norm": 0.3070279061794281, + "learning_rate": 2.1364645969069695e-08, + "loss": 0.1969, + "step": 14848 + }, + { + "epoch": 3.951303885045237, + "grad_norm": 0.29795733094215393, + "learning_rate": 2.1354213788253406e-08, + "loss": 0.1914, + "step": 14849 + }, + { + "epoch": 3.9515699840340606, + "grad_norm": 0.2941209375858307, + "learning_rate": 2.134378385057475e-08, + "loss": 0.1668, + "step": 14850 + }, + { + "epoch": 3.9518360830228847, + "grad_norm": 0.35653582215309143, + "learning_rate": 2.133335615633124e-08, + "loss": 0.1734, + "step": 14851 + }, + { + "epoch": 3.9521021820117084, + "grad_norm": 0.37636083364486694, + "learning_rate": 2.1322930705820242e-08, + "loss": 0.1792, + "step": 14852 + }, + { + "epoch": 3.952368281000532, + "grad_norm": 0.27820074558258057, + "learning_rate": 2.1312507499339173e-08, + "loss": 0.1768, + "step": 14853 + }, + { + "epoch": 3.952634379989356, + "grad_norm": 0.3115555942058563, + "learning_rate": 2.130208653718526e-08, + "loss": 0.1798, + "step": 14854 + }, + { + "epoch": 3.95290047897818, + "grad_norm": 0.2829173803329468, + "learning_rate": 2.1291667819655766e-08, + "loss": 0.1837, + "step": 14855 + }, + { + "epoch": 3.9531665779670035, + "grad_norm": 0.2660953402519226, + "learning_rate": 2.1281251347047846e-08, + "loss": 0.1729, + "step": 14856 + }, + { + "epoch": 3.9534326769558277, + "grad_norm": 0.27960771322250366, + "learning_rate": 2.1270837119658624e-08, + "loss": 0.1617, + "step": 14857 + }, + { + "epoch": 3.9536987759446514, + "grad_norm": 0.2728317081928253, + "learning_rate": 2.126042513778509e-08, + "loss": 0.1641, + "step": 14858 + }, + { + "epoch": 3.953964874933475, + "grad_norm": 0.347551167011261, + "learning_rate": 2.1250015401724265e-08, + "loss": 0.183, + "step": 14859 + }, + { + "epoch": 3.954230973922299, + "grad_norm": 0.2805548310279846, + "learning_rate": 2.1239607911773006e-08, + "loss": 0.1679, + "step": 14860 + }, + { + "epoch": 3.954497072911123, + "grad_norm": 0.281571626663208, + "learning_rate": 2.1229202668228196e-08, + "loss": 0.1824, + "step": 14861 + }, + { + "epoch": 3.9547631718999465, + "grad_norm": 0.2956030070781708, + "learning_rate": 2.121879967138658e-08, + "loss": 0.1964, + "step": 14862 + }, + { + "epoch": 3.9550292708887707, + "grad_norm": 0.26169630885124207, + "learning_rate": 2.120839892154488e-08, + "loss": 0.1744, + "step": 14863 + }, + { + "epoch": 3.9552953698775943, + "grad_norm": 0.27551552653312683, + "learning_rate": 2.1198000418999752e-08, + "loss": 0.1802, + "step": 14864 + }, + { + "epoch": 3.9555614688664185, + "grad_norm": 0.3085433542728424, + "learning_rate": 2.118760416404779e-08, + "loss": 0.1654, + "step": 14865 + }, + { + "epoch": 3.955827567855242, + "grad_norm": 0.28300756216049194, + "learning_rate": 2.1177210156985535e-08, + "loss": 0.1744, + "step": 14866 + }, + { + "epoch": 3.9560936668440663, + "grad_norm": 0.33398956060409546, + "learning_rate": 2.1166818398109398e-08, + "loss": 0.1649, + "step": 14867 + }, + { + "epoch": 3.95635976583289, + "grad_norm": 0.31986960768699646, + "learning_rate": 2.1156428887715826e-08, + "loss": 0.1877, + "step": 14868 + }, + { + "epoch": 3.9566258648217136, + "grad_norm": 0.285013884305954, + "learning_rate": 2.1146041626101085e-08, + "loss": 0.1757, + "step": 14869 + }, + { + "epoch": 3.9568919638105378, + "grad_norm": 0.36156952381134033, + "learning_rate": 2.1135656613561482e-08, + "loss": 0.1758, + "step": 14870 + }, + { + "epoch": 3.9571580627993614, + "grad_norm": 0.2925887405872345, + "learning_rate": 2.112527385039321e-08, + "loss": 0.2001, + "step": 14871 + }, + { + "epoch": 3.957424161788185, + "grad_norm": 0.2763109505176544, + "learning_rate": 2.111489333689245e-08, + "loss": 0.172, + "step": 14872 + }, + { + "epoch": 3.9576902607770093, + "grad_norm": 0.3252219557762146, + "learning_rate": 2.1104515073355188e-08, + "loss": 0.1666, + "step": 14873 + }, + { + "epoch": 3.957956359765833, + "grad_norm": 0.3024168014526367, + "learning_rate": 2.109413906007751e-08, + "loss": 0.1738, + "step": 14874 + }, + { + "epoch": 3.9582224587546566, + "grad_norm": 0.27298638224601746, + "learning_rate": 2.1083765297355303e-08, + "loss": 0.1662, + "step": 14875 + }, + { + "epoch": 3.9584885577434807, + "grad_norm": 0.2751539349555969, + "learning_rate": 2.1073393785484518e-08, + "loss": 0.1687, + "step": 14876 + }, + { + "epoch": 3.9587546567323044, + "grad_norm": 0.25779423117637634, + "learning_rate": 2.1063024524760885e-08, + "loss": 0.1665, + "step": 14877 + }, + { + "epoch": 3.959020755721128, + "grad_norm": 0.25792232155799866, + "learning_rate": 2.105265751548021e-08, + "loss": 0.1566, + "step": 14878 + }, + { + "epoch": 3.9592868547099522, + "grad_norm": 0.3515424430370331, + "learning_rate": 2.1042292757938162e-08, + "loss": 0.1769, + "step": 14879 + }, + { + "epoch": 3.959552953698776, + "grad_norm": 0.37001240253448486, + "learning_rate": 2.1031930252430407e-08, + "loss": 0.1639, + "step": 14880 + }, + { + "epoch": 3.9598190526875996, + "grad_norm": 0.30049780011177063, + "learning_rate": 2.1021569999252444e-08, + "loss": 0.1929, + "step": 14881 + }, + { + "epoch": 3.9600851516764237, + "grad_norm": 0.3463074862957001, + "learning_rate": 2.1011211998699795e-08, + "loss": 0.1731, + "step": 14882 + }, + { + "epoch": 3.9603512506652474, + "grad_norm": 0.27512630820274353, + "learning_rate": 2.1000856251067922e-08, + "loss": 0.1755, + "step": 14883 + }, + { + "epoch": 3.960617349654071, + "grad_norm": 0.2810750901699066, + "learning_rate": 2.099050275665213e-08, + "loss": 0.1752, + "step": 14884 + }, + { + "epoch": 3.960883448642895, + "grad_norm": 0.3501087725162506, + "learning_rate": 2.0980151515747756e-08, + "loss": 0.1896, + "step": 14885 + }, + { + "epoch": 3.961149547631719, + "grad_norm": 0.27759528160095215, + "learning_rate": 2.096980252865005e-08, + "loss": 0.1519, + "step": 14886 + }, + { + "epoch": 3.9614156466205426, + "grad_norm": 0.3479222059249878, + "learning_rate": 2.095945579565418e-08, + "loss": 0.1643, + "step": 14887 + }, + { + "epoch": 3.9616817456093667, + "grad_norm": 0.3134611248970032, + "learning_rate": 2.0949111317055235e-08, + "loss": 0.1673, + "step": 14888 + }, + { + "epoch": 3.9619478445981904, + "grad_norm": 0.26499247550964355, + "learning_rate": 2.09387690931483e-08, + "loss": 0.1649, + "step": 14889 + }, + { + "epoch": 3.9622139435870145, + "grad_norm": 0.3285839855670929, + "learning_rate": 2.09284291242283e-08, + "loss": 0.1676, + "step": 14890 + }, + { + "epoch": 3.962480042575838, + "grad_norm": 2.489605188369751, + "learning_rate": 2.0918091410590177e-08, + "loss": 0.1751, + "step": 14891 + }, + { + "epoch": 3.9627461415646623, + "grad_norm": 0.2739814221858978, + "learning_rate": 2.0907755952528805e-08, + "loss": 0.181, + "step": 14892 + }, + { + "epoch": 3.963012240553486, + "grad_norm": 0.2908601760864258, + "learning_rate": 2.0897422750338965e-08, + "loss": 0.1901, + "step": 14893 + }, + { + "epoch": 3.9632783395423097, + "grad_norm": 0.287056565284729, + "learning_rate": 2.088709180431536e-08, + "loss": 0.1787, + "step": 14894 + }, + { + "epoch": 3.963544438531134, + "grad_norm": 0.2716057598590851, + "learning_rate": 2.087676311475268e-08, + "loss": 0.1625, + "step": 14895 + }, + { + "epoch": 3.9638105375199575, + "grad_norm": 0.47627395391464233, + "learning_rate": 2.0866436681945478e-08, + "loss": 0.1842, + "step": 14896 + }, + { + "epoch": 3.964076636508781, + "grad_norm": 0.25883087515830994, + "learning_rate": 2.085611250618834e-08, + "loss": 0.1604, + "step": 14897 + }, + { + "epoch": 3.9643427354976053, + "grad_norm": 0.2941405177116394, + "learning_rate": 2.084579058777567e-08, + "loss": 0.1772, + "step": 14898 + }, + { + "epoch": 3.964608834486429, + "grad_norm": 0.29369696974754333, + "learning_rate": 2.0835470927001895e-08, + "loss": 0.1889, + "step": 14899 + }, + { + "epoch": 3.9648749334752527, + "grad_norm": 0.33833637833595276, + "learning_rate": 2.0825153524161377e-08, + "loss": 0.1886, + "step": 14900 + }, + { + "epoch": 3.965141032464077, + "grad_norm": 0.487179160118103, + "learning_rate": 2.081483837954836e-08, + "loss": 0.2079, + "step": 14901 + }, + { + "epoch": 3.9654071314529005, + "grad_norm": 0.2630009949207306, + "learning_rate": 2.0804525493457093e-08, + "loss": 0.1616, + "step": 14902 + }, + { + "epoch": 3.965673230441724, + "grad_norm": 0.2595894932746887, + "learning_rate": 2.079421486618166e-08, + "loss": 0.1736, + "step": 14903 + }, + { + "epoch": 3.9659393294305483, + "grad_norm": 0.30212166905403137, + "learning_rate": 2.07839064980162e-08, + "loss": 0.1931, + "step": 14904 + }, + { + "epoch": 3.966205428419372, + "grad_norm": 0.3401106297969818, + "learning_rate": 2.0773600389254685e-08, + "loss": 0.1804, + "step": 14905 + }, + { + "epoch": 3.9664715274081956, + "grad_norm": 0.36106348037719727, + "learning_rate": 2.0763296540191077e-08, + "loss": 0.1614, + "step": 14906 + }, + { + "epoch": 3.9667376263970198, + "grad_norm": 0.31388020515441895, + "learning_rate": 2.0752994951119263e-08, + "loss": 0.1893, + "step": 14907 + }, + { + "epoch": 3.9670037253858434, + "grad_norm": 0.2902831733226776, + "learning_rate": 2.0742695622333107e-08, + "loss": 0.1766, + "step": 14908 + }, + { + "epoch": 3.967269824374667, + "grad_norm": 0.3635363280773163, + "learning_rate": 2.0732398554126295e-08, + "loss": 0.183, + "step": 14909 + }, + { + "epoch": 3.9675359233634913, + "grad_norm": 0.36846137046813965, + "learning_rate": 2.0722103746792585e-08, + "loss": 0.1654, + "step": 14910 + }, + { + "epoch": 3.967802022352315, + "grad_norm": 0.2632080018520355, + "learning_rate": 2.071181120062555e-08, + "loss": 0.1612, + "step": 14911 + }, + { + "epoch": 3.968068121341139, + "grad_norm": 0.3275376260280609, + "learning_rate": 2.0701520915918813e-08, + "loss": 0.2096, + "step": 14912 + }, + { + "epoch": 3.9683342203299627, + "grad_norm": 0.45764341950416565, + "learning_rate": 2.0691232892965805e-08, + "loss": 0.1948, + "step": 14913 + }, + { + "epoch": 3.9686003193187864, + "grad_norm": 0.3721349239349365, + "learning_rate": 2.068094713206e-08, + "loss": 0.1698, + "step": 14914 + }, + { + "epoch": 3.9688664183076106, + "grad_norm": 0.2568158507347107, + "learning_rate": 2.067066363349478e-08, + "loss": 0.1579, + "step": 14915 + }, + { + "epoch": 3.9691325172964342, + "grad_norm": 0.36015287041664124, + "learning_rate": 2.0660382397563447e-08, + "loss": 0.2022, + "step": 14916 + }, + { + "epoch": 3.9693986162852584, + "grad_norm": 0.2534208595752716, + "learning_rate": 2.0650103424559217e-08, + "loss": 0.1583, + "step": 14917 + }, + { + "epoch": 3.969664715274082, + "grad_norm": 0.31926295161247253, + "learning_rate": 2.0639826714775276e-08, + "loss": 0.1883, + "step": 14918 + }, + { + "epoch": 3.9699308142629057, + "grad_norm": 0.2573254704475403, + "learning_rate": 2.062955226850478e-08, + "loss": 0.1662, + "step": 14919 + }, + { + "epoch": 3.97019691325173, + "grad_norm": 0.2772377133369446, + "learning_rate": 2.0619280086040714e-08, + "loss": 0.1835, + "step": 14920 + }, + { + "epoch": 3.9704630122405535, + "grad_norm": 0.294111430644989, + "learning_rate": 2.0609010167676087e-08, + "loss": 0.1771, + "step": 14921 + }, + { + "epoch": 3.970729111229377, + "grad_norm": 0.2738526463508606, + "learning_rate": 2.0598742513703825e-08, + "loss": 0.1717, + "step": 14922 + }, + { + "epoch": 3.9709952102182013, + "grad_norm": 0.2702401280403137, + "learning_rate": 2.0588477124416804e-08, + "loss": 0.1557, + "step": 14923 + }, + { + "epoch": 3.971261309207025, + "grad_norm": 0.33606022596359253, + "learning_rate": 2.057821400010775e-08, + "loss": 0.1637, + "step": 14924 + }, + { + "epoch": 3.9715274081958487, + "grad_norm": 0.2871153950691223, + "learning_rate": 2.056795314106946e-08, + "loss": 0.1684, + "step": 14925 + }, + { + "epoch": 3.971793507184673, + "grad_norm": 0.27731361985206604, + "learning_rate": 2.0557694547594528e-08, + "loss": 0.1783, + "step": 14926 + }, + { + "epoch": 3.9720596061734965, + "grad_norm": 0.2800666093826294, + "learning_rate": 2.054743821997562e-08, + "loss": 0.1672, + "step": 14927 + }, + { + "epoch": 3.97232570516232, + "grad_norm": 0.3080366253852844, + "learning_rate": 2.0537184158505172e-08, + "loss": 0.1887, + "step": 14928 + }, + { + "epoch": 3.9725918041511443, + "grad_norm": 0.36590775847435, + "learning_rate": 2.0526932363475758e-08, + "loss": 0.1923, + "step": 14929 + }, + { + "epoch": 3.972857903139968, + "grad_norm": 0.3453121781349182, + "learning_rate": 2.05166828351797e-08, + "loss": 0.1737, + "step": 14930 + }, + { + "epoch": 3.9731240021287917, + "grad_norm": 0.25448545813560486, + "learning_rate": 2.05064355739094e-08, + "loss": 0.1672, + "step": 14931 + }, + { + "epoch": 3.973390101117616, + "grad_norm": 0.3772278130054474, + "learning_rate": 2.049619057995706e-08, + "loss": 0.1803, + "step": 14932 + }, + { + "epoch": 3.9736562001064395, + "grad_norm": 0.33173906803131104, + "learning_rate": 2.048594785361496e-08, + "loss": 0.1923, + "step": 14933 + }, + { + "epoch": 3.973922299095263, + "grad_norm": 0.3510711193084717, + "learning_rate": 2.0475707395175178e-08, + "loss": 0.1919, + "step": 14934 + }, + { + "epoch": 3.9741883980840873, + "grad_norm": 0.28621459007263184, + "learning_rate": 2.046546920492982e-08, + "loss": 0.1923, + "step": 14935 + }, + { + "epoch": 3.974454497072911, + "grad_norm": 0.3024296462535858, + "learning_rate": 2.04552332831709e-08, + "loss": 0.1827, + "step": 14936 + }, + { + "epoch": 3.974720596061735, + "grad_norm": 0.2713222801685333, + "learning_rate": 2.0444999630190374e-08, + "loss": 0.1647, + "step": 14937 + }, + { + "epoch": 3.974986695050559, + "grad_norm": 0.3166968822479248, + "learning_rate": 2.0434768246280143e-08, + "loss": 0.1708, + "step": 14938 + }, + { + "epoch": 3.975252794039383, + "grad_norm": 0.30845409631729126, + "learning_rate": 2.0424539131731988e-08, + "loss": 0.1702, + "step": 14939 + }, + { + "epoch": 3.9755188930282066, + "grad_norm": 0.3821154832839966, + "learning_rate": 2.04143122868377e-08, + "loss": 0.182, + "step": 14940 + }, + { + "epoch": 3.9757849920170303, + "grad_norm": 0.28279730677604675, + "learning_rate": 2.040408771188893e-08, + "loss": 0.1765, + "step": 14941 + }, + { + "epoch": 3.9760510910058544, + "grad_norm": 0.27154314517974854, + "learning_rate": 2.0393865407177324e-08, + "loss": 0.1744, + "step": 14942 + }, + { + "epoch": 3.976317189994678, + "grad_norm": 0.3341352343559265, + "learning_rate": 2.038364537299445e-08, + "loss": 0.1859, + "step": 14943 + }, + { + "epoch": 3.9765832889835018, + "grad_norm": 0.46015265583992004, + "learning_rate": 2.037342760963182e-08, + "loss": 0.199, + "step": 14944 + }, + { + "epoch": 3.976849387972326, + "grad_norm": 0.2618098855018616, + "learning_rate": 2.0363212117380813e-08, + "loss": 0.1647, + "step": 14945 + }, + { + "epoch": 3.9771154869611496, + "grad_norm": 0.256656289100647, + "learning_rate": 2.0352998896532858e-08, + "loss": 0.16, + "step": 14946 + }, + { + "epoch": 3.9773815859499733, + "grad_norm": 0.2748206853866577, + "learning_rate": 2.034278794737919e-08, + "loss": 0.1695, + "step": 14947 + }, + { + "epoch": 3.9776476849387974, + "grad_norm": 0.350935161113739, + "learning_rate": 2.0332579270211116e-08, + "loss": 0.1957, + "step": 14948 + }, + { + "epoch": 3.977913783927621, + "grad_norm": 0.3011494278907776, + "learning_rate": 2.0322372865319748e-08, + "loss": 0.1739, + "step": 14949 + }, + { + "epoch": 3.9781798829164448, + "grad_norm": 0.2938629388809204, + "learning_rate": 2.0312168732996214e-08, + "loss": 0.1686, + "step": 14950 + }, + { + "epoch": 3.978445981905269, + "grad_norm": 0.35209891200065613, + "learning_rate": 2.0301966873531564e-08, + "loss": 0.1841, + "step": 14951 + }, + { + "epoch": 3.9787120808940926, + "grad_norm": 0.2750852406024933, + "learning_rate": 2.0291767287216788e-08, + "loss": 0.166, + "step": 14952 + }, + { + "epoch": 3.9789781798829162, + "grad_norm": 0.394011914730072, + "learning_rate": 2.028156997434277e-08, + "loss": 0.1705, + "step": 14953 + }, + { + "epoch": 3.9792442788717404, + "grad_norm": 0.2735658884048462, + "learning_rate": 2.027137493520037e-08, + "loss": 0.1778, + "step": 14954 + }, + { + "epoch": 3.979510377860564, + "grad_norm": 0.6184704899787903, + "learning_rate": 2.0261182170080395e-08, + "loss": 0.1808, + "step": 14955 + }, + { + "epoch": 3.9797764768493877, + "grad_norm": 0.34698575735092163, + "learning_rate": 2.025099167927352e-08, + "loss": 0.1716, + "step": 14956 + }, + { + "epoch": 3.980042575838212, + "grad_norm": 0.2886190116405487, + "learning_rate": 2.0240803463070423e-08, + "loss": 0.1819, + "step": 14957 + }, + { + "epoch": 3.9803086748270355, + "grad_norm": 0.30214163661003113, + "learning_rate": 2.0230617521761682e-08, + "loss": 0.1745, + "step": 14958 + }, + { + "epoch": 3.9805747738158592, + "grad_norm": 0.28673893213272095, + "learning_rate": 2.0220433855637853e-08, + "loss": 0.1745, + "step": 14959 + }, + { + "epoch": 3.9808408728046834, + "grad_norm": 0.2889816462993622, + "learning_rate": 2.0210252464989352e-08, + "loss": 0.1699, + "step": 14960 + }, + { + "epoch": 3.981106971793507, + "grad_norm": 0.3377097249031067, + "learning_rate": 2.0200073350106615e-08, + "loss": 0.1757, + "step": 14961 + }, + { + "epoch": 3.981373070782331, + "grad_norm": 0.4061448574066162, + "learning_rate": 2.0189896511279914e-08, + "loss": 0.1683, + "step": 14962 + }, + { + "epoch": 3.981639169771155, + "grad_norm": 0.2968675196170807, + "learning_rate": 2.0179721948799588e-08, + "loss": 0.1948, + "step": 14963 + }, + { + "epoch": 3.981905268759979, + "grad_norm": 0.2596985399723053, + "learning_rate": 2.0169549662955754e-08, + "loss": 0.1647, + "step": 14964 + }, + { + "epoch": 3.9821713677488026, + "grad_norm": 0.25773605704307556, + "learning_rate": 2.015937965403859e-08, + "loss": 0.1708, + "step": 14965 + }, + { + "epoch": 3.9824374667376263, + "grad_norm": 0.2631385028362274, + "learning_rate": 2.0149211922338162e-08, + "loss": 0.1675, + "step": 14966 + }, + { + "epoch": 3.9827035657264505, + "grad_norm": 0.274652361869812, + "learning_rate": 2.0139046468144506e-08, + "loss": 0.1574, + "step": 14967 + }, + { + "epoch": 3.982969664715274, + "grad_norm": 0.37402814626693726, + "learning_rate": 2.01288832917475e-08, + "loss": 0.1904, + "step": 14968 + }, + { + "epoch": 3.983235763704098, + "grad_norm": 0.2905345857143402, + "learning_rate": 2.0118722393437083e-08, + "loss": 0.185, + "step": 14969 + }, + { + "epoch": 3.983501862692922, + "grad_norm": 0.34723302721977234, + "learning_rate": 2.0108563773502985e-08, + "loss": 0.1667, + "step": 14970 + }, + { + "epoch": 3.9837679616817456, + "grad_norm": 0.3919961452484131, + "learning_rate": 2.009840743223501e-08, + "loss": 0.1869, + "step": 14971 + }, + { + "epoch": 3.9840340606705693, + "grad_norm": 0.2847253680229187, + "learning_rate": 2.0088253369922815e-08, + "loss": 0.1805, + "step": 14972 + }, + { + "epoch": 3.9843001596593934, + "grad_norm": 0.358411580324173, + "learning_rate": 2.0078101586856033e-08, + "loss": 0.1812, + "step": 14973 + }, + { + "epoch": 3.984566258648217, + "grad_norm": 0.333163321018219, + "learning_rate": 2.0067952083324225e-08, + "loss": 0.178, + "step": 14974 + }, + { + "epoch": 3.984832357637041, + "grad_norm": 0.2779621183872223, + "learning_rate": 2.0057804859616823e-08, + "loss": 0.166, + "step": 14975 + }, + { + "epoch": 3.985098456625865, + "grad_norm": 0.34662893414497375, + "learning_rate": 2.0047659916023297e-08, + "loss": 0.1933, + "step": 14976 + }, + { + "epoch": 3.9853645556146886, + "grad_norm": 0.29872986674308777, + "learning_rate": 2.003751725283297e-08, + "loss": 0.1745, + "step": 14977 + }, + { + "epoch": 3.9856306546035123, + "grad_norm": 0.2876844108104706, + "learning_rate": 2.0027376870335157e-08, + "loss": 0.1695, + "step": 14978 + }, + { + "epoch": 3.9858967535923364, + "grad_norm": 0.2755703926086426, + "learning_rate": 2.0017238768819023e-08, + "loss": 0.1743, + "step": 14979 + }, + { + "epoch": 3.98616285258116, + "grad_norm": 0.29252755641937256, + "learning_rate": 2.000710294857382e-08, + "loss": 0.1682, + "step": 14980 + }, + { + "epoch": 3.986428951569984, + "grad_norm": 0.26921162009239197, + "learning_rate": 1.9996969409888564e-08, + "loss": 0.1666, + "step": 14981 + }, + { + "epoch": 3.986695050558808, + "grad_norm": 1.81047523021698, + "learning_rate": 1.9986838153052342e-08, + "loss": 0.1894, + "step": 14982 + }, + { + "epoch": 3.9869611495476316, + "grad_norm": 0.2748053967952728, + "learning_rate": 1.9976709178354077e-08, + "loss": 0.1811, + "step": 14983 + }, + { + "epoch": 3.9872272485364557, + "grad_norm": 0.3637252748012543, + "learning_rate": 1.9966582486082696e-08, + "loss": 0.1861, + "step": 14984 + }, + { + "epoch": 3.9874933475252794, + "grad_norm": 0.2845066487789154, + "learning_rate": 1.9956458076526995e-08, + "loss": 0.1675, + "step": 14985 + }, + { + "epoch": 3.9877594465141035, + "grad_norm": 0.3368176519870758, + "learning_rate": 1.994633594997577e-08, + "loss": 0.1751, + "step": 14986 + }, + { + "epoch": 3.988025545502927, + "grad_norm": 0.3405209183692932, + "learning_rate": 1.9936216106717708e-08, + "loss": 0.185, + "step": 14987 + }, + { + "epoch": 3.988291644491751, + "grad_norm": 0.2930508553981781, + "learning_rate": 1.9926098547041493e-08, + "loss": 0.1713, + "step": 14988 + }, + { + "epoch": 3.988557743480575, + "grad_norm": 0.3027922809123993, + "learning_rate": 1.9915983271235635e-08, + "loss": 0.1746, + "step": 14989 + }, + { + "epoch": 3.9888238424693987, + "grad_norm": 0.2852058708667755, + "learning_rate": 1.990587027958871e-08, + "loss": 0.1734, + "step": 14990 + }, + { + "epoch": 3.9890899414582224, + "grad_norm": 0.2736283838748932, + "learning_rate": 1.9895759572389093e-08, + "loss": 0.1633, + "step": 14991 + }, + { + "epoch": 3.9893560404470465, + "grad_norm": 0.25719210505485535, + "learning_rate": 1.9885651149925187e-08, + "loss": 0.1593, + "step": 14992 + }, + { + "epoch": 3.98962213943587, + "grad_norm": 0.26474958658218384, + "learning_rate": 1.9875545012485318e-08, + "loss": 0.1807, + "step": 14993 + }, + { + "epoch": 3.989888238424694, + "grad_norm": 0.2598131000995636, + "learning_rate": 1.986544116035772e-08, + "loss": 0.1713, + "step": 14994 + }, + { + "epoch": 3.990154337413518, + "grad_norm": 0.2970253527164459, + "learning_rate": 1.985533959383061e-08, + "loss": 0.1782, + "step": 14995 + }, + { + "epoch": 3.9904204364023417, + "grad_norm": 0.2747269868850708, + "learning_rate": 1.9845240313192047e-08, + "loss": 0.1807, + "step": 14996 + }, + { + "epoch": 3.9906865353911654, + "grad_norm": 0.4240647554397583, + "learning_rate": 1.983514331873015e-08, + "loss": 0.1618, + "step": 14997 + }, + { + "epoch": 3.9909526343799895, + "grad_norm": 0.27255529165267944, + "learning_rate": 1.982504861073283e-08, + "loss": 0.1757, + "step": 14998 + }, + { + "epoch": 3.991218733368813, + "grad_norm": 0.3518223464488983, + "learning_rate": 1.9814956189488075e-08, + "loss": 0.1866, + "step": 14999 + }, + { + "epoch": 3.991484832357637, + "grad_norm": 0.28625333309173584, + "learning_rate": 1.9804866055283688e-08, + "loss": 0.1678, + "step": 15000 + }, + { + "epoch": 3.991750931346461, + "grad_norm": 0.2731756269931793, + "learning_rate": 1.9794778208407492e-08, + "loss": 0.1595, + "step": 15001 + }, + { + "epoch": 3.9920170303352847, + "grad_norm": 0.42369598150253296, + "learning_rate": 1.9784692649147194e-08, + "loss": 0.1967, + "step": 15002 + }, + { + "epoch": 3.9922831293241083, + "grad_norm": 0.2659044861793518, + "learning_rate": 1.9774609377790506e-08, + "loss": 0.164, + "step": 15003 + }, + { + "epoch": 3.9925492283129325, + "grad_norm": 0.30545270442962646, + "learning_rate": 1.9764528394624948e-08, + "loss": 0.1682, + "step": 15004 + }, + { + "epoch": 3.992815327301756, + "grad_norm": 0.2899221181869507, + "learning_rate": 1.9754449699938126e-08, + "loss": 0.1852, + "step": 15005 + }, + { + "epoch": 3.99308142629058, + "grad_norm": 0.33151134848594666, + "learning_rate": 1.9744373294017426e-08, + "loss": 0.1921, + "step": 15006 + }, + { + "epoch": 3.993347525279404, + "grad_norm": 0.36843711137771606, + "learning_rate": 1.9734299177150293e-08, + "loss": 0.1991, + "step": 15007 + }, + { + "epoch": 3.9936136242682276, + "grad_norm": 0.29691240191459656, + "learning_rate": 1.9724227349624044e-08, + "loss": 0.1904, + "step": 15008 + }, + { + "epoch": 3.9938797232570518, + "grad_norm": 0.2883864939212799, + "learning_rate": 1.9714157811725974e-08, + "loss": 0.177, + "step": 15009 + }, + { + "epoch": 3.9941458222458754, + "grad_norm": 0.2828022241592407, + "learning_rate": 1.9704090563743292e-08, + "loss": 0.1902, + "step": 15010 + }, + { + "epoch": 3.9944119212346996, + "grad_norm": 0.27543455362319946, + "learning_rate": 1.9694025605963094e-08, + "loss": 0.1788, + "step": 15011 + }, + { + "epoch": 3.9946780202235233, + "grad_norm": 0.7754088640213013, + "learning_rate": 1.9683962938672493e-08, + "loss": 0.1924, + "step": 15012 + }, + { + "epoch": 3.994944119212347, + "grad_norm": 0.36310750246047974, + "learning_rate": 1.9673902562158462e-08, + "loss": 0.166, + "step": 15013 + }, + { + "epoch": 3.995210218201171, + "grad_norm": 0.3567225933074951, + "learning_rate": 1.9663844476707982e-08, + "loss": 0.1942, + "step": 15014 + }, + { + "epoch": 3.9954763171899947, + "grad_norm": 0.2927091419696808, + "learning_rate": 1.9653788682607875e-08, + "loss": 0.1746, + "step": 15015 + }, + { + "epoch": 3.9957424161788184, + "grad_norm": 0.31012946367263794, + "learning_rate": 1.9643735180144994e-08, + "loss": 0.1729, + "step": 15016 + }, + { + "epoch": 3.9960085151676425, + "grad_norm": 0.3401484191417694, + "learning_rate": 1.9633683969606073e-08, + "loss": 0.1663, + "step": 15017 + }, + { + "epoch": 3.9962746141564662, + "grad_norm": 0.27599748969078064, + "learning_rate": 1.9623635051277833e-08, + "loss": 0.1708, + "step": 15018 + }, + { + "epoch": 3.99654071314529, + "grad_norm": 0.2666580080986023, + "learning_rate": 1.9613588425446826e-08, + "loss": 0.1706, + "step": 15019 + }, + { + "epoch": 3.996806812134114, + "grad_norm": 0.3235085904598236, + "learning_rate": 1.960354409239965e-08, + "loss": 0.1676, + "step": 15020 + }, + { + "epoch": 3.9970729111229377, + "grad_norm": 0.562934398651123, + "learning_rate": 1.959350205242275e-08, + "loss": 0.218, + "step": 15021 + }, + { + "epoch": 3.9973390101117614, + "grad_norm": 0.40412282943725586, + "learning_rate": 1.9583462305802557e-08, + "loss": 0.1709, + "step": 15022 + }, + { + "epoch": 3.9976051091005855, + "grad_norm": 0.3105832040309906, + "learning_rate": 1.9573424852825437e-08, + "loss": 0.1715, + "step": 15023 + }, + { + "epoch": 3.997871208089409, + "grad_norm": 0.3244423270225525, + "learning_rate": 1.9563389693777697e-08, + "loss": 0.1986, + "step": 15024 + }, + { + "epoch": 3.998137307078233, + "grad_norm": 0.28808853030204773, + "learning_rate": 1.9553356828945522e-08, + "loss": 0.1892, + "step": 15025 + }, + { + "epoch": 3.998403406067057, + "grad_norm": 0.4618569016456604, + "learning_rate": 1.95433262586151e-08, + "loss": 0.1861, + "step": 15026 + }, + { + "epoch": 3.9986695050558807, + "grad_norm": 0.2859199643135071, + "learning_rate": 1.9533297983072496e-08, + "loss": 0.1723, + "step": 15027 + }, + { + "epoch": 3.9989356040447044, + "grad_norm": 0.2623538672924042, + "learning_rate": 1.9523272002603742e-08, + "loss": 0.1703, + "step": 15028 + }, + { + "epoch": 3.9992017030335285, + "grad_norm": 0.3979734182357788, + "learning_rate": 1.9513248317494836e-08, + "loss": 0.1924, + "step": 15029 + }, + { + "epoch": 3.999467802022352, + "grad_norm": 0.2758195400238037, + "learning_rate": 1.950322692803159e-08, + "loss": 0.1773, + "step": 15030 + }, + { + "epoch": 3.9997339010111763, + "grad_norm": 0.2957785427570343, + "learning_rate": 1.9493207834499938e-08, + "loss": 0.189, + "step": 15031 + }, + { + "epoch": 4.0, + "grad_norm": 0.3013212978839874, + "learning_rate": 1.9483191037185575e-08, + "loss": 0.1668, + "step": 15032 + }, + { + "epoch": 4.000266098988824, + "grad_norm": 0.25059399008750916, + "learning_rate": 1.947317653637425e-08, + "loss": 0.1833, + "step": 15033 + }, + { + "epoch": 4.000532197977647, + "grad_norm": 0.2845379114151001, + "learning_rate": 1.9463164332351535e-08, + "loss": 0.1803, + "step": 15034 + }, + { + "epoch": 4.0007982969664715, + "grad_norm": 0.35375016927719116, + "learning_rate": 1.9453154425403063e-08, + "loss": 0.1948, + "step": 15035 + }, + { + "epoch": 4.001064395955296, + "grad_norm": 0.3040436804294586, + "learning_rate": 1.9443146815814282e-08, + "loss": 0.1646, + "step": 15036 + }, + { + "epoch": 4.001330494944119, + "grad_norm": 0.3930593729019165, + "learning_rate": 1.9433141503870643e-08, + "loss": 0.1948, + "step": 15037 + }, + { + "epoch": 4.001596593932943, + "grad_norm": 0.34940028190612793, + "learning_rate": 1.942313848985754e-08, + "loss": 0.1705, + "step": 15038 + }, + { + "epoch": 4.001862692921767, + "grad_norm": 0.29811742901802063, + "learning_rate": 1.9413137774060285e-08, + "loss": 0.1788, + "step": 15039 + }, + { + "epoch": 4.00212879191059, + "grad_norm": 0.48987501859664917, + "learning_rate": 1.9403139356764077e-08, + "loss": 0.1811, + "step": 15040 + }, + { + "epoch": 4.0023948908994145, + "grad_norm": 0.36866098642349243, + "learning_rate": 1.939314323825414e-08, + "loss": 0.1697, + "step": 15041 + }, + { + "epoch": 4.002660989888239, + "grad_norm": 0.2697522044181824, + "learning_rate": 1.9383149418815537e-08, + "loss": 0.1801, + "step": 15042 + }, + { + "epoch": 4.002927088877062, + "grad_norm": 0.3398462235927582, + "learning_rate": 1.937315789873336e-08, + "loss": 0.1832, + "step": 15043 + }, + { + "epoch": 4.003193187865886, + "grad_norm": 0.3610372245311737, + "learning_rate": 1.9363168678292506e-08, + "loss": 0.1776, + "step": 15044 + }, + { + "epoch": 4.00345928685471, + "grad_norm": 0.29484379291534424, + "learning_rate": 1.9353181757778005e-08, + "loss": 0.1742, + "step": 15045 + }, + { + "epoch": 4.003725385843534, + "grad_norm": 0.34093624353408813, + "learning_rate": 1.93431971374746e-08, + "loss": 0.1605, + "step": 15046 + }, + { + "epoch": 4.0039914848323575, + "grad_norm": 0.322342187166214, + "learning_rate": 1.933321481766712e-08, + "loss": 0.1773, + "step": 15047 + }, + { + "epoch": 4.004257583821182, + "grad_norm": 0.2728304862976074, + "learning_rate": 1.932323479864031e-08, + "loss": 0.166, + "step": 15048 + }, + { + "epoch": 4.004523682810006, + "grad_norm": 0.4154239892959595, + "learning_rate": 1.9313257080678758e-08, + "loss": 0.159, + "step": 15049 + }, + { + "epoch": 4.004789781798829, + "grad_norm": 0.31717056035995483, + "learning_rate": 1.9303281664067105e-08, + "loss": 0.1792, + "step": 15050 + }, + { + "epoch": 4.005055880787653, + "grad_norm": 0.34260451793670654, + "learning_rate": 1.9293308549089805e-08, + "loss": 0.1773, + "step": 15051 + }, + { + "epoch": 4.005321979776477, + "grad_norm": 0.44369393587112427, + "learning_rate": 1.9283337736031357e-08, + "loss": 0.1985, + "step": 15052 + }, + { + "epoch": 4.0055880787653, + "grad_norm": 0.42520585656166077, + "learning_rate": 1.927336922517614e-08, + "loss": 0.176, + "step": 15053 + }, + { + "epoch": 4.005854177754125, + "grad_norm": 0.2799547612667084, + "learning_rate": 1.926340301680851e-08, + "loss": 0.1783, + "step": 15054 + }, + { + "epoch": 4.006120276742949, + "grad_norm": 0.29843607544898987, + "learning_rate": 1.9253439111212655e-08, + "loss": 0.2049, + "step": 15055 + }, + { + "epoch": 4.006386375731772, + "grad_norm": 0.3584607243537903, + "learning_rate": 1.9243477508672834e-08, + "loss": 0.1871, + "step": 15056 + }, + { + "epoch": 4.006652474720596, + "grad_norm": 0.44033190608024597, + "learning_rate": 1.923351820947311e-08, + "loss": 0.1895, + "step": 15057 + }, + { + "epoch": 4.00691857370942, + "grad_norm": 0.41766491532325745, + "learning_rate": 1.9223561213897567e-08, + "loss": 0.1704, + "step": 15058 + }, + { + "epoch": 4.007184672698243, + "grad_norm": 0.43948039412498474, + "learning_rate": 1.921360652223021e-08, + "loss": 0.202, + "step": 15059 + }, + { + "epoch": 4.0074507716870675, + "grad_norm": 1.0214484930038452, + "learning_rate": 1.9203654134754988e-08, + "loss": 0.1777, + "step": 15060 + }, + { + "epoch": 4.007716870675892, + "grad_norm": 0.24353845417499542, + "learning_rate": 1.919370405175571e-08, + "loss": 0.1425, + "step": 15061 + }, + { + "epoch": 4.007982969664715, + "grad_norm": 0.27967390418052673, + "learning_rate": 1.9183756273516216e-08, + "loss": 0.1793, + "step": 15062 + }, + { + "epoch": 4.008249068653539, + "grad_norm": 0.34964725375175476, + "learning_rate": 1.91738108003202e-08, + "loss": 0.1695, + "step": 15063 + }, + { + "epoch": 4.008515167642363, + "grad_norm": 0.31373023986816406, + "learning_rate": 1.9163867632451347e-08, + "loss": 0.1668, + "step": 15064 + }, + { + "epoch": 4.008781266631186, + "grad_norm": 0.28082624077796936, + "learning_rate": 1.9153926770193285e-08, + "loss": 0.1659, + "step": 15065 + }, + { + "epoch": 4.0090473656200105, + "grad_norm": 0.2741636335849762, + "learning_rate": 1.9143988213829486e-08, + "loss": 0.1814, + "step": 15066 + }, + { + "epoch": 4.009313464608835, + "grad_norm": 0.2633020281791687, + "learning_rate": 1.913405196364346e-08, + "loss": 0.1758, + "step": 15067 + }, + { + "epoch": 4.009579563597659, + "grad_norm": 0.3267744183540344, + "learning_rate": 1.912411801991859e-08, + "loss": 0.1712, + "step": 15068 + }, + { + "epoch": 4.009845662586482, + "grad_norm": 0.4099908471107483, + "learning_rate": 1.9114186382938257e-08, + "loss": 0.1902, + "step": 15069 + }, + { + "epoch": 4.010111761575306, + "grad_norm": 0.35859042406082153, + "learning_rate": 1.9104257052985662e-08, + "loss": 0.1803, + "step": 15070 + }, + { + "epoch": 4.01037786056413, + "grad_norm": 0.3742746114730835, + "learning_rate": 1.9094330030344073e-08, + "loss": 0.1812, + "step": 15071 + }, + { + "epoch": 4.0106439595529535, + "grad_norm": 0.3532724380493164, + "learning_rate": 1.908440531529657e-08, + "loss": 0.1737, + "step": 15072 + }, + { + "epoch": 4.010910058541778, + "grad_norm": 0.2690945863723755, + "learning_rate": 1.9074482908126255e-08, + "loss": 0.1837, + "step": 15073 + }, + { + "epoch": 4.011176157530602, + "grad_norm": 0.261619508266449, + "learning_rate": 1.9064562809116148e-08, + "loss": 0.1528, + "step": 15074 + }, + { + "epoch": 4.011442256519425, + "grad_norm": 0.29498395323753357, + "learning_rate": 1.9054645018549188e-08, + "loss": 0.1715, + "step": 15075 + }, + { + "epoch": 4.011708355508249, + "grad_norm": 0.27901822328567505, + "learning_rate": 1.9044729536708225e-08, + "loss": 0.1575, + "step": 15076 + }, + { + "epoch": 4.011974454497073, + "grad_norm": 0.25801408290863037, + "learning_rate": 1.9034816363876115e-08, + "loss": 0.1748, + "step": 15077 + }, + { + "epoch": 4.0122405534858965, + "grad_norm": 0.2715569734573364, + "learning_rate": 1.9024905500335532e-08, + "loss": 0.1795, + "step": 15078 + }, + { + "epoch": 4.012506652474721, + "grad_norm": 0.26174649596214294, + "learning_rate": 1.9014996946369233e-08, + "loss": 0.1528, + "step": 15079 + }, + { + "epoch": 4.012772751463545, + "grad_norm": 0.3459305167198181, + "learning_rate": 1.9005090702259753e-08, + "loss": 0.183, + "step": 15080 + }, + { + "epoch": 4.013038850452368, + "grad_norm": 0.37520697712898254, + "learning_rate": 1.8995186768289683e-08, + "loss": 0.1685, + "step": 15081 + }, + { + "epoch": 4.013304949441192, + "grad_norm": 0.2631641924381256, + "learning_rate": 1.8985285144741492e-08, + "loss": 0.1671, + "step": 15082 + }, + { + "epoch": 4.013571048430016, + "grad_norm": 0.2692255973815918, + "learning_rate": 1.8975385831897593e-08, + "loss": 0.1506, + "step": 15083 + }, + { + "epoch": 4.0138371474188395, + "grad_norm": 0.3301083743572235, + "learning_rate": 1.8965488830040376e-08, + "loss": 0.1699, + "step": 15084 + }, + { + "epoch": 4.014103246407664, + "grad_norm": 0.34385451674461365, + "learning_rate": 1.8955594139452057e-08, + "loss": 0.1736, + "step": 15085 + }, + { + "epoch": 4.014369345396488, + "grad_norm": 0.381422758102417, + "learning_rate": 1.8945701760414912e-08, + "loss": 0.1694, + "step": 15086 + }, + { + "epoch": 4.014635444385311, + "grad_norm": 0.3332178592681885, + "learning_rate": 1.8935811693211034e-08, + "loss": 0.1726, + "step": 15087 + }, + { + "epoch": 4.014901543374135, + "grad_norm": 0.25922662019729614, + "learning_rate": 1.8925923938122545e-08, + "loss": 0.1739, + "step": 15088 + }, + { + "epoch": 4.015167642362959, + "grad_norm": 0.26926034688949585, + "learning_rate": 1.891603849543145e-08, + "loss": 0.1641, + "step": 15089 + }, + { + "epoch": 4.015433741351782, + "grad_norm": 0.2830990254878998, + "learning_rate": 1.8906155365419728e-08, + "loss": 0.1742, + "step": 15090 + }, + { + "epoch": 4.015699840340607, + "grad_norm": 0.3458974063396454, + "learning_rate": 1.889627454836923e-08, + "loss": 0.1823, + "step": 15091 + }, + { + "epoch": 4.015965939329431, + "grad_norm": 0.302436888217926, + "learning_rate": 1.8886396044561813e-08, + "loss": 0.1715, + "step": 15092 + }, + { + "epoch": 4.016232038318255, + "grad_norm": 0.4251839220523834, + "learning_rate": 1.887651985427918e-08, + "loss": 0.2104, + "step": 15093 + }, + { + "epoch": 4.016498137307078, + "grad_norm": 0.4043721556663513, + "learning_rate": 1.8866645977803087e-08, + "loss": 0.1827, + "step": 15094 + }, + { + "epoch": 4.016764236295902, + "grad_norm": 0.31607192754745483, + "learning_rate": 1.8856774415415066e-08, + "loss": 0.1904, + "step": 15095 + }, + { + "epoch": 4.017030335284726, + "grad_norm": 0.28990182280540466, + "learning_rate": 1.8846905167396775e-08, + "loss": 0.191, + "step": 15096 + }, + { + "epoch": 4.0172964342735495, + "grad_norm": 0.24960385262966156, + "learning_rate": 1.8837038234029645e-08, + "loss": 0.1608, + "step": 15097 + }, + { + "epoch": 4.017562533262374, + "grad_norm": 0.27618977427482605, + "learning_rate": 1.8827173615595137e-08, + "loss": 0.1776, + "step": 15098 + }, + { + "epoch": 4.017828632251198, + "grad_norm": 0.2571652829647064, + "learning_rate": 1.8817311312374563e-08, + "loss": 0.1688, + "step": 15099 + }, + { + "epoch": 4.018094731240021, + "grad_norm": 0.28587305545806885, + "learning_rate": 1.880745132464924e-08, + "loss": 0.1853, + "step": 15100 + }, + { + "epoch": 4.018360830228845, + "grad_norm": 0.28132569789886475, + "learning_rate": 1.8797593652700415e-08, + "loss": 0.1734, + "step": 15101 + }, + { + "epoch": 4.018626929217669, + "grad_norm": 0.3073999285697937, + "learning_rate": 1.878773829680922e-08, + "loss": 0.172, + "step": 15102 + }, + { + "epoch": 4.0188930282064925, + "grad_norm": 0.2773711085319519, + "learning_rate": 1.8777885257256755e-08, + "loss": 0.1609, + "step": 15103 + }, + { + "epoch": 4.019159127195317, + "grad_norm": 0.334543913602829, + "learning_rate": 1.876803453432405e-08, + "loss": 0.1847, + "step": 15104 + }, + { + "epoch": 4.019425226184141, + "grad_norm": 0.3175777792930603, + "learning_rate": 1.8758186128292098e-08, + "loss": 0.1642, + "step": 15105 + }, + { + "epoch": 4.019691325172964, + "grad_norm": 0.24262166023254395, + "learning_rate": 1.8748340039441735e-08, + "loss": 0.1604, + "step": 15106 + }, + { + "epoch": 4.019957424161788, + "grad_norm": 0.3284491002559662, + "learning_rate": 1.873849626805386e-08, + "loss": 0.168, + "step": 15107 + }, + { + "epoch": 4.020223523150612, + "grad_norm": 0.2821456789970398, + "learning_rate": 1.8728654814409183e-08, + "loss": 0.1665, + "step": 15108 + }, + { + "epoch": 4.0204896221394355, + "grad_norm": 0.2877674400806427, + "learning_rate": 1.8718815678788412e-08, + "loss": 0.1868, + "step": 15109 + }, + { + "epoch": 4.02075572112826, + "grad_norm": 0.2409440279006958, + "learning_rate": 1.870897886147219e-08, + "loss": 0.1565, + "step": 15110 + }, + { + "epoch": 4.021021820117084, + "grad_norm": 0.26105305552482605, + "learning_rate": 1.8699144362741116e-08, + "loss": 0.1711, + "step": 15111 + }, + { + "epoch": 4.021287919105907, + "grad_norm": 0.28148937225341797, + "learning_rate": 1.8689312182875617e-08, + "loss": 0.1837, + "step": 15112 + }, + { + "epoch": 4.021554018094731, + "grad_norm": 0.2947741150856018, + "learning_rate": 1.8679482322156203e-08, + "loss": 0.1691, + "step": 15113 + }, + { + "epoch": 4.021820117083555, + "grad_norm": 0.3571397662162781, + "learning_rate": 1.8669654780863177e-08, + "loss": 0.1627, + "step": 15114 + }, + { + "epoch": 4.0220862160723785, + "grad_norm": 0.3106311559677124, + "learning_rate": 1.8659829559276884e-08, + "loss": 0.1589, + "step": 15115 + }, + { + "epoch": 4.022352315061203, + "grad_norm": 0.3702261447906494, + "learning_rate": 1.865000665767753e-08, + "loss": 0.1759, + "step": 15116 + }, + { + "epoch": 4.022618414050027, + "grad_norm": 0.36102980375289917, + "learning_rate": 1.8640186076345286e-08, + "loss": 0.1732, + "step": 15117 + }, + { + "epoch": 4.022884513038851, + "grad_norm": 0.30779263377189636, + "learning_rate": 1.8630367815560277e-08, + "loss": 0.1908, + "step": 15118 + }, + { + "epoch": 4.023150612027674, + "grad_norm": 0.2720293402671814, + "learning_rate": 1.862055187560252e-08, + "loss": 0.165, + "step": 15119 + }, + { + "epoch": 4.023416711016498, + "grad_norm": 0.30412372946739197, + "learning_rate": 1.8610738256752024e-08, + "loss": 0.175, + "step": 15120 + }, + { + "epoch": 4.023682810005322, + "grad_norm": 0.3076395094394684, + "learning_rate": 1.8600926959288644e-08, + "loss": 0.179, + "step": 15121 + }, + { + "epoch": 4.023948908994146, + "grad_norm": 0.2647908329963684, + "learning_rate": 1.8591117983492256e-08, + "loss": 0.1727, + "step": 15122 + }, + { + "epoch": 4.02421500798297, + "grad_norm": 0.27156132459640503, + "learning_rate": 1.858131132964259e-08, + "loss": 0.1619, + "step": 15123 + }, + { + "epoch": 4.024481106971794, + "grad_norm": 0.3920149803161621, + "learning_rate": 1.857150699801937e-08, + "loss": 0.2003, + "step": 15124 + }, + { + "epoch": 4.024747205960617, + "grad_norm": 0.3019268810749054, + "learning_rate": 1.856170498890225e-08, + "loss": 0.1725, + "step": 15125 + }, + { + "epoch": 4.025013304949441, + "grad_norm": 0.6172129511833191, + "learning_rate": 1.8551905302570815e-08, + "loss": 0.1961, + "step": 15126 + }, + { + "epoch": 4.025279403938265, + "grad_norm": 0.2866799831390381, + "learning_rate": 1.854210793930453e-08, + "loss": 0.1827, + "step": 15127 + }, + { + "epoch": 4.025545502927089, + "grad_norm": 0.2936598062515259, + "learning_rate": 1.8532312899382873e-08, + "loss": 0.1763, + "step": 15128 + }, + { + "epoch": 4.025811601915913, + "grad_norm": 0.413662314414978, + "learning_rate": 1.8522520183085188e-08, + "loss": 0.1783, + "step": 15129 + }, + { + "epoch": 4.026077700904737, + "grad_norm": 0.2745124399662018, + "learning_rate": 1.851272979069082e-08, + "loss": 0.1621, + "step": 15130 + }, + { + "epoch": 4.02634379989356, + "grad_norm": 0.26637551188468933, + "learning_rate": 1.850294172247896e-08, + "loss": 0.1669, + "step": 15131 + }, + { + "epoch": 4.026609898882384, + "grad_norm": 0.28232908248901367, + "learning_rate": 1.8493155978728815e-08, + "loss": 0.1672, + "step": 15132 + }, + { + "epoch": 4.026875997871208, + "grad_norm": 0.3106384873390198, + "learning_rate": 1.848337255971949e-08, + "loss": 0.1576, + "step": 15133 + }, + { + "epoch": 4.0271420968600316, + "grad_norm": 0.2822505831718445, + "learning_rate": 1.8473591465730054e-08, + "loss": 0.168, + "step": 15134 + }, + { + "epoch": 4.027408195848856, + "grad_norm": 0.3267376720905304, + "learning_rate": 1.846381269703943e-08, + "loss": 0.17, + "step": 15135 + }, + { + "epoch": 4.02767429483768, + "grad_norm": 0.2690924108028412, + "learning_rate": 1.8454036253926587e-08, + "loss": 0.1752, + "step": 15136 + }, + { + "epoch": 4.027940393826503, + "grad_norm": 0.27325040102005005, + "learning_rate": 1.8444262136670308e-08, + "loss": 0.1936, + "step": 15137 + }, + { + "epoch": 4.028206492815327, + "grad_norm": 0.4693801999092102, + "learning_rate": 1.8434490345549412e-08, + "loss": 0.208, + "step": 15138 + }, + { + "epoch": 4.028472591804151, + "grad_norm": 0.26869451999664307, + "learning_rate": 1.8424720880842593e-08, + "loss": 0.1698, + "step": 15139 + }, + { + "epoch": 4.028738690792975, + "grad_norm": 0.25178930163383484, + "learning_rate": 1.841495374282851e-08, + "loss": 0.1561, + "step": 15140 + }, + { + "epoch": 4.029004789781799, + "grad_norm": 0.3731285035610199, + "learning_rate": 1.8405188931785752e-08, + "loss": 0.1798, + "step": 15141 + }, + { + "epoch": 4.029270888770623, + "grad_norm": 0.25892525911331177, + "learning_rate": 1.83954264479928e-08, + "loss": 0.1583, + "step": 15142 + }, + { + "epoch": 4.029536987759447, + "grad_norm": 0.4476707875728607, + "learning_rate": 1.838566629172813e-08, + "loss": 0.2191, + "step": 15143 + }, + { + "epoch": 4.02980308674827, + "grad_norm": 0.3614056408405304, + "learning_rate": 1.8375908463270085e-08, + "loss": 0.1849, + "step": 15144 + }, + { + "epoch": 4.030069185737094, + "grad_norm": 0.3925582468509674, + "learning_rate": 1.8366152962897e-08, + "loss": 0.1809, + "step": 15145 + }, + { + "epoch": 4.030335284725918, + "grad_norm": 0.3437640368938446, + "learning_rate": 1.835639979088711e-08, + "loss": 0.1705, + "step": 15146 + }, + { + "epoch": 4.030601383714742, + "grad_norm": 0.28936895728111267, + "learning_rate": 1.834664894751864e-08, + "loss": 0.1734, + "step": 15147 + }, + { + "epoch": 4.030867482703566, + "grad_norm": 0.31652048230171204, + "learning_rate": 1.8336900433069647e-08, + "loss": 0.1635, + "step": 15148 + }, + { + "epoch": 4.03113358169239, + "grad_norm": 0.40901267528533936, + "learning_rate": 1.832715424781822e-08, + "loss": 0.1913, + "step": 15149 + }, + { + "epoch": 4.031399680681213, + "grad_norm": 0.2890597879886627, + "learning_rate": 1.8317410392042297e-08, + "loss": 0.1677, + "step": 15150 + }, + { + "epoch": 4.031665779670037, + "grad_norm": 0.27829474210739136, + "learning_rate": 1.8307668866019844e-08, + "loss": 0.1735, + "step": 15151 + }, + { + "epoch": 4.031931878658861, + "grad_norm": 0.2698100209236145, + "learning_rate": 1.8297929670028654e-08, + "loss": 0.1609, + "step": 15152 + }, + { + "epoch": 4.032197977647685, + "grad_norm": 0.3669854998588562, + "learning_rate": 1.8288192804346536e-08, + "loss": 0.1713, + "step": 15153 + }, + { + "epoch": 4.032464076636509, + "grad_norm": 0.3094053268432617, + "learning_rate": 1.8278458269251207e-08, + "loss": 0.183, + "step": 15154 + }, + { + "epoch": 4.032730175625333, + "grad_norm": 0.37767770886421204, + "learning_rate": 1.8268726065020312e-08, + "loss": 0.1909, + "step": 15155 + }, + { + "epoch": 4.032996274614156, + "grad_norm": 0.37585920095443726, + "learning_rate": 1.8258996191931462e-08, + "loss": 0.205, + "step": 15156 + }, + { + "epoch": 4.03326237360298, + "grad_norm": 0.39924055337905884, + "learning_rate": 1.8249268650262128e-08, + "loss": 0.1823, + "step": 15157 + }, + { + "epoch": 4.033528472591804, + "grad_norm": 0.26119592785835266, + "learning_rate": 1.8239543440289794e-08, + "loss": 0.1594, + "step": 15158 + }, + { + "epoch": 4.033794571580628, + "grad_norm": 0.295324444770813, + "learning_rate": 1.8229820562291808e-08, + "loss": 0.1885, + "step": 15159 + }, + { + "epoch": 4.034060670569452, + "grad_norm": 0.2676769495010376, + "learning_rate": 1.8220100016545504e-08, + "loss": 0.1746, + "step": 15160 + }, + { + "epoch": 4.034326769558276, + "grad_norm": 0.323586106300354, + "learning_rate": 1.8210381803328123e-08, + "loss": 0.1793, + "step": 15161 + }, + { + "epoch": 4.034592868547099, + "grad_norm": 0.35408392548561096, + "learning_rate": 1.820066592291689e-08, + "loss": 0.181, + "step": 15162 + }, + { + "epoch": 4.034858967535923, + "grad_norm": 0.32952162623405457, + "learning_rate": 1.8190952375588863e-08, + "loss": 0.1728, + "step": 15163 + }, + { + "epoch": 4.035125066524747, + "grad_norm": 0.4097862243652344, + "learning_rate": 1.8181241161621142e-08, + "loss": 0.1694, + "step": 15164 + }, + { + "epoch": 4.0353911655135715, + "grad_norm": 0.3904241919517517, + "learning_rate": 1.8171532281290657e-08, + "loss": 0.1996, + "step": 15165 + }, + { + "epoch": 4.035657264502395, + "grad_norm": 0.34344717860221863, + "learning_rate": 1.816182573487439e-08, + "loss": 0.1953, + "step": 15166 + }, + { + "epoch": 4.035923363491219, + "grad_norm": 0.3630867004394531, + "learning_rate": 1.8152121522649122e-08, + "loss": 0.1775, + "step": 15167 + }, + { + "epoch": 4.036189462480043, + "grad_norm": 0.26107099652290344, + "learning_rate": 1.8142419644891683e-08, + "loss": 0.1723, + "step": 15168 + }, + { + "epoch": 4.036455561468866, + "grad_norm": 0.26828035712242126, + "learning_rate": 1.813272010187876e-08, + "loss": 0.1606, + "step": 15169 + }, + { + "epoch": 4.03672166045769, + "grad_norm": 0.4168696105480194, + "learning_rate": 1.8123022893887064e-08, + "loss": 0.1788, + "step": 15170 + }, + { + "epoch": 4.0369877594465144, + "grad_norm": 0.28919336199760437, + "learning_rate": 1.811332802119311e-08, + "loss": 0.1875, + "step": 15171 + }, + { + "epoch": 4.037253858435338, + "grad_norm": 0.3778829574584961, + "learning_rate": 1.8103635484073454e-08, + "loss": 0.1934, + "step": 15172 + }, + { + "epoch": 4.037519957424162, + "grad_norm": 0.325214147567749, + "learning_rate": 1.8093945282804524e-08, + "loss": 0.1811, + "step": 15173 + }, + { + "epoch": 4.037786056412986, + "grad_norm": 0.29401832818984985, + "learning_rate": 1.8084257417662706e-08, + "loss": 0.1895, + "step": 15174 + }, + { + "epoch": 4.038052155401809, + "grad_norm": 0.3863675892353058, + "learning_rate": 1.807457188892434e-08, + "loss": 0.187, + "step": 15175 + }, + { + "epoch": 4.038318254390633, + "grad_norm": 0.3476179838180542, + "learning_rate": 1.8064888696865653e-08, + "loss": 0.186, + "step": 15176 + }, + { + "epoch": 4.038584353379457, + "grad_norm": 0.2612091600894928, + "learning_rate": 1.8055207841762865e-08, + "loss": 0.1527, + "step": 15177 + }, + { + "epoch": 4.038850452368281, + "grad_norm": 0.34693604707717896, + "learning_rate": 1.804552932389205e-08, + "loss": 0.1848, + "step": 15178 + }, + { + "epoch": 4.039116551357105, + "grad_norm": 0.39916783571243286, + "learning_rate": 1.80358531435293e-08, + "loss": 0.1911, + "step": 15179 + }, + { + "epoch": 4.039382650345929, + "grad_norm": 0.2827881872653961, + "learning_rate": 1.802617930095055e-08, + "loss": 0.1845, + "step": 15180 + }, + { + "epoch": 4.039648749334752, + "grad_norm": 0.28380087018013, + "learning_rate": 1.8016507796431778e-08, + "loss": 0.1762, + "step": 15181 + }, + { + "epoch": 4.039914848323576, + "grad_norm": 0.29437512159347534, + "learning_rate": 1.8006838630248743e-08, + "loss": 0.1751, + "step": 15182 + }, + { + "epoch": 4.0401809473124, + "grad_norm": 0.40534257888793945, + "learning_rate": 1.7997171802677346e-08, + "loss": 0.1894, + "step": 15183 + }, + { + "epoch": 4.040447046301224, + "grad_norm": 0.32278910279273987, + "learning_rate": 1.798750731399322e-08, + "loss": 0.1653, + "step": 15184 + }, + { + "epoch": 4.040713145290048, + "grad_norm": 0.30693936347961426, + "learning_rate": 1.797784516447207e-08, + "loss": 0.1624, + "step": 15185 + }, + { + "epoch": 4.040979244278872, + "grad_norm": 0.26834309101104736, + "learning_rate": 1.796818535438942e-08, + "loss": 0.1694, + "step": 15186 + }, + { + "epoch": 4.041245343267696, + "grad_norm": 0.3681272566318512, + "learning_rate": 1.7958527884020858e-08, + "loss": 0.1769, + "step": 15187 + }, + { + "epoch": 4.041511442256519, + "grad_norm": 0.33661720156669617, + "learning_rate": 1.7948872753641754e-08, + "loss": 0.1804, + "step": 15188 + }, + { + "epoch": 4.041777541245343, + "grad_norm": 0.32327091693878174, + "learning_rate": 1.7939219963527552e-08, + "loss": 0.1823, + "step": 15189 + }, + { + "epoch": 4.0420436402341675, + "grad_norm": 0.32106760144233704, + "learning_rate": 1.7929569513953547e-08, + "loss": 0.1712, + "step": 15190 + }, + { + "epoch": 4.042309739222991, + "grad_norm": 0.2818031907081604, + "learning_rate": 1.7919921405195014e-08, + "loss": 0.1744, + "step": 15191 + }, + { + "epoch": 4.042575838211815, + "grad_norm": 0.2697335183620453, + "learning_rate": 1.7910275637527095e-08, + "loss": 0.1699, + "step": 15192 + }, + { + "epoch": 4.042841937200639, + "grad_norm": 0.4097647964954376, + "learning_rate": 1.790063221122493e-08, + "loss": 0.1846, + "step": 15193 + }, + { + "epoch": 4.043108036189462, + "grad_norm": 0.2945009171962738, + "learning_rate": 1.7890991126563592e-08, + "loss": 0.1769, + "step": 15194 + }, + { + "epoch": 4.043374135178286, + "grad_norm": 0.3034147322177887, + "learning_rate": 1.7881352383818014e-08, + "loss": 0.1689, + "step": 15195 + }, + { + "epoch": 4.0436402341671105, + "grad_norm": 0.368513286113739, + "learning_rate": 1.7871715983263147e-08, + "loss": 0.1915, + "step": 15196 + }, + { + "epoch": 4.043906333155934, + "grad_norm": 0.3808552026748657, + "learning_rate": 1.7862081925173822e-08, + "loss": 0.1813, + "step": 15197 + }, + { + "epoch": 4.044172432144758, + "grad_norm": 0.360341876745224, + "learning_rate": 1.7852450209824865e-08, + "loss": 0.176, + "step": 15198 + }, + { + "epoch": 4.044438531133582, + "grad_norm": 0.2970253527164459, + "learning_rate": 1.7842820837490934e-08, + "loss": 0.1793, + "step": 15199 + }, + { + "epoch": 4.044704630122405, + "grad_norm": 0.3207224905490875, + "learning_rate": 1.7833193808446722e-08, + "loss": 0.1646, + "step": 15200 + }, + { + "epoch": 4.044970729111229, + "grad_norm": 0.302494078874588, + "learning_rate": 1.7823569122966774e-08, + "loss": 0.1695, + "step": 15201 + }, + { + "epoch": 4.0452368281000535, + "grad_norm": 0.2774604260921478, + "learning_rate": 1.7813946781325663e-08, + "loss": 0.1644, + "step": 15202 + }, + { + "epoch": 4.045502927088877, + "grad_norm": 0.31231430172920227, + "learning_rate": 1.7804326783797752e-08, + "loss": 0.1888, + "step": 15203 + }, + { + "epoch": 4.045769026077701, + "grad_norm": 0.25434795022010803, + "learning_rate": 1.779470913065748e-08, + "loss": 0.1553, + "step": 15204 + }, + { + "epoch": 4.046035125066525, + "grad_norm": 0.37451425194740295, + "learning_rate": 1.7785093822179164e-08, + "loss": 0.1867, + "step": 15205 + }, + { + "epoch": 4.046301224055348, + "grad_norm": 0.39520490169525146, + "learning_rate": 1.777548085863705e-08, + "loss": 0.2024, + "step": 15206 + }, + { + "epoch": 4.046567323044172, + "grad_norm": 0.27644482254981995, + "learning_rate": 1.7765870240305292e-08, + "loss": 0.1771, + "step": 15207 + }, + { + "epoch": 4.0468334220329965, + "grad_norm": 0.3504427969455719, + "learning_rate": 1.7756261967458054e-08, + "loss": 0.1781, + "step": 15208 + }, + { + "epoch": 4.04709952102182, + "grad_norm": 0.4449678957462311, + "learning_rate": 1.7746656040369312e-08, + "loss": 0.1584, + "step": 15209 + }, + { + "epoch": 4.047365620010644, + "grad_norm": 0.3387675881385803, + "learning_rate": 1.7737052459313097e-08, + "loss": 0.1835, + "step": 15210 + }, + { + "epoch": 4.047631718999468, + "grad_norm": 0.3643919825553894, + "learning_rate": 1.7727451224563306e-08, + "loss": 0.1854, + "step": 15211 + }, + { + "epoch": 4.047897817988292, + "grad_norm": 0.3558456301689148, + "learning_rate": 1.77178523363938e-08, + "loss": 0.1765, + "step": 15212 + }, + { + "epoch": 4.048163916977115, + "grad_norm": 0.45422473549842834, + "learning_rate": 1.7708255795078362e-08, + "loss": 0.1901, + "step": 15213 + }, + { + "epoch": 4.048430015965939, + "grad_norm": 0.34194767475128174, + "learning_rate": 1.7698661600890685e-08, + "loss": 0.1841, + "step": 15214 + }, + { + "epoch": 4.048696114954764, + "grad_norm": 0.3399520218372345, + "learning_rate": 1.7689069754104436e-08, + "loss": 0.1705, + "step": 15215 + }, + { + "epoch": 4.048962213943587, + "grad_norm": 0.2907472252845764, + "learning_rate": 1.767948025499316e-08, + "loss": 0.1695, + "step": 15216 + }, + { + "epoch": 4.049228312932411, + "grad_norm": 0.26736536622047424, + "learning_rate": 1.7669893103830425e-08, + "loss": 0.1605, + "step": 15217 + }, + { + "epoch": 4.049494411921235, + "grad_norm": 0.2591927945613861, + "learning_rate": 1.7660308300889604e-08, + "loss": 0.1638, + "step": 15218 + }, + { + "epoch": 4.049760510910058, + "grad_norm": 0.2850167453289032, + "learning_rate": 1.7650725846444116e-08, + "loss": 0.1746, + "step": 15219 + }, + { + "epoch": 4.050026609898882, + "grad_norm": 0.40417101979255676, + "learning_rate": 1.7641145740767273e-08, + "loss": 0.1899, + "step": 15220 + }, + { + "epoch": 4.0502927088877065, + "grad_norm": 0.31460270285606384, + "learning_rate": 1.7631567984132333e-08, + "loss": 0.1855, + "step": 15221 + }, + { + "epoch": 4.05055880787653, + "grad_norm": 0.26800721883773804, + "learning_rate": 1.7621992576812428e-08, + "loss": 0.1537, + "step": 15222 + }, + { + "epoch": 4.050824906865354, + "grad_norm": 0.2933259904384613, + "learning_rate": 1.7612419519080733e-08, + "loss": 0.1638, + "step": 15223 + }, + { + "epoch": 4.051091005854178, + "grad_norm": 0.4040476679801941, + "learning_rate": 1.7602848811210213e-08, + "loss": 0.1847, + "step": 15224 + }, + { + "epoch": 4.051357104843001, + "grad_norm": 0.25253939628601074, + "learning_rate": 1.7593280453473892e-08, + "loss": 0.166, + "step": 15225 + }, + { + "epoch": 4.051623203831825, + "grad_norm": 0.29701724648475647, + "learning_rate": 1.758371444614467e-08, + "loss": 0.1777, + "step": 15226 + }, + { + "epoch": 4.0518893028206495, + "grad_norm": 0.27575936913490295, + "learning_rate": 1.7574150789495413e-08, + "loss": 0.1854, + "step": 15227 + }, + { + "epoch": 4.052155401809473, + "grad_norm": 0.3967496156692505, + "learning_rate": 1.7564589483798852e-08, + "loss": 0.1804, + "step": 15228 + }, + { + "epoch": 4.052421500798297, + "grad_norm": 0.2780114412307739, + "learning_rate": 1.7555030529327707e-08, + "loss": 0.1753, + "step": 15229 + }, + { + "epoch": 4.052687599787121, + "grad_norm": 0.29517310857772827, + "learning_rate": 1.754547392635466e-08, + "loss": 0.1821, + "step": 15230 + }, + { + "epoch": 4.052953698775944, + "grad_norm": 0.28378283977508545, + "learning_rate": 1.7535919675152234e-08, + "loss": 0.1862, + "step": 15231 + }, + { + "epoch": 4.053219797764768, + "grad_norm": 0.3249664008617401, + "learning_rate": 1.7526367775992968e-08, + "loss": 0.1602, + "step": 15232 + }, + { + "epoch": 4.0534858967535925, + "grad_norm": 0.2831094264984131, + "learning_rate": 1.751681822914923e-08, + "loss": 0.1672, + "step": 15233 + }, + { + "epoch": 4.053751995742417, + "grad_norm": 0.33194297552108765, + "learning_rate": 1.7507271034893512e-08, + "loss": 0.1661, + "step": 15234 + }, + { + "epoch": 4.05401809473124, + "grad_norm": 0.32617151737213135, + "learning_rate": 1.749772619349803e-08, + "loss": 0.1916, + "step": 15235 + }, + { + "epoch": 4.054284193720064, + "grad_norm": 0.2971539795398712, + "learning_rate": 1.7488183705235082e-08, + "loss": 0.181, + "step": 15236 + }, + { + "epoch": 4.054550292708888, + "grad_norm": 0.29228469729423523, + "learning_rate": 1.7478643570376772e-08, + "loss": 0.1658, + "step": 15237 + }, + { + "epoch": 4.054816391697711, + "grad_norm": 0.26774585247039795, + "learning_rate": 1.746910578919526e-08, + "loss": 0.1654, + "step": 15238 + }, + { + "epoch": 4.0550824906865355, + "grad_norm": 0.23351244628429413, + "learning_rate": 1.745957036196255e-08, + "loss": 0.1441, + "step": 15239 + }, + { + "epoch": 4.05534858967536, + "grad_norm": 0.3094632923603058, + "learning_rate": 1.7450037288950615e-08, + "loss": 0.1579, + "step": 15240 + }, + { + "epoch": 4.055614688664183, + "grad_norm": 0.27487683296203613, + "learning_rate": 1.744050657043137e-08, + "loss": 0.1785, + "step": 15241 + }, + { + "epoch": 4.055880787653007, + "grad_norm": 0.2940203547477722, + "learning_rate": 1.743097820667666e-08, + "loss": 0.1575, + "step": 15242 + }, + { + "epoch": 4.056146886641831, + "grad_norm": 0.4703734219074249, + "learning_rate": 1.742145219795823e-08, + "loss": 0.1911, + "step": 15243 + }, + { + "epoch": 4.056412985630654, + "grad_norm": 0.36681774258613586, + "learning_rate": 1.7411928544547805e-08, + "loss": 0.1715, + "step": 15244 + }, + { + "epoch": 4.0566790846194785, + "grad_norm": 0.2932898998260498, + "learning_rate": 1.7402407246716987e-08, + "loss": 0.1841, + "step": 15245 + }, + { + "epoch": 4.056945183608303, + "grad_norm": 0.3831078112125397, + "learning_rate": 1.7392888304737384e-08, + "loss": 0.1723, + "step": 15246 + }, + { + "epoch": 4.057211282597126, + "grad_norm": 0.3089294731616974, + "learning_rate": 1.7383371718880423e-08, + "loss": 0.1673, + "step": 15247 + }, + { + "epoch": 4.05747738158595, + "grad_norm": 0.2769514322280884, + "learning_rate": 1.737385748941761e-08, + "loss": 0.1669, + "step": 15248 + }, + { + "epoch": 4.057743480574774, + "grad_norm": 0.3610583245754242, + "learning_rate": 1.7364345616620312e-08, + "loss": 0.1916, + "step": 15249 + }, + { + "epoch": 4.058009579563597, + "grad_norm": 0.30189192295074463, + "learning_rate": 1.7354836100759773e-08, + "loss": 0.176, + "step": 15250 + }, + { + "epoch": 4.058275678552421, + "grad_norm": 0.5302133560180664, + "learning_rate": 1.7345328942107273e-08, + "loss": 0.2052, + "step": 15251 + }, + { + "epoch": 4.058541777541246, + "grad_norm": 0.4050939977169037, + "learning_rate": 1.733582414093393e-08, + "loss": 0.1794, + "step": 15252 + }, + { + "epoch": 4.058807876530069, + "grad_norm": 0.33257442712783813, + "learning_rate": 1.7326321697510892e-08, + "loss": 0.1898, + "step": 15253 + }, + { + "epoch": 4.059073975518893, + "grad_norm": 0.27248480916023254, + "learning_rate": 1.7316821612109135e-08, + "loss": 0.1708, + "step": 15254 + }, + { + "epoch": 4.059340074507717, + "grad_norm": 0.3064803183078766, + "learning_rate": 1.730732388499965e-08, + "loss": 0.1789, + "step": 15255 + }, + { + "epoch": 4.05960617349654, + "grad_norm": 0.6832014918327332, + "learning_rate": 1.7297828516453328e-08, + "loss": 0.1917, + "step": 15256 + }, + { + "epoch": 4.059872272485364, + "grad_norm": 0.28535500168800354, + "learning_rate": 1.7288335506741014e-08, + "loss": 0.1734, + "step": 15257 + }, + { + "epoch": 4.0601383714741885, + "grad_norm": 0.27759850025177, + "learning_rate": 1.7278844856133425e-08, + "loss": 0.1776, + "step": 15258 + }, + { + "epoch": 4.060404470463013, + "grad_norm": 0.26593682169914246, + "learning_rate": 1.7269356564901306e-08, + "loss": 0.1593, + "step": 15259 + }, + { + "epoch": 4.060670569451836, + "grad_norm": 0.25995883345603943, + "learning_rate": 1.725987063331522e-08, + "loss": 0.175, + "step": 15260 + }, + { + "epoch": 4.06093666844066, + "grad_norm": 0.43572038412094116, + "learning_rate": 1.7250387061645777e-08, + "loss": 0.1664, + "step": 15261 + }, + { + "epoch": 4.061202767429484, + "grad_norm": 0.2815394997596741, + "learning_rate": 1.7240905850163434e-08, + "loss": 0.1738, + "step": 15262 + }, + { + "epoch": 4.061468866418307, + "grad_norm": 0.4174916744232178, + "learning_rate": 1.7231426999138664e-08, + "loss": 0.1654, + "step": 15263 + }, + { + "epoch": 4.0617349654071315, + "grad_norm": 0.31472402811050415, + "learning_rate": 1.7221950508841766e-08, + "loss": 0.1602, + "step": 15264 + }, + { + "epoch": 4.062001064395956, + "grad_norm": 0.36917123198509216, + "learning_rate": 1.721247637954305e-08, + "loss": 0.1901, + "step": 15265 + }, + { + "epoch": 4.062267163384779, + "grad_norm": 0.2570047974586487, + "learning_rate": 1.7203004611512763e-08, + "loss": 0.1661, + "step": 15266 + }, + { + "epoch": 4.062533262373603, + "grad_norm": 0.4470002353191376, + "learning_rate": 1.7193535205021015e-08, + "loss": 0.1814, + "step": 15267 + }, + { + "epoch": 4.062799361362427, + "grad_norm": 0.26293617486953735, + "learning_rate": 1.718406816033794e-08, + "loss": 0.1542, + "step": 15268 + }, + { + "epoch": 4.06306546035125, + "grad_norm": 0.2842250466346741, + "learning_rate": 1.7174603477733497e-08, + "loss": 0.1865, + "step": 15269 + }, + { + "epoch": 4.0633315593400745, + "grad_norm": 0.28115004301071167, + "learning_rate": 1.716514115747768e-08, + "loss": 0.1745, + "step": 15270 + }, + { + "epoch": 4.063597658328899, + "grad_norm": 0.3394383192062378, + "learning_rate": 1.7155681199840367e-08, + "loss": 0.1902, + "step": 15271 + }, + { + "epoch": 4.063863757317722, + "grad_norm": 0.2651754319667816, + "learning_rate": 1.714622360509139e-08, + "loss": 0.1715, + "step": 15272 + }, + { + "epoch": 4.064129856306546, + "grad_norm": 0.308784544467926, + "learning_rate": 1.7136768373500464e-08, + "loss": 0.2003, + "step": 15273 + }, + { + "epoch": 4.06439595529537, + "grad_norm": 0.25798022747039795, + "learning_rate": 1.7127315505337315e-08, + "loss": 0.1517, + "step": 15274 + }, + { + "epoch": 4.064662054284193, + "grad_norm": 0.2721714675426483, + "learning_rate": 1.7117865000871502e-08, + "loss": 0.1737, + "step": 15275 + }, + { + "epoch": 4.0649281532730175, + "grad_norm": 0.2502434551715851, + "learning_rate": 1.710841686037261e-08, + "loss": 0.166, + "step": 15276 + }, + { + "epoch": 4.065194252261842, + "grad_norm": 0.2758135199546814, + "learning_rate": 1.7098971084110114e-08, + "loss": 0.157, + "step": 15277 + }, + { + "epoch": 4.065460351250665, + "grad_norm": 0.30403056740760803, + "learning_rate": 1.7089527672353443e-08, + "loss": 0.1956, + "step": 15278 + }, + { + "epoch": 4.065726450239489, + "grad_norm": 0.26559922099113464, + "learning_rate": 1.7080086625371915e-08, + "loss": 0.1771, + "step": 15279 + }, + { + "epoch": 4.065992549228313, + "grad_norm": 0.2910149097442627, + "learning_rate": 1.7070647943434824e-08, + "loss": 0.1839, + "step": 15280 + }, + { + "epoch": 4.066258648217136, + "grad_norm": 0.3470465838909149, + "learning_rate": 1.7061211626811366e-08, + "loss": 0.1667, + "step": 15281 + }, + { + "epoch": 4.0665247472059605, + "grad_norm": 0.2721264660358429, + "learning_rate": 1.7051777675770705e-08, + "loss": 0.179, + "step": 15282 + }, + { + "epoch": 4.066790846194785, + "grad_norm": 0.2843848168849945, + "learning_rate": 1.704234609058188e-08, + "loss": 0.1786, + "step": 15283 + }, + { + "epoch": 4.067056945183609, + "grad_norm": 0.2592637538909912, + "learning_rate": 1.7032916871513904e-08, + "loss": 0.1715, + "step": 15284 + }, + { + "epoch": 4.067323044172432, + "grad_norm": 0.27849483489990234, + "learning_rate": 1.7023490018835783e-08, + "loss": 0.1767, + "step": 15285 + }, + { + "epoch": 4.067589143161256, + "grad_norm": 0.2695786952972412, + "learning_rate": 1.7014065532816325e-08, + "loss": 0.1698, + "step": 15286 + }, + { + "epoch": 4.06785524215008, + "grad_norm": 0.28938424587249756, + "learning_rate": 1.7004643413724364e-08, + "loss": 0.1745, + "step": 15287 + }, + { + "epoch": 4.0681213411389034, + "grad_norm": 0.3615025579929352, + "learning_rate": 1.6995223661828607e-08, + "loss": 0.1781, + "step": 15288 + }, + { + "epoch": 4.068387440127728, + "grad_norm": 0.34321436285972595, + "learning_rate": 1.6985806277397773e-08, + "loss": 0.1633, + "step": 15289 + }, + { + "epoch": 4.068653539116552, + "grad_norm": 0.3251943290233612, + "learning_rate": 1.6976391260700417e-08, + "loss": 0.1913, + "step": 15290 + }, + { + "epoch": 4.068919638105375, + "grad_norm": 0.373925119638443, + "learning_rate": 1.696697861200509e-08, + "loss": 0.1792, + "step": 15291 + }, + { + "epoch": 4.069185737094199, + "grad_norm": 0.25867584347724915, + "learning_rate": 1.6957568331580262e-08, + "loss": 0.1783, + "step": 15292 + }, + { + "epoch": 4.069451836083023, + "grad_norm": 0.576992928981781, + "learning_rate": 1.694816041969436e-08, + "loss": 0.2006, + "step": 15293 + }, + { + "epoch": 4.069717935071846, + "grad_norm": 0.2643241584300995, + "learning_rate": 1.6938754876615667e-08, + "loss": 0.1682, + "step": 15294 + }, + { + "epoch": 4.0699840340606706, + "grad_norm": 0.3564181625843048, + "learning_rate": 1.6929351702612495e-08, + "loss": 0.1985, + "step": 15295 + }, + { + "epoch": 4.070250133049495, + "grad_norm": 0.285159707069397, + "learning_rate": 1.6919950897953007e-08, + "loss": 0.1692, + "step": 15296 + }, + { + "epoch": 4.070516232038318, + "grad_norm": 0.31316450238227844, + "learning_rate": 1.691055246290536e-08, + "loss": 0.1688, + "step": 15297 + }, + { + "epoch": 4.070782331027142, + "grad_norm": 0.2868122458457947, + "learning_rate": 1.690115639773756e-08, + "loss": 0.174, + "step": 15298 + }, + { + "epoch": 4.071048430015966, + "grad_norm": 0.2714076638221741, + "learning_rate": 1.6891762702717694e-08, + "loss": 0.167, + "step": 15299 + }, + { + "epoch": 4.071314529004789, + "grad_norm": 0.28418678045272827, + "learning_rate": 1.6882371378113603e-08, + "loss": 0.1751, + "step": 15300 + }, + { + "epoch": 4.0715806279936135, + "grad_norm": 0.2893841862678528, + "learning_rate": 1.6872982424193193e-08, + "loss": 0.1783, + "step": 15301 + }, + { + "epoch": 4.071846726982438, + "grad_norm": 0.31014272570610046, + "learning_rate": 1.6863595841224278e-08, + "loss": 0.1868, + "step": 15302 + }, + { + "epoch": 4.072112825971261, + "grad_norm": 0.28184065222740173, + "learning_rate": 1.6854211629474514e-08, + "loss": 0.1827, + "step": 15303 + }, + { + "epoch": 4.072378924960085, + "grad_norm": 0.2947678565979004, + "learning_rate": 1.6844829789211624e-08, + "loss": 0.1776, + "step": 15304 + }, + { + "epoch": 4.072645023948909, + "grad_norm": 0.2715086042881012, + "learning_rate": 1.6835450320703147e-08, + "loss": 0.1817, + "step": 15305 + }, + { + "epoch": 4.072911122937732, + "grad_norm": 0.27323994040489197, + "learning_rate": 1.682607322421662e-08, + "loss": 0.1857, + "step": 15306 + }, + { + "epoch": 4.0731772219265565, + "grad_norm": 0.24870961904525757, + "learning_rate": 1.68166985000195e-08, + "loss": 0.1606, + "step": 15307 + }, + { + "epoch": 4.073443320915381, + "grad_norm": 0.3099018931388855, + "learning_rate": 1.680732614837921e-08, + "loss": 0.1817, + "step": 15308 + }, + { + "epoch": 4.073709419904205, + "grad_norm": 0.29247644543647766, + "learning_rate": 1.6797956169562998e-08, + "loss": 0.1782, + "step": 15309 + }, + { + "epoch": 4.073975518893028, + "grad_norm": 0.28659042716026306, + "learning_rate": 1.678858856383818e-08, + "loss": 0.1747, + "step": 15310 + }, + { + "epoch": 4.074241617881852, + "grad_norm": 0.4314073622226715, + "learning_rate": 1.677922333147188e-08, + "loss": 0.1739, + "step": 15311 + }, + { + "epoch": 4.074507716870676, + "grad_norm": 0.30594226717948914, + "learning_rate": 1.6769860472731257e-08, + "loss": 0.1573, + "step": 15312 + }, + { + "epoch": 4.0747738158594995, + "grad_norm": 0.3225177228450775, + "learning_rate": 1.6760499987883336e-08, + "loss": 0.1705, + "step": 15313 + }, + { + "epoch": 4.075039914848324, + "grad_norm": 0.44293636083602905, + "learning_rate": 1.675114187719513e-08, + "loss": 0.1832, + "step": 15314 + }, + { + "epoch": 4.075306013837148, + "grad_norm": 0.3511384129524231, + "learning_rate": 1.67417861409335e-08, + "loss": 0.1728, + "step": 15315 + }, + { + "epoch": 4.075572112825971, + "grad_norm": 0.36897194385528564, + "learning_rate": 1.6732432779365347e-08, + "loss": 0.1764, + "step": 15316 + }, + { + "epoch": 4.075838211814795, + "grad_norm": 0.946095883846283, + "learning_rate": 1.6723081792757398e-08, + "loss": 0.1808, + "step": 15317 + }, + { + "epoch": 4.076104310803619, + "grad_norm": 0.2748899459838867, + "learning_rate": 1.67137331813764e-08, + "loss": 0.1827, + "step": 15318 + }, + { + "epoch": 4.0763704097924425, + "grad_norm": 0.255500853061676, + "learning_rate": 1.6704386945488956e-08, + "loss": 0.1686, + "step": 15319 + }, + { + "epoch": 4.076636508781267, + "grad_norm": 0.32231205701828003, + "learning_rate": 1.669504308536167e-08, + "loss": 0.1802, + "step": 15320 + }, + { + "epoch": 4.076902607770091, + "grad_norm": 0.30578523874282837, + "learning_rate": 1.6685701601261026e-08, + "loss": 0.1864, + "step": 15321 + }, + { + "epoch": 4.077168706758914, + "grad_norm": 0.2733098268508911, + "learning_rate": 1.667636249345348e-08, + "loss": 0.1609, + "step": 15322 + }, + { + "epoch": 4.077434805747738, + "grad_norm": 0.2776721119880676, + "learning_rate": 1.666702576220542e-08, + "loss": 0.1657, + "step": 15323 + }, + { + "epoch": 4.077700904736562, + "grad_norm": 0.45478296279907227, + "learning_rate": 1.6657691407783105e-08, + "loss": 0.1726, + "step": 15324 + }, + { + "epoch": 4.0779670037253855, + "grad_norm": 0.2689265012741089, + "learning_rate": 1.664835943045282e-08, + "loss": 0.1719, + "step": 15325 + }, + { + "epoch": 4.07823310271421, + "grad_norm": 0.3726164400577545, + "learning_rate": 1.6639029830480666e-08, + "loss": 0.1913, + "step": 15326 + }, + { + "epoch": 4.078499201703034, + "grad_norm": 0.3579277992248535, + "learning_rate": 1.662970260813279e-08, + "loss": 0.1734, + "step": 15327 + }, + { + "epoch": 4.078765300691857, + "grad_norm": 0.27986255288124084, + "learning_rate": 1.6620377763675198e-08, + "loss": 0.1743, + "step": 15328 + }, + { + "epoch": 4.079031399680681, + "grad_norm": 0.25171130895614624, + "learning_rate": 1.6611055297373912e-08, + "loss": 0.1686, + "step": 15329 + }, + { + "epoch": 4.079297498669505, + "grad_norm": 0.31114667654037476, + "learning_rate": 1.6601735209494737e-08, + "loss": 0.168, + "step": 15330 + }, + { + "epoch": 4.079563597658329, + "grad_norm": 0.3480420708656311, + "learning_rate": 1.659241750030359e-08, + "loss": 0.1698, + "step": 15331 + }, + { + "epoch": 4.079829696647153, + "grad_norm": 0.26203280687332153, + "learning_rate": 1.6583102170066153e-08, + "loss": 0.1596, + "step": 15332 + }, + { + "epoch": 4.080095795635977, + "grad_norm": 0.31056615710258484, + "learning_rate": 1.6573789219048185e-08, + "loss": 0.1848, + "step": 15333 + }, + { + "epoch": 4.080361894624801, + "grad_norm": 0.27271246910095215, + "learning_rate": 1.6564478647515245e-08, + "loss": 0.1719, + "step": 15334 + }, + { + "epoch": 4.080627993613624, + "grad_norm": 0.27100831270217896, + "learning_rate": 1.6555170455732925e-08, + "loss": 0.1713, + "step": 15335 + }, + { + "epoch": 4.080894092602448, + "grad_norm": 0.297845721244812, + "learning_rate": 1.6545864643966722e-08, + "loss": 0.156, + "step": 15336 + }, + { + "epoch": 4.081160191591272, + "grad_norm": 0.2942059636116028, + "learning_rate": 1.653656121248206e-08, + "loss": 0.1842, + "step": 15337 + }, + { + "epoch": 4.0814262905800955, + "grad_norm": 0.2973872125148773, + "learning_rate": 1.6527260161544254e-08, + "loss": 0.1845, + "step": 15338 + }, + { + "epoch": 4.08169238956892, + "grad_norm": 0.3122895359992981, + "learning_rate": 1.651796149141862e-08, + "loss": 0.1755, + "step": 15339 + }, + { + "epoch": 4.081958488557744, + "grad_norm": 0.2728223502635956, + "learning_rate": 1.6508665202370387e-08, + "loss": 0.1712, + "step": 15340 + }, + { + "epoch": 4.082224587546567, + "grad_norm": 0.4120509922504425, + "learning_rate": 1.6499371294664667e-08, + "loss": 0.2019, + "step": 15341 + }, + { + "epoch": 4.082490686535391, + "grad_norm": 0.3419058918952942, + "learning_rate": 1.6490079768566544e-08, + "loss": 0.1858, + "step": 15342 + }, + { + "epoch": 4.082756785524215, + "grad_norm": 0.30373576283454895, + "learning_rate": 1.6480790624341067e-08, + "loss": 0.1666, + "step": 15343 + }, + { + "epoch": 4.0830228845130385, + "grad_norm": 0.3999399244785309, + "learning_rate": 1.6471503862253167e-08, + "loss": 0.1712, + "step": 15344 + }, + { + "epoch": 4.083288983501863, + "grad_norm": 0.29843851923942566, + "learning_rate": 1.6462219482567696e-08, + "loss": 0.1618, + "step": 15345 + }, + { + "epoch": 4.083555082490687, + "grad_norm": 0.35802990198135376, + "learning_rate": 1.6452937485549512e-08, + "loss": 0.1714, + "step": 15346 + }, + { + "epoch": 4.08382118147951, + "grad_norm": 0.25201982259750366, + "learning_rate": 1.64436578714633e-08, + "loss": 0.1682, + "step": 15347 + }, + { + "epoch": 4.084087280468334, + "grad_norm": 0.3711272180080414, + "learning_rate": 1.6434380640573787e-08, + "loss": 0.1802, + "step": 15348 + }, + { + "epoch": 4.084353379457158, + "grad_norm": 0.26868417859077454, + "learning_rate": 1.64251057931455e-08, + "loss": 0.1637, + "step": 15349 + }, + { + "epoch": 4.0846194784459815, + "grad_norm": 0.2863853871822357, + "learning_rate": 1.6415833329443075e-08, + "loss": 0.1629, + "step": 15350 + }, + { + "epoch": 4.084885577434806, + "grad_norm": 0.2737022042274475, + "learning_rate": 1.6406563249730908e-08, + "loss": 0.1633, + "step": 15351 + }, + { + "epoch": 4.08515167642363, + "grad_norm": 0.29003849625587463, + "learning_rate": 1.639729555427345e-08, + "loss": 0.185, + "step": 15352 + }, + { + "epoch": 4.085417775412454, + "grad_norm": 0.4664115905761719, + "learning_rate": 1.6388030243334994e-08, + "loss": 0.1944, + "step": 15353 + }, + { + "epoch": 4.085683874401277, + "grad_norm": 0.2557017505168915, + "learning_rate": 1.637876731717984e-08, + "loss": 0.1729, + "step": 15354 + }, + { + "epoch": 4.085949973390101, + "grad_norm": 0.44827285408973694, + "learning_rate": 1.6369506776072152e-08, + "loss": 0.1862, + "step": 15355 + }, + { + "epoch": 4.086216072378925, + "grad_norm": 0.2762730121612549, + "learning_rate": 1.6360248620276075e-08, + "loss": 0.1715, + "step": 15356 + }, + { + "epoch": 4.086482171367749, + "grad_norm": 0.35790300369262695, + "learning_rate": 1.6350992850055666e-08, + "loss": 0.197, + "step": 15357 + }, + { + "epoch": 4.086748270356573, + "grad_norm": 0.2872328460216522, + "learning_rate": 1.634173946567493e-08, + "loss": 0.1869, + "step": 15358 + }, + { + "epoch": 4.087014369345397, + "grad_norm": 0.2875404953956604, + "learning_rate": 1.633248846739781e-08, + "loss": 0.1927, + "step": 15359 + }, + { + "epoch": 4.08728046833422, + "grad_norm": 0.2900543212890625, + "learning_rate": 1.6323239855488113e-08, + "loss": 0.1839, + "step": 15360 + }, + { + "epoch": 4.087546567323044, + "grad_norm": 0.31061118841171265, + "learning_rate": 1.6313993630209676e-08, + "loss": 0.1774, + "step": 15361 + }, + { + "epoch": 4.087812666311868, + "grad_norm": 0.34381040930747986, + "learning_rate": 1.6304749791826177e-08, + "loss": 0.1641, + "step": 15362 + }, + { + "epoch": 4.088078765300692, + "grad_norm": 0.27032917737960815, + "learning_rate": 1.6295508340601293e-08, + "loss": 0.1688, + "step": 15363 + }, + { + "epoch": 4.088344864289516, + "grad_norm": 0.35111090540885925, + "learning_rate": 1.6286269276798615e-08, + "loss": 0.1771, + "step": 15364 + }, + { + "epoch": 4.08861096327834, + "grad_norm": 0.4308304190635681, + "learning_rate": 1.627703260068166e-08, + "loss": 0.1929, + "step": 15365 + }, + { + "epoch": 4.088877062267163, + "grad_norm": 0.26819804310798645, + "learning_rate": 1.6267798312513847e-08, + "loss": 0.1802, + "step": 15366 + }, + { + "epoch": 4.089143161255987, + "grad_norm": 0.2833310067653656, + "learning_rate": 1.6258566412558618e-08, + "loss": 0.159, + "step": 15367 + }, + { + "epoch": 4.089409260244811, + "grad_norm": 0.29818740487098694, + "learning_rate": 1.62493369010792e-08, + "loss": 0.1897, + "step": 15368 + }, + { + "epoch": 4.089675359233635, + "grad_norm": 0.26247069239616394, + "learning_rate": 1.6240109778338916e-08, + "loss": 0.1666, + "step": 15369 + }, + { + "epoch": 4.089941458222459, + "grad_norm": 0.2714632749557495, + "learning_rate": 1.6230885044600885e-08, + "loss": 0.1634, + "step": 15370 + }, + { + "epoch": 4.090207557211283, + "grad_norm": 0.26360824704170227, + "learning_rate": 1.6221662700128247e-08, + "loss": 0.1751, + "step": 15371 + }, + { + "epoch": 4.090473656200106, + "grad_norm": 0.352333664894104, + "learning_rate": 1.621244274518403e-08, + "loss": 0.1953, + "step": 15372 + }, + { + "epoch": 4.09073975518893, + "grad_norm": 0.2763585150241852, + "learning_rate": 1.6203225180031233e-08, + "loss": 0.1874, + "step": 15373 + }, + { + "epoch": 4.091005854177754, + "grad_norm": 0.2832927107810974, + "learning_rate": 1.6194010004932702e-08, + "loss": 0.1666, + "step": 15374 + }, + { + "epoch": 4.0912719531665775, + "grad_norm": 0.390948086977005, + "learning_rate": 1.6184797220151325e-08, + "loss": 0.1692, + "step": 15375 + }, + { + "epoch": 4.091538052155402, + "grad_norm": 0.2723273038864136, + "learning_rate": 1.617558682594986e-08, + "loss": 0.1885, + "step": 15376 + }, + { + "epoch": 4.091804151144226, + "grad_norm": 0.28423061966896057, + "learning_rate": 1.6166378822590986e-08, + "loss": 0.1723, + "step": 15377 + }, + { + "epoch": 4.09207025013305, + "grad_norm": 0.3228520154953003, + "learning_rate": 1.6157173210337348e-08, + "loss": 0.1648, + "step": 15378 + }, + { + "epoch": 4.092336349121873, + "grad_norm": 0.2876993715763092, + "learning_rate": 1.6147969989451505e-08, + "loss": 0.1654, + "step": 15379 + }, + { + "epoch": 4.092602448110697, + "grad_norm": 0.2671501934528351, + "learning_rate": 1.6138769160195965e-08, + "loss": 0.1857, + "step": 15380 + }, + { + "epoch": 4.092868547099521, + "grad_norm": 0.28046196699142456, + "learning_rate": 1.612957072283313e-08, + "loss": 0.1597, + "step": 15381 + }, + { + "epoch": 4.093134646088345, + "grad_norm": 0.4043821394443512, + "learning_rate": 1.6120374677625393e-08, + "loss": 0.159, + "step": 15382 + }, + { + "epoch": 4.093400745077169, + "grad_norm": 0.3130515217781067, + "learning_rate": 1.6111181024834995e-08, + "loss": 0.1708, + "step": 15383 + }, + { + "epoch": 4.093666844065993, + "grad_norm": 0.3150431513786316, + "learning_rate": 1.6101989764724212e-08, + "loss": 0.1779, + "step": 15384 + }, + { + "epoch": 4.093932943054816, + "grad_norm": 0.3592856824398041, + "learning_rate": 1.6092800897555147e-08, + "loss": 0.1961, + "step": 15385 + }, + { + "epoch": 4.09419904204364, + "grad_norm": 0.34582775831222534, + "learning_rate": 1.6083614423589907e-08, + "loss": 0.1844, + "step": 15386 + }, + { + "epoch": 4.094465141032464, + "grad_norm": 0.27735498547554016, + "learning_rate": 1.6074430343090507e-08, + "loss": 0.1594, + "step": 15387 + }, + { + "epoch": 4.094731240021288, + "grad_norm": 0.2931443154811859, + "learning_rate": 1.6065248656318918e-08, + "loss": 0.1719, + "step": 15388 + }, + { + "epoch": 4.094997339010112, + "grad_norm": 0.3791137635707855, + "learning_rate": 1.6056069363536983e-08, + "loss": 0.1842, + "step": 15389 + }, + { + "epoch": 4.095263437998936, + "grad_norm": 0.26247626543045044, + "learning_rate": 1.6046892465006566e-08, + "loss": 0.1593, + "step": 15390 + }, + { + "epoch": 4.095529536987759, + "grad_norm": 0.26658621430397034, + "learning_rate": 1.603771796098934e-08, + "loss": 0.1759, + "step": 15391 + }, + { + "epoch": 4.095795635976583, + "grad_norm": 0.3807286322116852, + "learning_rate": 1.602854585174702e-08, + "loss": 0.1704, + "step": 15392 + }, + { + "epoch": 4.096061734965407, + "grad_norm": 0.2773820161819458, + "learning_rate": 1.6019376137541207e-08, + "loss": 0.1798, + "step": 15393 + }, + { + "epoch": 4.096327833954231, + "grad_norm": 0.4235684871673584, + "learning_rate": 1.6010208818633454e-08, + "loss": 0.1866, + "step": 15394 + }, + { + "epoch": 4.096593932943055, + "grad_norm": 0.38182201981544495, + "learning_rate": 1.6001043895285237e-08, + "loss": 0.1796, + "step": 15395 + }, + { + "epoch": 4.096860031931879, + "grad_norm": 0.30344539880752563, + "learning_rate": 1.599188136775791e-08, + "loss": 0.1874, + "step": 15396 + }, + { + "epoch": 4.097126130920702, + "grad_norm": 0.3956640362739563, + "learning_rate": 1.598272123631288e-08, + "loss": 0.176, + "step": 15397 + }, + { + "epoch": 4.097392229909526, + "grad_norm": 0.308861643075943, + "learning_rate": 1.597356350121134e-08, + "loss": 0.1822, + "step": 15398 + }, + { + "epoch": 4.09765832889835, + "grad_norm": 0.3597905933856964, + "learning_rate": 1.5964408162714514e-08, + "loss": 0.1756, + "step": 15399 + }, + { + "epoch": 4.097924427887174, + "grad_norm": 0.2994506359100342, + "learning_rate": 1.595525522108354e-08, + "loss": 0.184, + "step": 15400 + }, + { + "epoch": 4.098190526875998, + "grad_norm": 0.38064175844192505, + "learning_rate": 1.5946104676579497e-08, + "loss": 0.1797, + "step": 15401 + }, + { + "epoch": 4.098456625864822, + "grad_norm": 0.36917805671691895, + "learning_rate": 1.593695652946333e-08, + "loss": 0.1812, + "step": 15402 + }, + { + "epoch": 4.098722724853646, + "grad_norm": 0.3549794554710388, + "learning_rate": 1.592781077999602e-08, + "loss": 0.1666, + "step": 15403 + }, + { + "epoch": 4.098988823842469, + "grad_norm": 0.26638108491897583, + "learning_rate": 1.591866742843835e-08, + "loss": 0.1693, + "step": 15404 + }, + { + "epoch": 4.099254922831293, + "grad_norm": 0.3413528501987457, + "learning_rate": 1.5909526475051182e-08, + "loss": 0.178, + "step": 15405 + }, + { + "epoch": 4.0995210218201175, + "grad_norm": 0.29004374146461487, + "learning_rate": 1.590038792009517e-08, + "loss": 0.1766, + "step": 15406 + }, + { + "epoch": 4.099787120808941, + "grad_norm": 0.2657945454120636, + "learning_rate": 1.5891251763830992e-08, + "loss": 0.1674, + "step": 15407 + }, + { + "epoch": 4.100053219797765, + "grad_norm": 0.31017541885375977, + "learning_rate": 1.588211800651924e-08, + "loss": 0.1764, + "step": 15408 + }, + { + "epoch": 4.100319318786589, + "grad_norm": 0.46578294038772583, + "learning_rate": 1.5872986648420428e-08, + "loss": 0.1692, + "step": 15409 + }, + { + "epoch": 4.100585417775412, + "grad_norm": 0.38978973031044006, + "learning_rate": 1.586385768979498e-08, + "loss": 0.1907, + "step": 15410 + }, + { + "epoch": 4.100851516764236, + "grad_norm": 0.3255953788757324, + "learning_rate": 1.5854731130903276e-08, + "loss": 0.1733, + "step": 15411 + }, + { + "epoch": 4.10111761575306, + "grad_norm": 0.29617419838905334, + "learning_rate": 1.5845606972005653e-08, + "loss": 0.164, + "step": 15412 + }, + { + "epoch": 4.101383714741884, + "grad_norm": 0.2776470482349396, + "learning_rate": 1.5836485213362315e-08, + "loss": 0.1708, + "step": 15413 + }, + { + "epoch": 4.101649813730708, + "grad_norm": 0.30966243147850037, + "learning_rate": 1.5827365855233444e-08, + "loss": 0.1857, + "step": 15414 + }, + { + "epoch": 4.101915912719532, + "grad_norm": 0.2727900743484497, + "learning_rate": 1.581824889787914e-08, + "loss": 0.1798, + "step": 15415 + }, + { + "epoch": 4.102182011708355, + "grad_norm": 0.2763073146343231, + "learning_rate": 1.580913434155947e-08, + "loss": 0.1853, + "step": 15416 + }, + { + "epoch": 4.102448110697179, + "grad_norm": 0.27594149112701416, + "learning_rate": 1.5800022186534356e-08, + "loss": 0.1638, + "step": 15417 + }, + { + "epoch": 4.102714209686003, + "grad_norm": 0.29477301239967346, + "learning_rate": 1.579091243306372e-08, + "loss": 0.1816, + "step": 15418 + }, + { + "epoch": 4.102980308674827, + "grad_norm": 0.2693306505680084, + "learning_rate": 1.5781805081407373e-08, + "loss": 0.1698, + "step": 15419 + }, + { + "epoch": 4.103246407663651, + "grad_norm": 0.3323078751564026, + "learning_rate": 1.57727001318251e-08, + "loss": 0.1885, + "step": 15420 + }, + { + "epoch": 4.103512506652475, + "grad_norm": 0.3246299922466278, + "learning_rate": 1.576359758457656e-08, + "loss": 0.177, + "step": 15421 + }, + { + "epoch": 4.103778605641298, + "grad_norm": 0.25320082902908325, + "learning_rate": 1.5754497439921387e-08, + "loss": 0.1696, + "step": 15422 + }, + { + "epoch": 4.104044704630122, + "grad_norm": 0.27040335536003113, + "learning_rate": 1.5745399698119143e-08, + "loss": 0.1735, + "step": 15423 + }, + { + "epoch": 4.104310803618946, + "grad_norm": 0.2991090714931488, + "learning_rate": 1.5736304359429342e-08, + "loss": 0.1775, + "step": 15424 + }, + { + "epoch": 4.10457690260777, + "grad_norm": 0.2955278754234314, + "learning_rate": 1.5727211424111342e-08, + "loss": 0.1656, + "step": 15425 + }, + { + "epoch": 4.104843001596594, + "grad_norm": 0.2854263484477997, + "learning_rate": 1.5718120892424557e-08, + "loss": 0.1683, + "step": 15426 + }, + { + "epoch": 4.105109100585418, + "grad_norm": 0.300915002822876, + "learning_rate": 1.5709032764628194e-08, + "loss": 0.1775, + "step": 15427 + }, + { + "epoch": 4.105375199574242, + "grad_norm": 0.3426893651485443, + "learning_rate": 1.569994704098152e-08, + "loss": 0.1646, + "step": 15428 + }, + { + "epoch": 4.105641298563065, + "grad_norm": 0.33755844831466675, + "learning_rate": 1.5690863721743653e-08, + "loss": 0.2079, + "step": 15429 + }, + { + "epoch": 4.105907397551889, + "grad_norm": 0.37326112389564514, + "learning_rate": 1.5681782807173682e-08, + "loss": 0.1924, + "step": 15430 + }, + { + "epoch": 4.1061734965407135, + "grad_norm": 0.4033002257347107, + "learning_rate": 1.5672704297530625e-08, + "loss": 0.1885, + "step": 15431 + }, + { + "epoch": 4.106439595529537, + "grad_norm": 0.289191871881485, + "learning_rate": 1.5663628193073386e-08, + "loss": 0.1717, + "step": 15432 + }, + { + "epoch": 4.106705694518361, + "grad_norm": 0.26075512170791626, + "learning_rate": 1.565455449406087e-08, + "loss": 0.1484, + "step": 15433 + }, + { + "epoch": 4.106971793507185, + "grad_norm": 0.33073651790618896, + "learning_rate": 1.564548320075183e-08, + "loss": 0.1681, + "step": 15434 + }, + { + "epoch": 4.107237892496008, + "grad_norm": 0.3131864368915558, + "learning_rate": 1.5636414313405054e-08, + "loss": 0.1974, + "step": 15435 + }, + { + "epoch": 4.107503991484832, + "grad_norm": 0.31438887119293213, + "learning_rate": 1.5627347832279135e-08, + "loss": 0.1792, + "step": 15436 + }, + { + "epoch": 4.1077700904736565, + "grad_norm": 0.2897157669067383, + "learning_rate": 1.5618283757632754e-08, + "loss": 0.1949, + "step": 15437 + }, + { + "epoch": 4.10803618946248, + "grad_norm": 0.31922611594200134, + "learning_rate": 1.560922208972436e-08, + "loss": 0.1824, + "step": 15438 + }, + { + "epoch": 4.108302288451304, + "grad_norm": 0.41623660922050476, + "learning_rate": 1.560016282881248e-08, + "loss": 0.2053, + "step": 15439 + }, + { + "epoch": 4.108568387440128, + "grad_norm": 0.2884977161884308, + "learning_rate": 1.5591105975155426e-08, + "loss": 0.1677, + "step": 15440 + }, + { + "epoch": 4.108834486428951, + "grad_norm": 0.354594349861145, + "learning_rate": 1.5582051529011585e-08, + "loss": 0.1866, + "step": 15441 + }, + { + "epoch": 4.109100585417775, + "grad_norm": 0.3413296341896057, + "learning_rate": 1.557299949063916e-08, + "loss": 0.1962, + "step": 15442 + }, + { + "epoch": 4.1093666844065995, + "grad_norm": 0.3038215935230255, + "learning_rate": 1.5563949860296355e-08, + "loss": 0.1859, + "step": 15443 + }, + { + "epoch": 4.109632783395423, + "grad_norm": 0.3276216387748718, + "learning_rate": 1.5554902638241286e-08, + "loss": 0.1854, + "step": 15444 + }, + { + "epoch": 4.109898882384247, + "grad_norm": 0.2635754346847534, + "learning_rate": 1.5545857824732e-08, + "loss": 0.168, + "step": 15445 + }, + { + "epoch": 4.110164981373071, + "grad_norm": 0.31346532702445984, + "learning_rate": 1.5536815420026462e-08, + "loss": 0.1856, + "step": 15446 + }, + { + "epoch": 4.110431080361894, + "grad_norm": 0.2840452790260315, + "learning_rate": 1.5527775424382604e-08, + "loss": 0.1761, + "step": 15447 + }, + { + "epoch": 4.110697179350718, + "grad_norm": 0.2637457847595215, + "learning_rate": 1.5518737838058217e-08, + "loss": 0.1683, + "step": 15448 + }, + { + "epoch": 4.1109632783395424, + "grad_norm": 0.3463171124458313, + "learning_rate": 1.550970266131111e-08, + "loss": 0.168, + "step": 15449 + }, + { + "epoch": 4.111229377328367, + "grad_norm": 0.276634156703949, + "learning_rate": 1.5500669894398965e-08, + "loss": 0.1804, + "step": 15450 + }, + { + "epoch": 4.11149547631719, + "grad_norm": 0.3194681406021118, + "learning_rate": 1.5491639537579437e-08, + "loss": 0.1885, + "step": 15451 + }, + { + "epoch": 4.111761575306014, + "grad_norm": 0.3020527958869934, + "learning_rate": 1.5482611591110094e-08, + "loss": 0.166, + "step": 15452 + }, + { + "epoch": 4.112027674294838, + "grad_norm": 0.39308467507362366, + "learning_rate": 1.54735860552484e-08, + "loss": 0.186, + "step": 15453 + }, + { + "epoch": 4.112293773283661, + "grad_norm": 0.28280556201934814, + "learning_rate": 1.5464562930251813e-08, + "loss": 0.172, + "step": 15454 + }, + { + "epoch": 4.112559872272485, + "grad_norm": 0.3357362449169159, + "learning_rate": 1.545554221637766e-08, + "loss": 0.1874, + "step": 15455 + }, + { + "epoch": 4.1128259712613096, + "grad_norm": 0.2729779779911041, + "learning_rate": 1.544652391388327e-08, + "loss": 0.1685, + "step": 15456 + }, + { + "epoch": 4.113092070250133, + "grad_norm": 0.2855370342731476, + "learning_rate": 1.5437508023025804e-08, + "loss": 0.1753, + "step": 15457 + }, + { + "epoch": 4.113358169238957, + "grad_norm": 0.3683962821960449, + "learning_rate": 1.5428494544062453e-08, + "loss": 0.1969, + "step": 15458 + }, + { + "epoch": 4.113624268227781, + "grad_norm": 0.2815377116203308, + "learning_rate": 1.5419483477250294e-08, + "loss": 0.1783, + "step": 15459 + }, + { + "epoch": 4.113890367216604, + "grad_norm": 0.28718605637550354, + "learning_rate": 1.5410474822846374e-08, + "loss": 0.1709, + "step": 15460 + }, + { + "epoch": 4.114156466205428, + "grad_norm": 0.2848663032054901, + "learning_rate": 1.540146858110758e-08, + "loss": 0.1733, + "step": 15461 + }, + { + "epoch": 4.1144225651942525, + "grad_norm": 0.32132384181022644, + "learning_rate": 1.5392464752290835e-08, + "loss": 0.1852, + "step": 15462 + }, + { + "epoch": 4.114688664183076, + "grad_norm": 0.4047758877277374, + "learning_rate": 1.5383463336652902e-08, + "loss": 0.1805, + "step": 15463 + }, + { + "epoch": 4.1149547631719, + "grad_norm": 0.4333855211734772, + "learning_rate": 1.5374464334450544e-08, + "loss": 0.1933, + "step": 15464 + }, + { + "epoch": 4.115220862160724, + "grad_norm": 0.33868157863616943, + "learning_rate": 1.536546774594043e-08, + "loss": 0.158, + "step": 15465 + }, + { + "epoch": 4.115486961149547, + "grad_norm": 0.2898003160953522, + "learning_rate": 1.5356473571379158e-08, + "loss": 0.1783, + "step": 15466 + }, + { + "epoch": 4.115753060138371, + "grad_norm": 0.2954498529434204, + "learning_rate": 1.534748181102329e-08, + "loss": 0.1772, + "step": 15467 + }, + { + "epoch": 4.1160191591271955, + "grad_norm": 0.29711219668388367, + "learning_rate": 1.5338492465129236e-08, + "loss": 0.1836, + "step": 15468 + }, + { + "epoch": 4.116285258116019, + "grad_norm": 0.271244615316391, + "learning_rate": 1.5329505533953447e-08, + "loss": 0.1811, + "step": 15469 + }, + { + "epoch": 4.116551357104843, + "grad_norm": 0.321010947227478, + "learning_rate": 1.5320521017752197e-08, + "loss": 0.1654, + "step": 15470 + }, + { + "epoch": 4.116817456093667, + "grad_norm": 0.25046396255493164, + "learning_rate": 1.5311538916781786e-08, + "loss": 0.1602, + "step": 15471 + }, + { + "epoch": 4.117083555082491, + "grad_norm": 0.3275277018547058, + "learning_rate": 1.530255923129835e-08, + "loss": 0.1887, + "step": 15472 + }, + { + "epoch": 4.117349654071314, + "grad_norm": 0.25976717472076416, + "learning_rate": 1.5293581961558045e-08, + "loss": 0.163, + "step": 15473 + }, + { + "epoch": 4.1176157530601385, + "grad_norm": 0.2830830216407776, + "learning_rate": 1.5284607107816916e-08, + "loss": 0.1649, + "step": 15474 + }, + { + "epoch": 4.117881852048963, + "grad_norm": 0.28768691420555115, + "learning_rate": 1.5275634670330962e-08, + "loss": 0.18, + "step": 15475 + }, + { + "epoch": 4.118147951037786, + "grad_norm": 0.2758127748966217, + "learning_rate": 1.5266664649356053e-08, + "loss": 0.1625, + "step": 15476 + }, + { + "epoch": 4.11841405002661, + "grad_norm": 0.42169684171676636, + "learning_rate": 1.5257697045148076e-08, + "loss": 0.1999, + "step": 15477 + }, + { + "epoch": 4.118680149015434, + "grad_norm": 0.35476553440093994, + "learning_rate": 1.524873185796276e-08, + "loss": 0.18, + "step": 15478 + }, + { + "epoch": 4.118946248004257, + "grad_norm": 0.2947482168674469, + "learning_rate": 1.5239769088055843e-08, + "loss": 0.1762, + "step": 15479 + }, + { + "epoch": 4.1192123469930815, + "grad_norm": 0.2798532545566559, + "learning_rate": 1.5230808735682943e-08, + "loss": 0.1655, + "step": 15480 + }, + { + "epoch": 4.119478445981906, + "grad_norm": 0.2501142621040344, + "learning_rate": 1.5221850801099668e-08, + "loss": 0.1605, + "step": 15481 + }, + { + "epoch": 4.119744544970729, + "grad_norm": 0.39320752024650574, + "learning_rate": 1.5212895284561455e-08, + "loss": 0.1942, + "step": 15482 + }, + { + "epoch": 4.120010643959553, + "grad_norm": 0.252564936876297, + "learning_rate": 1.5203942186323783e-08, + "loss": 0.1696, + "step": 15483 + }, + { + "epoch": 4.120276742948377, + "grad_norm": 0.33129003643989563, + "learning_rate": 1.5194991506641986e-08, + "loss": 0.1923, + "step": 15484 + }, + { + "epoch": 4.1205428419372, + "grad_norm": 0.31029951572418213, + "learning_rate": 1.5186043245771353e-08, + "loss": 0.1908, + "step": 15485 + }, + { + "epoch": 4.1208089409260245, + "grad_norm": 0.3660017251968384, + "learning_rate": 1.5177097403967144e-08, + "loss": 0.1667, + "step": 15486 + }, + { + "epoch": 4.121075039914849, + "grad_norm": 0.29236510396003723, + "learning_rate": 1.5168153981484432e-08, + "loss": 0.1841, + "step": 15487 + }, + { + "epoch": 4.121341138903672, + "grad_norm": 0.35958215594291687, + "learning_rate": 1.5159212978578406e-08, + "loss": 0.1975, + "step": 15488 + }, + { + "epoch": 4.121607237892496, + "grad_norm": 0.2780223786830902, + "learning_rate": 1.5150274395504004e-08, + "loss": 0.1706, + "step": 15489 + }, + { + "epoch": 4.12187333688132, + "grad_norm": 0.2744198143482208, + "learning_rate": 1.5141338232516233e-08, + "loss": 0.1748, + "step": 15490 + }, + { + "epoch": 4.122139435870143, + "grad_norm": 0.27723026275634766, + "learning_rate": 1.5132404489869897e-08, + "loss": 0.1794, + "step": 15491 + }, + { + "epoch": 4.122405534858967, + "grad_norm": 0.2647680938243866, + "learning_rate": 1.512347316781988e-08, + "loss": 0.1579, + "step": 15492 + }, + { + "epoch": 4.122671633847792, + "grad_norm": 0.4490789473056793, + "learning_rate": 1.5114544266620856e-08, + "loss": 0.17, + "step": 15493 + }, + { + "epoch": 4.122937732836615, + "grad_norm": 0.267959862947464, + "learning_rate": 1.5105617786527534e-08, + "loss": 0.1661, + "step": 15494 + }, + { + "epoch": 4.123203831825439, + "grad_norm": 0.2740338444709778, + "learning_rate": 1.5096693727794496e-08, + "loss": 0.1725, + "step": 15495 + }, + { + "epoch": 4.123469930814263, + "grad_norm": 0.2848115563392639, + "learning_rate": 1.508777209067631e-08, + "loss": 0.1792, + "step": 15496 + }, + { + "epoch": 4.123736029803087, + "grad_norm": 0.4132993221282959, + "learning_rate": 1.5078852875427396e-08, + "loss": 0.1876, + "step": 15497 + }, + { + "epoch": 4.12400212879191, + "grad_norm": 0.28324687480926514, + "learning_rate": 1.5069936082302183e-08, + "loss": 0.1776, + "step": 15498 + }, + { + "epoch": 4.1242682277807345, + "grad_norm": 0.26489177346229553, + "learning_rate": 1.5061021711554966e-08, + "loss": 0.1529, + "step": 15499 + }, + { + "epoch": 4.124534326769559, + "grad_norm": 0.2802461087703705, + "learning_rate": 1.5052109763440023e-08, + "loss": 0.1684, + "step": 15500 + }, + { + "epoch": 4.124800425758382, + "grad_norm": 0.3584613800048828, + "learning_rate": 1.5043200238211495e-08, + "loss": 0.176, + "step": 15501 + }, + { + "epoch": 4.125066524747206, + "grad_norm": 0.29291000962257385, + "learning_rate": 1.5034293136123556e-08, + "loss": 0.1794, + "step": 15502 + }, + { + "epoch": 4.12533262373603, + "grad_norm": 0.29863226413726807, + "learning_rate": 1.5025388457430265e-08, + "loss": 0.164, + "step": 15503 + }, + { + "epoch": 4.125598722724853, + "grad_norm": 0.27851763367652893, + "learning_rate": 1.5016486202385536e-08, + "loss": 0.1831, + "step": 15504 + }, + { + "epoch": 4.1258648217136775, + "grad_norm": 0.2778940796852112, + "learning_rate": 1.5007586371243352e-08, + "loss": 0.1655, + "step": 15505 + }, + { + "epoch": 4.126130920702502, + "grad_norm": 0.3102130889892578, + "learning_rate": 1.499868896425749e-08, + "loss": 0.1862, + "step": 15506 + }, + { + "epoch": 4.126397019691325, + "grad_norm": 0.33326536417007446, + "learning_rate": 1.4989793981681765e-08, + "loss": 0.1729, + "step": 15507 + }, + { + "epoch": 4.126663118680149, + "grad_norm": 0.3145354688167572, + "learning_rate": 1.4980901423769854e-08, + "loss": 0.1745, + "step": 15508 + }, + { + "epoch": 4.126929217668973, + "grad_norm": 0.2689979672431946, + "learning_rate": 1.497201129077539e-08, + "loss": 0.172, + "step": 15509 + }, + { + "epoch": 4.127195316657796, + "grad_norm": 0.33698251843452454, + "learning_rate": 1.4963123582951953e-08, + "loss": 0.1818, + "step": 15510 + }, + { + "epoch": 4.1274614156466205, + "grad_norm": 0.3638078272342682, + "learning_rate": 1.4954238300553067e-08, + "loss": 0.1678, + "step": 15511 + }, + { + "epoch": 4.127727514635445, + "grad_norm": 0.302675724029541, + "learning_rate": 1.49453554438321e-08, + "loss": 0.1707, + "step": 15512 + }, + { + "epoch": 4.127993613624268, + "grad_norm": 0.2837338447570801, + "learning_rate": 1.493647501304245e-08, + "loss": 0.165, + "step": 15513 + }, + { + "epoch": 4.128259712613092, + "grad_norm": 0.27002057433128357, + "learning_rate": 1.4927597008437365e-08, + "loss": 0.1602, + "step": 15514 + }, + { + "epoch": 4.128525811601916, + "grad_norm": 0.29817917943000793, + "learning_rate": 1.49187214302701e-08, + "loss": 0.1754, + "step": 15515 + }, + { + "epoch": 4.128791910590739, + "grad_norm": 0.28412920236587524, + "learning_rate": 1.490984827879378e-08, + "loss": 0.1755, + "step": 15516 + }, + { + "epoch": 4.1290580095795635, + "grad_norm": 0.26270151138305664, + "learning_rate": 1.490097755426153e-08, + "loss": 0.16, + "step": 15517 + }, + { + "epoch": 4.129324108568388, + "grad_norm": 0.3607224225997925, + "learning_rate": 1.489210925692631e-08, + "loss": 0.1885, + "step": 15518 + }, + { + "epoch": 4.129590207557211, + "grad_norm": 0.2712664008140564, + "learning_rate": 1.48832433870411e-08, + "loss": 0.1776, + "step": 15519 + }, + { + "epoch": 4.129856306546035, + "grad_norm": 0.2757965326309204, + "learning_rate": 1.4874379944858728e-08, + "loss": 0.1723, + "step": 15520 + }, + { + "epoch": 4.130122405534859, + "grad_norm": 0.26956796646118164, + "learning_rate": 1.4865518930632026e-08, + "loss": 0.1749, + "step": 15521 + }, + { + "epoch": 4.130388504523683, + "grad_norm": 0.34133222699165344, + "learning_rate": 1.4856660344613748e-08, + "loss": 0.1847, + "step": 15522 + }, + { + "epoch": 4.1306546035125065, + "grad_norm": 0.3592052757740021, + "learning_rate": 1.4847804187056512e-08, + "loss": 0.1856, + "step": 15523 + }, + { + "epoch": 4.130920702501331, + "grad_norm": 0.35802072286605835, + "learning_rate": 1.483895045821294e-08, + "loss": 0.189, + "step": 15524 + }, + { + "epoch": 4.131186801490155, + "grad_norm": 0.28064122796058655, + "learning_rate": 1.4830099158335562e-08, + "loss": 0.1634, + "step": 15525 + }, + { + "epoch": 4.131452900478978, + "grad_norm": 0.35723984241485596, + "learning_rate": 1.4821250287676845e-08, + "loss": 0.1952, + "step": 15526 + }, + { + "epoch": 4.131718999467802, + "grad_norm": 0.28587913513183594, + "learning_rate": 1.481240384648914e-08, + "loss": 0.1749, + "step": 15527 + }, + { + "epoch": 4.131985098456626, + "grad_norm": 0.3694190979003906, + "learning_rate": 1.4803559835024814e-08, + "loss": 0.1865, + "step": 15528 + }, + { + "epoch": 4.132251197445449, + "grad_norm": 0.4314327836036682, + "learning_rate": 1.4794718253536065e-08, + "loss": 0.1747, + "step": 15529 + }, + { + "epoch": 4.132517296434274, + "grad_norm": 0.378467321395874, + "learning_rate": 1.478587910227509e-08, + "loss": 0.1699, + "step": 15530 + }, + { + "epoch": 4.132783395423098, + "grad_norm": 0.3097953200340271, + "learning_rate": 1.4777042381494008e-08, + "loss": 0.1836, + "step": 15531 + }, + { + "epoch": 4.133049494411921, + "grad_norm": 0.33465811610221863, + "learning_rate": 1.476820809144489e-08, + "loss": 0.1913, + "step": 15532 + }, + { + "epoch": 4.133315593400745, + "grad_norm": 0.35260969400405884, + "learning_rate": 1.475937623237965e-08, + "loss": 0.171, + "step": 15533 + }, + { + "epoch": 4.133581692389569, + "grad_norm": 0.27899202704429626, + "learning_rate": 1.4750546804550235e-08, + "loss": 0.1729, + "step": 15534 + }, + { + "epoch": 4.133847791378392, + "grad_norm": 0.42250826954841614, + "learning_rate": 1.4741719808208441e-08, + "loss": 0.1825, + "step": 15535 + }, + { + "epoch": 4.1341138903672165, + "grad_norm": 0.29851534962654114, + "learning_rate": 1.473289524360607e-08, + "loss": 0.177, + "step": 15536 + }, + { + "epoch": 4.134379989356041, + "grad_norm": 0.34181371331214905, + "learning_rate": 1.4724073110994784e-08, + "loss": 0.1852, + "step": 15537 + }, + { + "epoch": 4.134646088344864, + "grad_norm": 0.28325870633125305, + "learning_rate": 1.4715253410626216e-08, + "loss": 0.1691, + "step": 15538 + }, + { + "epoch": 4.134912187333688, + "grad_norm": 0.3243364691734314, + "learning_rate": 1.4706436142751921e-08, + "loss": 0.1767, + "step": 15539 + }, + { + "epoch": 4.135178286322512, + "grad_norm": 0.3277949392795563, + "learning_rate": 1.4697621307623398e-08, + "loss": 0.1709, + "step": 15540 + }, + { + "epoch": 4.135444385311335, + "grad_norm": 0.2678946256637573, + "learning_rate": 1.468880890549209e-08, + "loss": 0.1659, + "step": 15541 + }, + { + "epoch": 4.1357104843001595, + "grad_norm": 0.34912341833114624, + "learning_rate": 1.4679998936609273e-08, + "loss": 0.1786, + "step": 15542 + }, + { + "epoch": 4.135976583288984, + "grad_norm": 0.2745736539363861, + "learning_rate": 1.4671191401226301e-08, + "loss": 0.1682, + "step": 15543 + }, + { + "epoch": 4.136242682277807, + "grad_norm": 0.27669718861579895, + "learning_rate": 1.466238629959432e-08, + "loss": 0.1781, + "step": 15544 + }, + { + "epoch": 4.136508781266631, + "grad_norm": 0.2697046995162964, + "learning_rate": 1.4653583631964495e-08, + "loss": 0.1773, + "step": 15545 + }, + { + "epoch": 4.136774880255455, + "grad_norm": 0.2784123718738556, + "learning_rate": 1.46447833985879e-08, + "loss": 0.1662, + "step": 15546 + }, + { + "epoch": 4.137040979244279, + "grad_norm": 0.30959588289260864, + "learning_rate": 1.463598559971555e-08, + "loss": 0.1898, + "step": 15547 + }, + { + "epoch": 4.1373070782331025, + "grad_norm": 0.31913506984710693, + "learning_rate": 1.4627190235598342e-08, + "loss": 0.1949, + "step": 15548 + }, + { + "epoch": 4.137573177221927, + "grad_norm": 0.40931200981140137, + "learning_rate": 1.4618397306487174e-08, + "loss": 0.1945, + "step": 15549 + }, + { + "epoch": 4.137839276210751, + "grad_norm": 0.3731653690338135, + "learning_rate": 1.460960681263279e-08, + "loss": 0.2183, + "step": 15550 + }, + { + "epoch": 4.138105375199574, + "grad_norm": 0.26396679878234863, + "learning_rate": 1.460081875428597e-08, + "loss": 0.1681, + "step": 15551 + }, + { + "epoch": 4.138371474188398, + "grad_norm": 0.3911936581134796, + "learning_rate": 1.459203313169729e-08, + "loss": 0.1812, + "step": 15552 + }, + { + "epoch": 4.138637573177222, + "grad_norm": 0.41745734214782715, + "learning_rate": 1.4583249945117437e-08, + "loss": 0.1913, + "step": 15553 + }, + { + "epoch": 4.1389036721660455, + "grad_norm": 0.2863408923149109, + "learning_rate": 1.4574469194796835e-08, + "loss": 0.1733, + "step": 15554 + }, + { + "epoch": 4.13916977115487, + "grad_norm": 0.25736385583877563, + "learning_rate": 1.4565690880985992e-08, + "loss": 0.1551, + "step": 15555 + }, + { + "epoch": 4.139435870143694, + "grad_norm": 0.3386618494987488, + "learning_rate": 1.4556915003935233e-08, + "loss": 0.1777, + "step": 15556 + }, + { + "epoch": 4.139701969132517, + "grad_norm": 0.2864183187484741, + "learning_rate": 1.454814156389489e-08, + "loss": 0.1778, + "step": 15557 + }, + { + "epoch": 4.139968068121341, + "grad_norm": 0.2817186713218689, + "learning_rate": 1.4539370561115217e-08, + "loss": 0.1862, + "step": 15558 + }, + { + "epoch": 4.140234167110165, + "grad_norm": 0.3261498212814331, + "learning_rate": 1.4530601995846337e-08, + "loss": 0.1725, + "step": 15559 + }, + { + "epoch": 4.1405002660989885, + "grad_norm": 0.30897092819213867, + "learning_rate": 1.4521835868338372e-08, + "loss": 0.1707, + "step": 15560 + }, + { + "epoch": 4.140766365087813, + "grad_norm": 0.32219159603118896, + "learning_rate": 1.4513072178841346e-08, + "loss": 0.1773, + "step": 15561 + }, + { + "epoch": 4.141032464076637, + "grad_norm": 0.27985280752182007, + "learning_rate": 1.4504310927605246e-08, + "loss": 0.1834, + "step": 15562 + }, + { + "epoch": 4.14129856306546, + "grad_norm": 0.3700472414493561, + "learning_rate": 1.4495552114879905e-08, + "loss": 0.1788, + "step": 15563 + }, + { + "epoch": 4.141564662054284, + "grad_norm": 0.3692394495010376, + "learning_rate": 1.4486795740915202e-08, + "loss": 0.1807, + "step": 15564 + }, + { + "epoch": 4.141830761043108, + "grad_norm": 0.3727920353412628, + "learning_rate": 1.4478041805960828e-08, + "loss": 0.1856, + "step": 15565 + }, + { + "epoch": 4.1420968600319314, + "grad_norm": 0.2649591565132141, + "learning_rate": 1.4469290310266491e-08, + "loss": 0.1672, + "step": 15566 + }, + { + "epoch": 4.142362959020756, + "grad_norm": 0.3737409710884094, + "learning_rate": 1.4460541254081794e-08, + "loss": 0.1933, + "step": 15567 + }, + { + "epoch": 4.14262905800958, + "grad_norm": 0.303900808095932, + "learning_rate": 1.4451794637656312e-08, + "loss": 0.174, + "step": 15568 + }, + { + "epoch": 4.142895156998404, + "grad_norm": 0.2653833329677582, + "learning_rate": 1.444305046123947e-08, + "loss": 0.1703, + "step": 15569 + }, + { + "epoch": 4.143161255987227, + "grad_norm": 0.29161328077316284, + "learning_rate": 1.4434308725080713e-08, + "loss": 0.1733, + "step": 15570 + }, + { + "epoch": 4.143427354976051, + "grad_norm": 0.258235901594162, + "learning_rate": 1.4425569429429319e-08, + "loss": 0.1598, + "step": 15571 + }, + { + "epoch": 4.143693453964875, + "grad_norm": 0.4270451068878174, + "learning_rate": 1.441683257453461e-08, + "loss": 0.1842, + "step": 15572 + }, + { + "epoch": 4.1439595529536986, + "grad_norm": 0.2916072607040405, + "learning_rate": 1.4408098160645722e-08, + "loss": 0.1753, + "step": 15573 + }, + { + "epoch": 4.144225651942523, + "grad_norm": 0.2630694806575775, + "learning_rate": 1.4399366188011808e-08, + "loss": 0.1666, + "step": 15574 + }, + { + "epoch": 4.144491750931347, + "grad_norm": 0.25769174098968506, + "learning_rate": 1.4390636656881916e-08, + "loss": 0.1469, + "step": 15575 + }, + { + "epoch": 4.14475784992017, + "grad_norm": 0.33057209849357605, + "learning_rate": 1.4381909567505035e-08, + "loss": 0.163, + "step": 15576 + }, + { + "epoch": 4.145023948908994, + "grad_norm": 0.27098312973976135, + "learning_rate": 1.4373184920130099e-08, + "loss": 0.1685, + "step": 15577 + }, + { + "epoch": 4.145290047897818, + "grad_norm": 0.37407398223876953, + "learning_rate": 1.4364462715005909e-08, + "loss": 0.1962, + "step": 15578 + }, + { + "epoch": 4.1455561468866415, + "grad_norm": 0.41803038120269775, + "learning_rate": 1.4355742952381289e-08, + "loss": 0.1955, + "step": 15579 + }, + { + "epoch": 4.145822245875466, + "grad_norm": 0.3627992570400238, + "learning_rate": 1.4347025632504895e-08, + "loss": 0.1783, + "step": 15580 + }, + { + "epoch": 4.14608834486429, + "grad_norm": 0.3114466071128845, + "learning_rate": 1.4338310755625382e-08, + "loss": 0.1672, + "step": 15581 + }, + { + "epoch": 4.146354443853113, + "grad_norm": 0.2884155809879303, + "learning_rate": 1.432959832199132e-08, + "loss": 0.1579, + "step": 15582 + }, + { + "epoch": 4.146620542841937, + "grad_norm": 0.29355180263519287, + "learning_rate": 1.4320888331851222e-08, + "loss": 0.1864, + "step": 15583 + }, + { + "epoch": 4.146886641830761, + "grad_norm": 0.28426122665405273, + "learning_rate": 1.4312180785453475e-08, + "loss": 0.1643, + "step": 15584 + }, + { + "epoch": 4.1471527408195845, + "grad_norm": 0.27975377440452576, + "learning_rate": 1.4303475683046484e-08, + "loss": 0.1747, + "step": 15585 + }, + { + "epoch": 4.147418839808409, + "grad_norm": 0.35014429688453674, + "learning_rate": 1.4294773024878493e-08, + "loss": 0.1795, + "step": 15586 + }, + { + "epoch": 4.147684938797233, + "grad_norm": 0.2762324810028076, + "learning_rate": 1.4286072811197759e-08, + "loss": 0.1692, + "step": 15587 + }, + { + "epoch": 4.147951037786056, + "grad_norm": 0.31732800602912903, + "learning_rate": 1.427737504225237e-08, + "loss": 0.1892, + "step": 15588 + }, + { + "epoch": 4.14821713677488, + "grad_norm": 0.26375308632850647, + "learning_rate": 1.4268679718290455e-08, + "loss": 0.1771, + "step": 15589 + }, + { + "epoch": 4.148483235763704, + "grad_norm": 0.5316618084907532, + "learning_rate": 1.4259986839560001e-08, + "loss": 0.179, + "step": 15590 + }, + { + "epoch": 4.148749334752528, + "grad_norm": 0.27980729937553406, + "learning_rate": 1.4251296406308987e-08, + "loss": 0.1809, + "step": 15591 + }, + { + "epoch": 4.149015433741352, + "grad_norm": 0.28251445293426514, + "learning_rate": 1.4242608418785219e-08, + "loss": 0.1825, + "step": 15592 + }, + { + "epoch": 4.149281532730176, + "grad_norm": 0.4054845869541168, + "learning_rate": 1.4233922877236548e-08, + "loss": 0.1736, + "step": 15593 + }, + { + "epoch": 4.149547631719, + "grad_norm": 0.37731003761291504, + "learning_rate": 1.4225239781910659e-08, + "loss": 0.181, + "step": 15594 + }, + { + "epoch": 4.149813730707823, + "grad_norm": 0.2811254858970642, + "learning_rate": 1.421655913305524e-08, + "loss": 0.1834, + "step": 15595 + }, + { + "epoch": 4.150079829696647, + "grad_norm": 0.33189284801483154, + "learning_rate": 1.4207880930917871e-08, + "loss": 0.1825, + "step": 15596 + }, + { + "epoch": 4.150345928685471, + "grad_norm": 0.2678059935569763, + "learning_rate": 1.4199205175746076e-08, + "loss": 0.1549, + "step": 15597 + }, + { + "epoch": 4.150612027674295, + "grad_norm": 0.29672566056251526, + "learning_rate": 1.4190531867787325e-08, + "loss": 0.1506, + "step": 15598 + }, + { + "epoch": 4.150878126663119, + "grad_norm": 0.26014965772628784, + "learning_rate": 1.4181861007288965e-08, + "loss": 0.169, + "step": 15599 + }, + { + "epoch": 4.151144225651943, + "grad_norm": 0.33744943141937256, + "learning_rate": 1.4173192594498328e-08, + "loss": 0.1674, + "step": 15600 + }, + { + "epoch": 4.151410324640766, + "grad_norm": 0.35832881927490234, + "learning_rate": 1.4164526629662632e-08, + "loss": 0.1814, + "step": 15601 + }, + { + "epoch": 4.15167642362959, + "grad_norm": 0.2533497214317322, + "learning_rate": 1.4155863113029088e-08, + "loss": 0.1437, + "step": 15602 + }, + { + "epoch": 4.151942522618414, + "grad_norm": 0.28830572962760925, + "learning_rate": 1.414720204484473e-08, + "loss": 0.1786, + "step": 15603 + }, + { + "epoch": 4.152208621607238, + "grad_norm": 0.4128655195236206, + "learning_rate": 1.4138543425356664e-08, + "loss": 0.1822, + "step": 15604 + }, + { + "epoch": 4.152474720596062, + "grad_norm": 0.2700994908809662, + "learning_rate": 1.4129887254811802e-08, + "loss": 0.1759, + "step": 15605 + }, + { + "epoch": 4.152740819584886, + "grad_norm": 0.3185725510120392, + "learning_rate": 1.4121233533457078e-08, + "loss": 0.1674, + "step": 15606 + }, + { + "epoch": 4.153006918573709, + "grad_norm": 0.3409254252910614, + "learning_rate": 1.4112582261539253e-08, + "loss": 0.1797, + "step": 15607 + }, + { + "epoch": 4.153273017562533, + "grad_norm": 0.2813383638858795, + "learning_rate": 1.410393343930515e-08, + "loss": 0.1635, + "step": 15608 + }, + { + "epoch": 4.153539116551357, + "grad_norm": 0.31577643752098083, + "learning_rate": 1.4095287067001382e-08, + "loss": 0.1947, + "step": 15609 + }, + { + "epoch": 4.153805215540181, + "grad_norm": 0.2723314166069031, + "learning_rate": 1.4086643144874599e-08, + "loss": 0.1558, + "step": 15610 + }, + { + "epoch": 4.154071314529005, + "grad_norm": 0.3118276596069336, + "learning_rate": 1.4078001673171336e-08, + "loss": 0.1759, + "step": 15611 + }, + { + "epoch": 4.154337413517829, + "grad_norm": 0.2770412862300873, + "learning_rate": 1.406936265213806e-08, + "loss": 0.1782, + "step": 15612 + }, + { + "epoch": 4.154603512506652, + "grad_norm": 0.46581172943115234, + "learning_rate": 1.4060726082021213e-08, + "loss": 0.1753, + "step": 15613 + }, + { + "epoch": 4.154869611495476, + "grad_norm": 0.4349476993083954, + "learning_rate": 1.4052091963067069e-08, + "loss": 0.1893, + "step": 15614 + }, + { + "epoch": 4.1551357104843, + "grad_norm": 0.2750174105167389, + "learning_rate": 1.4043460295521947e-08, + "loss": 0.1906, + "step": 15615 + }, + { + "epoch": 4.155401809473124, + "grad_norm": 0.2962608337402344, + "learning_rate": 1.4034831079631981e-08, + "loss": 0.1859, + "step": 15616 + }, + { + "epoch": 4.155667908461948, + "grad_norm": 0.2782108187675476, + "learning_rate": 1.402620431564332e-08, + "loss": 0.1719, + "step": 15617 + }, + { + "epoch": 4.155934007450772, + "grad_norm": 0.38262122869491577, + "learning_rate": 1.401758000380202e-08, + "loss": 0.1637, + "step": 15618 + }, + { + "epoch": 4.156200106439596, + "grad_norm": 0.3382413387298584, + "learning_rate": 1.4008958144354088e-08, + "loss": 0.1831, + "step": 15619 + }, + { + "epoch": 4.156466205428419, + "grad_norm": 0.3560945391654968, + "learning_rate": 1.400033873754538e-08, + "loss": 0.1824, + "step": 15620 + }, + { + "epoch": 4.156732304417243, + "grad_norm": 0.3115187883377075, + "learning_rate": 1.3991721783621802e-08, + "loss": 0.1722, + "step": 15621 + }, + { + "epoch": 4.156998403406067, + "grad_norm": 0.29881295561790466, + "learning_rate": 1.3983107282829077e-08, + "loss": 0.1643, + "step": 15622 + }, + { + "epoch": 4.157264502394891, + "grad_norm": 0.2784438133239746, + "learning_rate": 1.3974495235412943e-08, + "loss": 0.1609, + "step": 15623 + }, + { + "epoch": 4.157530601383715, + "grad_norm": 0.2875535488128662, + "learning_rate": 1.3965885641618991e-08, + "loss": 0.1782, + "step": 15624 + }, + { + "epoch": 4.157796700372539, + "grad_norm": 0.24129953980445862, + "learning_rate": 1.3957278501692815e-08, + "loss": 0.1457, + "step": 15625 + }, + { + "epoch": 4.158062799361362, + "grad_norm": 0.3137284517288208, + "learning_rate": 1.3948673815879907e-08, + "loss": 0.1742, + "step": 15626 + }, + { + "epoch": 4.158328898350186, + "grad_norm": 0.35709336400032043, + "learning_rate": 1.3940071584425705e-08, + "loss": 0.1763, + "step": 15627 + }, + { + "epoch": 4.15859499733901, + "grad_norm": 0.31827685236930847, + "learning_rate": 1.3931471807575523e-08, + "loss": 0.181, + "step": 15628 + }, + { + "epoch": 4.158861096327834, + "grad_norm": 0.28451940417289734, + "learning_rate": 1.3922874485574688e-08, + "loss": 0.181, + "step": 15629 + }, + { + "epoch": 4.159127195316658, + "grad_norm": 0.3089245557785034, + "learning_rate": 1.3914279618668356e-08, + "loss": 0.1712, + "step": 15630 + }, + { + "epoch": 4.159393294305482, + "grad_norm": 0.42134228348731995, + "learning_rate": 1.3905687207101725e-08, + "loss": 0.205, + "step": 15631 + }, + { + "epoch": 4.159659393294305, + "grad_norm": 0.3782343566417694, + "learning_rate": 1.3897097251119826e-08, + "loss": 0.1904, + "step": 15632 + }, + { + "epoch": 4.159925492283129, + "grad_norm": 0.37913304567337036, + "learning_rate": 1.3888509750967692e-08, + "loss": 0.1855, + "step": 15633 + }, + { + "epoch": 4.160191591271953, + "grad_norm": 0.35500505566596985, + "learning_rate": 1.3879924706890267e-08, + "loss": 0.1731, + "step": 15634 + }, + { + "epoch": 4.160457690260777, + "grad_norm": 0.26008763909339905, + "learning_rate": 1.3871342119132379e-08, + "loss": 0.1548, + "step": 15635 + }, + { + "epoch": 4.160723789249601, + "grad_norm": 0.3072495460510254, + "learning_rate": 1.3862761987938853e-08, + "loss": 0.192, + "step": 15636 + }, + { + "epoch": 4.160989888238425, + "grad_norm": 0.2691281735897064, + "learning_rate": 1.3854184313554362e-08, + "loss": 0.1634, + "step": 15637 + }, + { + "epoch": 4.161255987227248, + "grad_norm": 0.2948296368122101, + "learning_rate": 1.3845609096223631e-08, + "loss": 0.1752, + "step": 15638 + }, + { + "epoch": 4.161522086216072, + "grad_norm": 0.28499653935432434, + "learning_rate": 1.3837036336191165e-08, + "loss": 0.1792, + "step": 15639 + }, + { + "epoch": 4.161788185204896, + "grad_norm": 0.2815474271774292, + "learning_rate": 1.3828466033701525e-08, + "loss": 0.1645, + "step": 15640 + }, + { + "epoch": 4.1620542841937205, + "grad_norm": 0.3809460997581482, + "learning_rate": 1.3819898188999134e-08, + "loss": 0.1659, + "step": 15641 + }, + { + "epoch": 4.162320383182544, + "grad_norm": 0.32384443283081055, + "learning_rate": 1.3811332802328402e-08, + "loss": 0.1885, + "step": 15642 + }, + { + "epoch": 4.162586482171368, + "grad_norm": 0.3537523150444031, + "learning_rate": 1.3802769873933584e-08, + "loss": 0.1723, + "step": 15643 + }, + { + "epoch": 4.162852581160192, + "grad_norm": 0.4469645321369171, + "learning_rate": 1.3794209404058944e-08, + "loss": 0.1799, + "step": 15644 + }, + { + "epoch": 4.163118680149015, + "grad_norm": 0.26153436303138733, + "learning_rate": 1.3785651392948604e-08, + "loss": 0.1476, + "step": 15645 + }, + { + "epoch": 4.163384779137839, + "grad_norm": 0.34352031350135803, + "learning_rate": 1.3777095840846697e-08, + "loss": 0.1779, + "step": 15646 + }, + { + "epoch": 4.1636508781266635, + "grad_norm": 0.34730231761932373, + "learning_rate": 1.3768542747997213e-08, + "loss": 0.1728, + "step": 15647 + }, + { + "epoch": 4.163916977115487, + "grad_norm": 0.2790161669254303, + "learning_rate": 1.3759992114644159e-08, + "loss": 0.1704, + "step": 15648 + }, + { + "epoch": 4.164183076104311, + "grad_norm": 0.3658545911312103, + "learning_rate": 1.3751443941031349e-08, + "loss": 0.1949, + "step": 15649 + }, + { + "epoch": 4.164449175093135, + "grad_norm": 0.5064297914505005, + "learning_rate": 1.3742898227402621e-08, + "loss": 0.1753, + "step": 15650 + }, + { + "epoch": 4.164715274081958, + "grad_norm": 0.2458675652742386, + "learning_rate": 1.3734354974001738e-08, + "loss": 0.1664, + "step": 15651 + }, + { + "epoch": 4.164981373070782, + "grad_norm": 0.29524949193000793, + "learning_rate": 1.3725814181072338e-08, + "loss": 0.1805, + "step": 15652 + }, + { + "epoch": 4.165247472059606, + "grad_norm": 0.3084155321121216, + "learning_rate": 1.3717275848858023e-08, + "loss": 0.178, + "step": 15653 + }, + { + "epoch": 4.16551357104843, + "grad_norm": 0.2907354235649109, + "learning_rate": 1.3708739977602347e-08, + "loss": 0.1949, + "step": 15654 + }, + { + "epoch": 4.165779670037254, + "grad_norm": 0.39843764901161194, + "learning_rate": 1.3700206567548778e-08, + "loss": 0.1856, + "step": 15655 + }, + { + "epoch": 4.166045769026078, + "grad_norm": 0.2808525264263153, + "learning_rate": 1.3691675618940668e-08, + "loss": 0.1609, + "step": 15656 + }, + { + "epoch": 4.166311868014901, + "grad_norm": 0.3563593626022339, + "learning_rate": 1.3683147132021367e-08, + "loss": 0.154, + "step": 15657 + }, + { + "epoch": 4.166577967003725, + "grad_norm": 0.27494189143180847, + "learning_rate": 1.36746211070341e-08, + "loss": 0.1775, + "step": 15658 + }, + { + "epoch": 4.166844065992549, + "grad_norm": 0.31071147322654724, + "learning_rate": 1.3666097544222089e-08, + "loss": 0.1641, + "step": 15659 + }, + { + "epoch": 4.167110164981373, + "grad_norm": 0.32599350810050964, + "learning_rate": 1.3657576443828378e-08, + "loss": 0.1547, + "step": 15660 + }, + { + "epoch": 4.167376263970197, + "grad_norm": 0.36743149161338806, + "learning_rate": 1.3649057806096054e-08, + "loss": 0.1934, + "step": 15661 + }, + { + "epoch": 4.167642362959021, + "grad_norm": 0.5437930822372437, + "learning_rate": 1.3640541631268076e-08, + "loss": 0.1752, + "step": 15662 + }, + { + "epoch": 4.167908461947844, + "grad_norm": 0.3067018985748291, + "learning_rate": 1.3632027919587352e-08, + "loss": 0.1768, + "step": 15663 + }, + { + "epoch": 4.168174560936668, + "grad_norm": 0.3156796395778656, + "learning_rate": 1.3623516671296687e-08, + "loss": 0.1589, + "step": 15664 + }, + { + "epoch": 4.168440659925492, + "grad_norm": 0.2754879295825958, + "learning_rate": 1.3615007886638873e-08, + "loss": 0.1606, + "step": 15665 + }, + { + "epoch": 4.1687067589143165, + "grad_norm": 0.3752937614917755, + "learning_rate": 1.3606501565856554e-08, + "loss": 0.1685, + "step": 15666 + }, + { + "epoch": 4.16897285790314, + "grad_norm": 0.28840166330337524, + "learning_rate": 1.3597997709192377e-08, + "loss": 0.1712, + "step": 15667 + }, + { + "epoch": 4.169238956891964, + "grad_norm": 0.2772069275379181, + "learning_rate": 1.358949631688887e-08, + "loss": 0.172, + "step": 15668 + }, + { + "epoch": 4.169505055880788, + "grad_norm": 0.34572306275367737, + "learning_rate": 1.3580997389188531e-08, + "loss": 0.1741, + "step": 15669 + }, + { + "epoch": 4.169771154869611, + "grad_norm": 0.3051536977291107, + "learning_rate": 1.3572500926333774e-08, + "loss": 0.1767, + "step": 15670 + }, + { + "epoch": 4.170037253858435, + "grad_norm": 0.2612815797328949, + "learning_rate": 1.3564006928566906e-08, + "loss": 0.1573, + "step": 15671 + }, + { + "epoch": 4.1703033528472595, + "grad_norm": 0.2793270945549011, + "learning_rate": 1.3555515396130224e-08, + "loss": 0.1692, + "step": 15672 + }, + { + "epoch": 4.170569451836083, + "grad_norm": 0.3048200011253357, + "learning_rate": 1.3547026329265887e-08, + "loss": 0.1752, + "step": 15673 + }, + { + "epoch": 4.170835550824907, + "grad_norm": 0.3453269600868225, + "learning_rate": 1.3538539728216059e-08, + "loss": 0.1784, + "step": 15674 + }, + { + "epoch": 4.171101649813731, + "grad_norm": 0.34014782309532166, + "learning_rate": 1.3530055593222756e-08, + "loss": 0.1956, + "step": 15675 + }, + { + "epoch": 4.171367748802554, + "grad_norm": 0.3311043083667755, + "learning_rate": 1.3521573924527973e-08, + "loss": 0.1741, + "step": 15676 + }, + { + "epoch": 4.171633847791378, + "grad_norm": 0.3567332625389099, + "learning_rate": 1.3513094722373641e-08, + "loss": 0.1653, + "step": 15677 + }, + { + "epoch": 4.1718999467802025, + "grad_norm": 0.29669153690338135, + "learning_rate": 1.3504617987001632e-08, + "loss": 0.1799, + "step": 15678 + }, + { + "epoch": 4.172166045769026, + "grad_norm": 0.3608645498752594, + "learning_rate": 1.3496143718653652e-08, + "loss": 0.1895, + "step": 15679 + }, + { + "epoch": 4.17243214475785, + "grad_norm": 0.41824033856391907, + "learning_rate": 1.3487671917571453e-08, + "loss": 0.1769, + "step": 15680 + }, + { + "epoch": 4.172698243746674, + "grad_norm": 0.2746535539627075, + "learning_rate": 1.347920258399664e-08, + "loss": 0.1745, + "step": 15681 + }, + { + "epoch": 4.172964342735497, + "grad_norm": 0.2906392514705658, + "learning_rate": 1.3470735718170778e-08, + "loss": 0.1697, + "step": 15682 + }, + { + "epoch": 4.173230441724321, + "grad_norm": 0.29965969920158386, + "learning_rate": 1.3462271320335384e-08, + "loss": 0.1818, + "step": 15683 + }, + { + "epoch": 4.1734965407131455, + "grad_norm": 0.25795847177505493, + "learning_rate": 1.3453809390731874e-08, + "loss": 0.1792, + "step": 15684 + }, + { + "epoch": 4.17376263970197, + "grad_norm": 0.27185407280921936, + "learning_rate": 1.344534992960158e-08, + "loss": 0.1723, + "step": 15685 + }, + { + "epoch": 4.174028738690793, + "grad_norm": 0.296492338180542, + "learning_rate": 1.3436892937185784e-08, + "loss": 0.1668, + "step": 15686 + }, + { + "epoch": 4.174294837679617, + "grad_norm": 0.2904232442378998, + "learning_rate": 1.3428438413725729e-08, + "loss": 0.1832, + "step": 15687 + }, + { + "epoch": 4.174560936668441, + "grad_norm": 0.49011319875717163, + "learning_rate": 1.341998635946251e-08, + "loss": 0.1967, + "step": 15688 + }, + { + "epoch": 4.174827035657264, + "grad_norm": 0.29084280133247375, + "learning_rate": 1.3411536774637244e-08, + "loss": 0.1804, + "step": 15689 + }, + { + "epoch": 4.1750931346460884, + "grad_norm": 0.3605770766735077, + "learning_rate": 1.3403089659490886e-08, + "loss": 0.1789, + "step": 15690 + }, + { + "epoch": 4.175359233634913, + "grad_norm": 0.2731247544288635, + "learning_rate": 1.3394645014264383e-08, + "loss": 0.1677, + "step": 15691 + }, + { + "epoch": 4.175625332623736, + "grad_norm": 0.3818322420120239, + "learning_rate": 1.338620283919859e-08, + "loss": 0.1605, + "step": 15692 + }, + { + "epoch": 4.17589143161256, + "grad_norm": 0.2859536409378052, + "learning_rate": 1.3377763134534325e-08, + "loss": 0.1827, + "step": 15693 + }, + { + "epoch": 4.176157530601384, + "grad_norm": 1.3343071937561035, + "learning_rate": 1.3369325900512252e-08, + "loss": 0.1971, + "step": 15694 + }, + { + "epoch": 4.176423629590207, + "grad_norm": 0.28886398673057556, + "learning_rate": 1.3360891137373075e-08, + "loss": 0.1546, + "step": 15695 + }, + { + "epoch": 4.176689728579031, + "grad_norm": 0.41723716259002686, + "learning_rate": 1.3352458845357317e-08, + "loss": 0.201, + "step": 15696 + }, + { + "epoch": 4.1769558275678556, + "grad_norm": 0.3120678961277008, + "learning_rate": 1.3344029024705517e-08, + "loss": 0.1677, + "step": 15697 + }, + { + "epoch": 4.177221926556679, + "grad_norm": 0.3342714011669159, + "learning_rate": 1.3335601675658093e-08, + "loss": 0.1787, + "step": 15698 + }, + { + "epoch": 4.177488025545503, + "grad_norm": 0.39594438672065735, + "learning_rate": 1.3327176798455453e-08, + "loss": 0.1876, + "step": 15699 + }, + { + "epoch": 4.177754124534327, + "grad_norm": 0.2971104681491852, + "learning_rate": 1.3318754393337827e-08, + "loss": 0.1651, + "step": 15700 + }, + { + "epoch": 4.17802022352315, + "grad_norm": 0.27923688292503357, + "learning_rate": 1.3310334460545492e-08, + "loss": 0.1943, + "step": 15701 + }, + { + "epoch": 4.178286322511974, + "grad_norm": 0.2806740403175354, + "learning_rate": 1.3301917000318553e-08, + "loss": 0.181, + "step": 15702 + }, + { + "epoch": 4.1785524215007985, + "grad_norm": 0.26951274275779724, + "learning_rate": 1.3293502012897129e-08, + "loss": 0.1684, + "step": 15703 + }, + { + "epoch": 4.178818520489622, + "grad_norm": 0.2706802487373352, + "learning_rate": 1.3285089498521218e-08, + "loss": 0.1633, + "step": 15704 + }, + { + "epoch": 4.179084619478446, + "grad_norm": 0.2637554407119751, + "learning_rate": 1.3276679457430773e-08, + "loss": 0.171, + "step": 15705 + }, + { + "epoch": 4.17935071846727, + "grad_norm": 0.4181736409664154, + "learning_rate": 1.3268271889865678e-08, + "loss": 0.1937, + "step": 15706 + }, + { + "epoch": 4.179616817456093, + "grad_norm": 0.3054555058479309, + "learning_rate": 1.3259866796065699e-08, + "loss": 0.1617, + "step": 15707 + }, + { + "epoch": 4.179882916444917, + "grad_norm": 0.3978502154350281, + "learning_rate": 1.325146417627061e-08, + "loss": 0.1816, + "step": 15708 + }, + { + "epoch": 4.1801490154337415, + "grad_norm": 0.299142062664032, + "learning_rate": 1.3243064030720008e-08, + "loss": 0.1826, + "step": 15709 + }, + { + "epoch": 4.180415114422566, + "grad_norm": 0.4763425886631012, + "learning_rate": 1.3234666359653546e-08, + "loss": 0.2145, + "step": 15710 + }, + { + "epoch": 4.180681213411389, + "grad_norm": 0.3655843138694763, + "learning_rate": 1.3226271163310698e-08, + "loss": 0.181, + "step": 15711 + }, + { + "epoch": 4.180947312400213, + "grad_norm": 0.6994251012802124, + "learning_rate": 1.3217878441930919e-08, + "loss": 0.1833, + "step": 15712 + }, + { + "epoch": 4.181213411389037, + "grad_norm": 0.32475316524505615, + "learning_rate": 1.3209488195753605e-08, + "loss": 0.1842, + "step": 15713 + }, + { + "epoch": 4.18147951037786, + "grad_norm": 0.27735382318496704, + "learning_rate": 1.3201100425018075e-08, + "loss": 0.1672, + "step": 15714 + }, + { + "epoch": 4.1817456093666845, + "grad_norm": 0.2641296088695526, + "learning_rate": 1.3192715129963527e-08, + "loss": 0.1692, + "step": 15715 + }, + { + "epoch": 4.182011708355509, + "grad_norm": 0.26840829849243164, + "learning_rate": 1.318433231082916e-08, + "loss": 0.1724, + "step": 15716 + }, + { + "epoch": 4.182277807344332, + "grad_norm": 0.27652508020401, + "learning_rate": 1.3175951967854027e-08, + "loss": 0.1947, + "step": 15717 + }, + { + "epoch": 4.182543906333156, + "grad_norm": 0.4113771319389343, + "learning_rate": 1.316757410127719e-08, + "loss": 0.1968, + "step": 15718 + }, + { + "epoch": 4.18281000532198, + "grad_norm": 0.2850145995616913, + "learning_rate": 1.3159198711337582e-08, + "loss": 0.1771, + "step": 15719 + }, + { + "epoch": 4.183076104310803, + "grad_norm": 0.27386823296546936, + "learning_rate": 1.3150825798274123e-08, + "loss": 0.1569, + "step": 15720 + }, + { + "epoch": 4.1833422032996275, + "grad_norm": 0.28025659918785095, + "learning_rate": 1.3142455362325577e-08, + "loss": 0.1707, + "step": 15721 + }, + { + "epoch": 4.183608302288452, + "grad_norm": 0.2956272065639496, + "learning_rate": 1.3134087403730699e-08, + "loss": 0.1655, + "step": 15722 + }, + { + "epoch": 4.183874401277275, + "grad_norm": 0.29376518726348877, + "learning_rate": 1.3125721922728194e-08, + "loss": 0.1763, + "step": 15723 + }, + { + "epoch": 4.184140500266099, + "grad_norm": 0.36335769295692444, + "learning_rate": 1.311735891955662e-08, + "loss": 0.1839, + "step": 15724 + }, + { + "epoch": 4.184406599254923, + "grad_norm": 0.2833505868911743, + "learning_rate": 1.310899839445454e-08, + "loss": 0.1682, + "step": 15725 + }, + { + "epoch": 4.184672698243746, + "grad_norm": 0.32943329215049744, + "learning_rate": 1.3100640347660385e-08, + "loss": 0.1744, + "step": 15726 + }, + { + "epoch": 4.1849387972325705, + "grad_norm": 0.3458375036716461, + "learning_rate": 1.3092284779412544e-08, + "loss": 0.1642, + "step": 15727 + }, + { + "epoch": 4.185204896221395, + "grad_norm": 0.28906816244125366, + "learning_rate": 1.3083931689949345e-08, + "loss": 0.1759, + "step": 15728 + }, + { + "epoch": 4.185470995210218, + "grad_norm": 0.2579818665981293, + "learning_rate": 1.3075581079509068e-08, + "loss": 0.1649, + "step": 15729 + }, + { + "epoch": 4.185737094199042, + "grad_norm": 0.28228914737701416, + "learning_rate": 1.306723294832983e-08, + "loss": 0.165, + "step": 15730 + }, + { + "epoch": 4.186003193187866, + "grad_norm": 0.3291736841201782, + "learning_rate": 1.3058887296649778e-08, + "loss": 0.1933, + "step": 15731 + }, + { + "epoch": 4.186269292176689, + "grad_norm": 0.30742698907852173, + "learning_rate": 1.305054412470692e-08, + "loss": 0.1681, + "step": 15732 + }, + { + "epoch": 4.186535391165513, + "grad_norm": 0.3102574348449707, + "learning_rate": 1.304220343273924e-08, + "loss": 0.1642, + "step": 15733 + }, + { + "epoch": 4.186801490154338, + "grad_norm": 0.3732796311378479, + "learning_rate": 1.3033865220984619e-08, + "loss": 0.1699, + "step": 15734 + }, + { + "epoch": 4.187067589143162, + "grad_norm": 0.42372334003448486, + "learning_rate": 1.3025529489680898e-08, + "loss": 0.1683, + "step": 15735 + }, + { + "epoch": 4.187333688131985, + "grad_norm": 0.34003549814224243, + "learning_rate": 1.3017196239065808e-08, + "loss": 0.1592, + "step": 15736 + }, + { + "epoch": 4.187599787120809, + "grad_norm": 0.3345491290092468, + "learning_rate": 1.300886546937705e-08, + "loss": 0.1913, + "step": 15737 + }, + { + "epoch": 4.187865886109633, + "grad_norm": 0.2568283975124359, + "learning_rate": 1.3000537180852211e-08, + "loss": 0.1658, + "step": 15738 + }, + { + "epoch": 4.188131985098456, + "grad_norm": 0.2945881485939026, + "learning_rate": 1.2992211373728856e-08, + "loss": 0.17, + "step": 15739 + }, + { + "epoch": 4.1883980840872805, + "grad_norm": 0.281644344329834, + "learning_rate": 1.2983888048244429e-08, + "loss": 0.1871, + "step": 15740 + }, + { + "epoch": 4.188664183076105, + "grad_norm": 0.4825938642024994, + "learning_rate": 1.2975567204636307e-08, + "loss": 0.1864, + "step": 15741 + }, + { + "epoch": 4.188930282064928, + "grad_norm": 0.38126835227012634, + "learning_rate": 1.2967248843141899e-08, + "loss": 0.185, + "step": 15742 + }, + { + "epoch": 4.189196381053752, + "grad_norm": 0.2926611006259918, + "learning_rate": 1.2958932963998392e-08, + "loss": 0.1727, + "step": 15743 + }, + { + "epoch": 4.189462480042576, + "grad_norm": 0.2959270477294922, + "learning_rate": 1.295061956744301e-08, + "loss": 0.1732, + "step": 15744 + }, + { + "epoch": 4.189728579031399, + "grad_norm": 0.329045832157135, + "learning_rate": 1.2942308653712841e-08, + "loss": 0.1923, + "step": 15745 + }, + { + "epoch": 4.1899946780202235, + "grad_norm": 0.27460238337516785, + "learning_rate": 1.293400022304495e-08, + "loss": 0.1768, + "step": 15746 + }, + { + "epoch": 4.190260777009048, + "grad_norm": 0.26869940757751465, + "learning_rate": 1.292569427567628e-08, + "loss": 0.1751, + "step": 15747 + }, + { + "epoch": 4.190526875997871, + "grad_norm": 0.28728005290031433, + "learning_rate": 1.2917390811843754e-08, + "loss": 0.1592, + "step": 15748 + }, + { + "epoch": 4.190792974986695, + "grad_norm": 0.5437055230140686, + "learning_rate": 1.2909089831784204e-08, + "loss": 0.1963, + "step": 15749 + }, + { + "epoch": 4.191059073975519, + "grad_norm": 0.2701438069343567, + "learning_rate": 1.2900791335734406e-08, + "loss": 0.1856, + "step": 15750 + }, + { + "epoch": 4.191325172964342, + "grad_norm": 0.27943772077560425, + "learning_rate": 1.2892495323931018e-08, + "loss": 0.1674, + "step": 15751 + }, + { + "epoch": 4.1915912719531665, + "grad_norm": 0.32944348454475403, + "learning_rate": 1.2884201796610695e-08, + "loss": 0.1675, + "step": 15752 + }, + { + "epoch": 4.191857370941991, + "grad_norm": 0.3302861452102661, + "learning_rate": 1.2875910754009933e-08, + "loss": 0.2097, + "step": 15753 + }, + { + "epoch": 4.192123469930814, + "grad_norm": 0.3428686261177063, + "learning_rate": 1.286762219636527e-08, + "loss": 0.178, + "step": 15754 + }, + { + "epoch": 4.192389568919638, + "grad_norm": 0.2862301468849182, + "learning_rate": 1.2859336123913035e-08, + "loss": 0.1726, + "step": 15755 + }, + { + "epoch": 4.192655667908462, + "grad_norm": 0.2874833345413208, + "learning_rate": 1.2851052536889662e-08, + "loss": 0.1914, + "step": 15756 + }, + { + "epoch": 4.192921766897285, + "grad_norm": 0.4030388593673706, + "learning_rate": 1.2842771435531341e-08, + "loss": 0.181, + "step": 15757 + }, + { + "epoch": 4.1931878658861095, + "grad_norm": 0.32498979568481445, + "learning_rate": 1.2834492820074295e-08, + "loss": 0.18, + "step": 15758 + }, + { + "epoch": 4.193453964874934, + "grad_norm": 0.28024598956108093, + "learning_rate": 1.2826216690754665e-08, + "loss": 0.1598, + "step": 15759 + }, + { + "epoch": 4.193720063863758, + "grad_norm": 0.3580869436264038, + "learning_rate": 1.2817943047808455e-08, + "loss": 0.1943, + "step": 15760 + }, + { + "epoch": 4.193986162852581, + "grad_norm": 0.4455997049808502, + "learning_rate": 1.2809671891471685e-08, + "loss": 0.1776, + "step": 15761 + }, + { + "epoch": 4.194252261841405, + "grad_norm": 0.33603471517562866, + "learning_rate": 1.2801403221980245e-08, + "loss": 0.1851, + "step": 15762 + }, + { + "epoch": 4.194518360830229, + "grad_norm": 0.39492136240005493, + "learning_rate": 1.2793137039569968e-08, + "loss": 0.1824, + "step": 15763 + }, + { + "epoch": 4.1947844598190525, + "grad_norm": 0.39047548174858093, + "learning_rate": 1.2784873344476644e-08, + "loss": 0.1845, + "step": 15764 + }, + { + "epoch": 4.195050558807877, + "grad_norm": 0.30715519189834595, + "learning_rate": 1.2776612136935983e-08, + "loss": 0.1787, + "step": 15765 + }, + { + "epoch": 4.195316657796701, + "grad_norm": 0.2595587968826294, + "learning_rate": 1.2768353417183565e-08, + "loss": 0.1557, + "step": 15766 + }, + { + "epoch": 4.195582756785524, + "grad_norm": 0.2794157862663269, + "learning_rate": 1.2760097185455e-08, + "loss": 0.1715, + "step": 15767 + }, + { + "epoch": 4.195848855774348, + "grad_norm": 0.34032413363456726, + "learning_rate": 1.275184344198571e-08, + "loss": 0.1661, + "step": 15768 + }, + { + "epoch": 4.196114954763172, + "grad_norm": 0.2873696982860565, + "learning_rate": 1.2743592187011142e-08, + "loss": 0.1663, + "step": 15769 + }, + { + "epoch": 4.196381053751995, + "grad_norm": 0.3016456961631775, + "learning_rate": 1.2735343420766654e-08, + "loss": 0.1802, + "step": 15770 + }, + { + "epoch": 4.19664715274082, + "grad_norm": 0.28873467445373535, + "learning_rate": 1.2727097143487509e-08, + "loss": 0.1798, + "step": 15771 + }, + { + "epoch": 4.196913251729644, + "grad_norm": 0.46389642357826233, + "learning_rate": 1.2718853355408888e-08, + "loss": 0.1854, + "step": 15772 + }, + { + "epoch": 4.197179350718467, + "grad_norm": 0.2704433500766754, + "learning_rate": 1.2710612056765945e-08, + "loss": 0.1739, + "step": 15773 + }, + { + "epoch": 4.197445449707291, + "grad_norm": 0.28315067291259766, + "learning_rate": 1.2702373247793707e-08, + "loss": 0.1676, + "step": 15774 + }, + { + "epoch": 4.197711548696115, + "grad_norm": 0.2960595190525055, + "learning_rate": 1.2694136928727207e-08, + "loss": 0.1779, + "step": 15775 + }, + { + "epoch": 4.197977647684938, + "grad_norm": 0.2869736850261688, + "learning_rate": 1.268590309980132e-08, + "loss": 0.1843, + "step": 15776 + }, + { + "epoch": 4.1982437466737625, + "grad_norm": 0.27417677640914917, + "learning_rate": 1.2677671761250897e-08, + "loss": 0.1684, + "step": 15777 + }, + { + "epoch": 4.198509845662587, + "grad_norm": 0.28981178998947144, + "learning_rate": 1.2669442913310724e-08, + "loss": 0.1806, + "step": 15778 + }, + { + "epoch": 4.19877594465141, + "grad_norm": 0.3651476800441742, + "learning_rate": 1.2661216556215503e-08, + "loss": 0.1874, + "step": 15779 + }, + { + "epoch": 4.199042043640234, + "grad_norm": 0.26647719740867615, + "learning_rate": 1.2652992690199894e-08, + "loss": 0.1639, + "step": 15780 + }, + { + "epoch": 4.199308142629058, + "grad_norm": 0.3418070375919342, + "learning_rate": 1.2644771315498403e-08, + "loss": 0.1845, + "step": 15781 + }, + { + "epoch": 4.199574241617881, + "grad_norm": 0.333389550447464, + "learning_rate": 1.2636552432345582e-08, + "loss": 0.1594, + "step": 15782 + }, + { + "epoch": 4.1998403406067055, + "grad_norm": 0.3539975583553314, + "learning_rate": 1.2628336040975785e-08, + "loss": 0.1618, + "step": 15783 + }, + { + "epoch": 4.20010643959553, + "grad_norm": 0.3033011257648468, + "learning_rate": 1.262012214162339e-08, + "loss": 0.186, + "step": 15784 + }, + { + "epoch": 4.200372538584354, + "grad_norm": 0.37211117148399353, + "learning_rate": 1.261191073452268e-08, + "loss": 0.1757, + "step": 15785 + }, + { + "epoch": 4.200638637573177, + "grad_norm": 0.3027718961238861, + "learning_rate": 1.2603701819907874e-08, + "loss": 0.1808, + "step": 15786 + }, + { + "epoch": 4.200904736562001, + "grad_norm": 0.2629108428955078, + "learning_rate": 1.2595495398013078e-08, + "loss": 0.1663, + "step": 15787 + }, + { + "epoch": 4.201170835550825, + "grad_norm": 0.3558528423309326, + "learning_rate": 1.2587291469072381e-08, + "loss": 0.1892, + "step": 15788 + }, + { + "epoch": 4.2014369345396485, + "grad_norm": 0.26386669278144836, + "learning_rate": 1.2579090033319739e-08, + "loss": 0.1573, + "step": 15789 + }, + { + "epoch": 4.201703033528473, + "grad_norm": 0.4312100112438202, + "learning_rate": 1.2570891090989122e-08, + "loss": 0.2082, + "step": 15790 + }, + { + "epoch": 4.201969132517297, + "grad_norm": 0.30263063311576843, + "learning_rate": 1.2562694642314342e-08, + "loss": 0.1826, + "step": 15791 + }, + { + "epoch": 4.20223523150612, + "grad_norm": 0.2618972957134247, + "learning_rate": 1.2554500687529179e-08, + "loss": 0.1673, + "step": 15792 + }, + { + "epoch": 4.202501330494944, + "grad_norm": 0.2701118588447571, + "learning_rate": 1.2546309226867357e-08, + "loss": 0.1715, + "step": 15793 + }, + { + "epoch": 4.202767429483768, + "grad_norm": 0.28272438049316406, + "learning_rate": 1.2538120260562546e-08, + "loss": 0.169, + "step": 15794 + }, + { + "epoch": 4.2030335284725915, + "grad_norm": 0.25630924105644226, + "learning_rate": 1.2529933788848235e-08, + "loss": 0.1777, + "step": 15795 + }, + { + "epoch": 4.203299627461416, + "grad_norm": 0.30295252799987793, + "learning_rate": 1.2521749811957972e-08, + "loss": 0.1987, + "step": 15796 + }, + { + "epoch": 4.20356572645024, + "grad_norm": 0.30181294679641724, + "learning_rate": 1.251356833012519e-08, + "loss": 0.1872, + "step": 15797 + }, + { + "epoch": 4.203831825439063, + "grad_norm": 0.3766702711582184, + "learning_rate": 1.2505389343583195e-08, + "loss": 0.1888, + "step": 15798 + }, + { + "epoch": 4.204097924427887, + "grad_norm": 0.2714931070804596, + "learning_rate": 1.2497212852565286e-08, + "loss": 0.1577, + "step": 15799 + }, + { + "epoch": 4.204364023416711, + "grad_norm": 0.46561962366104126, + "learning_rate": 1.248903885730469e-08, + "loss": 0.1967, + "step": 15800 + }, + { + "epoch": 4.2046301224055345, + "grad_norm": 0.32239830493927, + "learning_rate": 1.2480867358034552e-08, + "loss": 0.167, + "step": 15801 + }, + { + "epoch": 4.204896221394359, + "grad_norm": 0.37421178817749023, + "learning_rate": 1.247269835498791e-08, + "loss": 0.1752, + "step": 15802 + }, + { + "epoch": 4.205162320383183, + "grad_norm": 0.3721961975097656, + "learning_rate": 1.2464531848397797e-08, + "loss": 0.1795, + "step": 15803 + }, + { + "epoch": 4.205428419372007, + "grad_norm": 0.24877752363681793, + "learning_rate": 1.2456367838497096e-08, + "loss": 0.16, + "step": 15804 + }, + { + "epoch": 4.20569451836083, + "grad_norm": 0.2769644260406494, + "learning_rate": 1.2448206325518695e-08, + "loss": 0.1923, + "step": 15805 + }, + { + "epoch": 4.205960617349654, + "grad_norm": 0.3522733151912689, + "learning_rate": 1.2440047309695322e-08, + "loss": 0.1885, + "step": 15806 + }, + { + "epoch": 4.206226716338478, + "grad_norm": 0.28630849719047546, + "learning_rate": 1.243189079125977e-08, + "loss": 0.1664, + "step": 15807 + }, + { + "epoch": 4.206492815327302, + "grad_norm": 0.30094146728515625, + "learning_rate": 1.2423736770444626e-08, + "loss": 0.1983, + "step": 15808 + }, + { + "epoch": 4.206758914316126, + "grad_norm": 0.3465160131454468, + "learning_rate": 1.2415585247482497e-08, + "loss": 0.1922, + "step": 15809 + }, + { + "epoch": 4.20702501330495, + "grad_norm": 0.2696979343891144, + "learning_rate": 1.2407436222605827e-08, + "loss": 0.1808, + "step": 15810 + }, + { + "epoch": 4.207291112293773, + "grad_norm": 0.29213735461235046, + "learning_rate": 1.2399289696047088e-08, + "loss": 0.1853, + "step": 15811 + }, + { + "epoch": 4.207557211282597, + "grad_norm": 0.29658183455467224, + "learning_rate": 1.2391145668038616e-08, + "loss": 0.1621, + "step": 15812 + }, + { + "epoch": 4.207823310271421, + "grad_norm": 0.27006232738494873, + "learning_rate": 1.2383004138812681e-08, + "loss": 0.1628, + "step": 15813 + }, + { + "epoch": 4.2080894092602446, + "grad_norm": 0.2877451479434967, + "learning_rate": 1.2374865108601529e-08, + "loss": 0.1637, + "step": 15814 + }, + { + "epoch": 4.208355508249069, + "grad_norm": 0.27678102254867554, + "learning_rate": 1.2366728577637275e-08, + "loss": 0.1752, + "step": 15815 + }, + { + "epoch": 4.208621607237893, + "grad_norm": 0.2859494388103485, + "learning_rate": 1.2358594546152034e-08, + "loss": 0.1698, + "step": 15816 + }, + { + "epoch": 4.208887706226716, + "grad_norm": 0.3913668990135193, + "learning_rate": 1.2350463014377742e-08, + "loss": 0.1855, + "step": 15817 + }, + { + "epoch": 4.20915380521554, + "grad_norm": 0.28664764761924744, + "learning_rate": 1.2342333982546383e-08, + "loss": 0.172, + "step": 15818 + }, + { + "epoch": 4.209419904204364, + "grad_norm": 0.47596484422683716, + "learning_rate": 1.2334207450889766e-08, + "loss": 0.1919, + "step": 15819 + }, + { + "epoch": 4.2096860031931875, + "grad_norm": 0.2935014069080353, + "learning_rate": 1.2326083419639699e-08, + "loss": 0.1702, + "step": 15820 + }, + { + "epoch": 4.209952102182012, + "grad_norm": 0.43149444460868835, + "learning_rate": 1.2317961889027895e-08, + "loss": 0.1859, + "step": 15821 + }, + { + "epoch": 4.210218201170836, + "grad_norm": 0.285744309425354, + "learning_rate": 1.2309842859286013e-08, + "loss": 0.1711, + "step": 15822 + }, + { + "epoch": 4.210484300159659, + "grad_norm": 0.4301532506942749, + "learning_rate": 1.230172633064559e-08, + "loss": 0.1826, + "step": 15823 + }, + { + "epoch": 4.210750399148483, + "grad_norm": 0.38461655378341675, + "learning_rate": 1.2293612303338163e-08, + "loss": 0.1724, + "step": 15824 + }, + { + "epoch": 4.211016498137307, + "grad_norm": 0.3341929316520691, + "learning_rate": 1.2285500777595126e-08, + "loss": 0.1877, + "step": 15825 + }, + { + "epoch": 4.2112825971261305, + "grad_norm": 0.3401263356208801, + "learning_rate": 1.227739175364787e-08, + "loss": 0.1888, + "step": 15826 + }, + { + "epoch": 4.211548696114955, + "grad_norm": 0.31718918681144714, + "learning_rate": 1.2269285231727633e-08, + "loss": 0.178, + "step": 15827 + }, + { + "epoch": 4.211814795103779, + "grad_norm": 0.2688066363334656, + "learning_rate": 1.2261181212065663e-08, + "loss": 0.1725, + "step": 15828 + }, + { + "epoch": 4.212080894092603, + "grad_norm": 0.24406982958316803, + "learning_rate": 1.2253079694893099e-08, + "loss": 0.1562, + "step": 15829 + }, + { + "epoch": 4.212346993081426, + "grad_norm": 0.36953407526016235, + "learning_rate": 1.2244980680441042e-08, + "loss": 0.197, + "step": 15830 + }, + { + "epoch": 4.21261309207025, + "grad_norm": 1.3990014791488647, + "learning_rate": 1.2236884168940432e-08, + "loss": 0.1711, + "step": 15831 + }, + { + "epoch": 4.212879191059074, + "grad_norm": 0.2558794617652893, + "learning_rate": 1.2228790160622227e-08, + "loss": 0.1645, + "step": 15832 + }, + { + "epoch": 4.213145290047898, + "grad_norm": 0.2650069296360016, + "learning_rate": 1.222069865571731e-08, + "loss": 0.1844, + "step": 15833 + }, + { + "epoch": 4.213411389036722, + "grad_norm": 0.3208431005477905, + "learning_rate": 1.2212609654456418e-08, + "loss": 0.1679, + "step": 15834 + }, + { + "epoch": 4.213677488025546, + "grad_norm": 0.29720059037208557, + "learning_rate": 1.2204523157070302e-08, + "loss": 0.1772, + "step": 15835 + }, + { + "epoch": 4.213943587014369, + "grad_norm": 0.2912011444568634, + "learning_rate": 1.2196439163789585e-08, + "loss": 0.1658, + "step": 15836 + }, + { + "epoch": 4.214209686003193, + "grad_norm": 0.32047122716903687, + "learning_rate": 1.2188357674844873e-08, + "loss": 0.1962, + "step": 15837 + }, + { + "epoch": 4.214475784992017, + "grad_norm": 0.25472769141197205, + "learning_rate": 1.2180278690466628e-08, + "loss": 0.1511, + "step": 15838 + }, + { + "epoch": 4.214741883980841, + "grad_norm": 0.32725805044174194, + "learning_rate": 1.2172202210885308e-08, + "loss": 0.1811, + "step": 15839 + }, + { + "epoch": 4.215007982969665, + "grad_norm": 0.2738662362098694, + "learning_rate": 1.2164128236331228e-08, + "loss": 0.1754, + "step": 15840 + }, + { + "epoch": 4.215274081958489, + "grad_norm": 0.4061015546321869, + "learning_rate": 1.2156056767034728e-08, + "loss": 0.207, + "step": 15841 + }, + { + "epoch": 4.215540180947312, + "grad_norm": 0.2835158109664917, + "learning_rate": 1.2147987803225978e-08, + "loss": 0.1756, + "step": 15842 + }, + { + "epoch": 4.215806279936136, + "grad_norm": 0.40455490350723267, + "learning_rate": 1.213992134513514e-08, + "loss": 0.1779, + "step": 15843 + }, + { + "epoch": 4.21607237892496, + "grad_norm": 0.32043716311454773, + "learning_rate": 1.2131857392992285e-08, + "loss": 0.1746, + "step": 15844 + }, + { + "epoch": 4.216338477913784, + "grad_norm": 0.2834743857383728, + "learning_rate": 1.2123795947027426e-08, + "loss": 0.1881, + "step": 15845 + }, + { + "epoch": 4.216604576902608, + "grad_norm": 0.3551473319530487, + "learning_rate": 1.2115737007470461e-08, + "loss": 0.1748, + "step": 15846 + }, + { + "epoch": 4.216870675891432, + "grad_norm": 0.2820931375026703, + "learning_rate": 1.2107680574551293e-08, + "loss": 0.1843, + "step": 15847 + }, + { + "epoch": 4.217136774880255, + "grad_norm": 0.26874837279319763, + "learning_rate": 1.2099626648499662e-08, + "loss": 0.1766, + "step": 15848 + }, + { + "epoch": 4.217402873869079, + "grad_norm": 0.2679813504219055, + "learning_rate": 1.2091575229545292e-08, + "loss": 0.1626, + "step": 15849 + }, + { + "epoch": 4.217668972857903, + "grad_norm": 0.29103249311447144, + "learning_rate": 1.2083526317917848e-08, + "loss": 0.1683, + "step": 15850 + }, + { + "epoch": 4.217935071846727, + "grad_norm": 0.3484085500240326, + "learning_rate": 1.2075479913846876e-08, + "loss": 0.1803, + "step": 15851 + }, + { + "epoch": 4.218201170835551, + "grad_norm": 0.3263779878616333, + "learning_rate": 1.2067436017561916e-08, + "loss": 0.1557, + "step": 15852 + }, + { + "epoch": 4.218467269824375, + "grad_norm": 0.2613394558429718, + "learning_rate": 1.205939462929234e-08, + "loss": 0.1676, + "step": 15853 + }, + { + "epoch": 4.218733368813199, + "grad_norm": 0.2600984573364258, + "learning_rate": 1.2051355749267556e-08, + "loss": 0.1604, + "step": 15854 + }, + { + "epoch": 4.218999467802022, + "grad_norm": 0.28655582666397095, + "learning_rate": 1.204331937771681e-08, + "loss": 0.1598, + "step": 15855 + }, + { + "epoch": 4.219265566790846, + "grad_norm": 0.28965017199516296, + "learning_rate": 1.2035285514869354e-08, + "loss": 0.1762, + "step": 15856 + }, + { + "epoch": 4.21953166577967, + "grad_norm": 0.26178038120269775, + "learning_rate": 1.2027254160954258e-08, + "loss": 0.1684, + "step": 15857 + }, + { + "epoch": 4.219797764768494, + "grad_norm": 0.24215540289878845, + "learning_rate": 1.2019225316200687e-08, + "loss": 0.1553, + "step": 15858 + }, + { + "epoch": 4.220063863757318, + "grad_norm": 0.41884759068489075, + "learning_rate": 1.2011198980837578e-08, + "loss": 0.1728, + "step": 15859 + }, + { + "epoch": 4.220329962746142, + "grad_norm": 0.26752930879592896, + "learning_rate": 1.2003175155093893e-08, + "loss": 0.1606, + "step": 15860 + }, + { + "epoch": 4.220596061734965, + "grad_norm": 0.39864689111709595, + "learning_rate": 1.1995153839198458e-08, + "loss": 0.1971, + "step": 15861 + }, + { + "epoch": 4.220862160723789, + "grad_norm": 0.3919110596179962, + "learning_rate": 1.198713503338008e-08, + "loss": 0.1912, + "step": 15862 + }, + { + "epoch": 4.221128259712613, + "grad_norm": 0.35685181617736816, + "learning_rate": 1.1979118737867444e-08, + "loss": 0.1919, + "step": 15863 + }, + { + "epoch": 4.221394358701437, + "grad_norm": 0.35372012853622437, + "learning_rate": 1.1971104952889211e-08, + "loss": 0.1822, + "step": 15864 + }, + { + "epoch": 4.221660457690261, + "grad_norm": 0.26266640424728394, + "learning_rate": 1.1963093678673951e-08, + "loss": 0.1703, + "step": 15865 + }, + { + "epoch": 4.221926556679085, + "grad_norm": 0.3180939257144928, + "learning_rate": 1.1955084915450176e-08, + "loss": 0.2014, + "step": 15866 + }, + { + "epoch": 4.222192655667908, + "grad_norm": 0.31466004252433777, + "learning_rate": 1.1947078663446275e-08, + "loss": 0.1744, + "step": 15867 + }, + { + "epoch": 4.222458754656732, + "grad_norm": 0.28329768776893616, + "learning_rate": 1.1939074922890625e-08, + "loss": 0.1693, + "step": 15868 + }, + { + "epoch": 4.222724853645556, + "grad_norm": 0.37229135632514954, + "learning_rate": 1.1931073694011518e-08, + "loss": 0.1966, + "step": 15869 + }, + { + "epoch": 4.22299095263438, + "grad_norm": 0.601212203502655, + "learning_rate": 1.192307497703714e-08, + "loss": 0.1746, + "step": 15870 + }, + { + "epoch": 4.223257051623204, + "grad_norm": 0.30009186267852783, + "learning_rate": 1.1915078772195642e-08, + "loss": 0.1737, + "step": 15871 + }, + { + "epoch": 4.223523150612028, + "grad_norm": 0.35945242643356323, + "learning_rate": 1.1907085079715096e-08, + "loss": 0.1781, + "step": 15872 + }, + { + "epoch": 4.223789249600851, + "grad_norm": 0.30887433886528015, + "learning_rate": 1.189909389982352e-08, + "loss": 0.1992, + "step": 15873 + }, + { + "epoch": 4.224055348589675, + "grad_norm": 0.26676085591316223, + "learning_rate": 1.1891105232748788e-08, + "loss": 0.1836, + "step": 15874 + }, + { + "epoch": 4.224321447578499, + "grad_norm": 0.2713548243045807, + "learning_rate": 1.1883119078718806e-08, + "loss": 0.1679, + "step": 15875 + }, + { + "epoch": 4.224587546567323, + "grad_norm": 0.27224627137184143, + "learning_rate": 1.1875135437961314e-08, + "loss": 0.154, + "step": 15876 + }, + { + "epoch": 4.224853645556147, + "grad_norm": 0.3329312205314636, + "learning_rate": 1.1867154310704052e-08, + "loss": 0.169, + "step": 15877 + }, + { + "epoch": 4.225119744544971, + "grad_norm": 0.2991621494293213, + "learning_rate": 1.1859175697174628e-08, + "loss": 0.1901, + "step": 15878 + }, + { + "epoch": 4.225385843533795, + "grad_norm": 0.38911527395248413, + "learning_rate": 1.1851199597600614e-08, + "loss": 0.1757, + "step": 15879 + }, + { + "epoch": 4.225651942522618, + "grad_norm": 0.26340630650520325, + "learning_rate": 1.1843226012209528e-08, + "loss": 0.1674, + "step": 15880 + }, + { + "epoch": 4.225918041511442, + "grad_norm": 0.42899927496910095, + "learning_rate": 1.183525494122879e-08, + "loss": 0.2021, + "step": 15881 + }, + { + "epoch": 4.2261841405002665, + "grad_norm": 0.27047234773635864, + "learning_rate": 1.1827286384885727e-08, + "loss": 0.1645, + "step": 15882 + }, + { + "epoch": 4.22645023948909, + "grad_norm": 0.2805328071117401, + "learning_rate": 1.1819320343407645e-08, + "loss": 0.1735, + "step": 15883 + }, + { + "epoch": 4.226716338477914, + "grad_norm": 0.3875790536403656, + "learning_rate": 1.1811356817021723e-08, + "loss": 0.169, + "step": 15884 + }, + { + "epoch": 4.226982437466738, + "grad_norm": 0.29523319005966187, + "learning_rate": 1.1803395805955107e-08, + "loss": 0.1553, + "step": 15885 + }, + { + "epoch": 4.227248536455561, + "grad_norm": 0.31810230016708374, + "learning_rate": 1.1795437310434863e-08, + "loss": 0.1884, + "step": 15886 + }, + { + "epoch": 4.227514635444385, + "grad_norm": 0.3190000653266907, + "learning_rate": 1.1787481330687998e-08, + "loss": 0.1496, + "step": 15887 + }, + { + "epoch": 4.2277807344332095, + "grad_norm": 0.29421961307525635, + "learning_rate": 1.1779527866941441e-08, + "loss": 0.1682, + "step": 15888 + }, + { + "epoch": 4.228046833422033, + "grad_norm": 0.4240773022174835, + "learning_rate": 1.1771576919422e-08, + "loss": 0.1933, + "step": 15889 + }, + { + "epoch": 4.228312932410857, + "grad_norm": 0.4906136393547058, + "learning_rate": 1.1763628488356493e-08, + "loss": 0.1979, + "step": 15890 + }, + { + "epoch": 4.228579031399681, + "grad_norm": 0.2687635123729706, + "learning_rate": 1.1755682573971582e-08, + "loss": 0.1762, + "step": 15891 + }, + { + "epoch": 4.228845130388504, + "grad_norm": 0.2746092677116394, + "learning_rate": 1.1747739176493954e-08, + "loss": 0.1797, + "step": 15892 + }, + { + "epoch": 4.229111229377328, + "grad_norm": 0.2796938717365265, + "learning_rate": 1.1739798296150116e-08, + "loss": 0.1664, + "step": 15893 + }, + { + "epoch": 4.229377328366152, + "grad_norm": 0.309098482131958, + "learning_rate": 1.1731859933166576e-08, + "loss": 0.1702, + "step": 15894 + }, + { + "epoch": 4.229643427354976, + "grad_norm": 0.25915810465812683, + "learning_rate": 1.1723924087769776e-08, + "loss": 0.1506, + "step": 15895 + }, + { + "epoch": 4.2299095263438, + "grad_norm": 0.6898595094680786, + "learning_rate": 1.1715990760186056e-08, + "loss": 0.1882, + "step": 15896 + }, + { + "epoch": 4.230175625332624, + "grad_norm": 0.2912929654121399, + "learning_rate": 1.1708059950641669e-08, + "loss": 0.1781, + "step": 15897 + }, + { + "epoch": 4.230441724321447, + "grad_norm": 0.35835444927215576, + "learning_rate": 1.1700131659362844e-08, + "loss": 0.1723, + "step": 15898 + }, + { + "epoch": 4.230707823310271, + "grad_norm": 0.28132298588752747, + "learning_rate": 1.1692205886575678e-08, + "loss": 0.1847, + "step": 15899 + }, + { + "epoch": 4.230973922299095, + "grad_norm": 0.26379895210266113, + "learning_rate": 1.1684282632506259e-08, + "loss": 0.1758, + "step": 15900 + }, + { + "epoch": 4.231240021287919, + "grad_norm": 0.35038280487060547, + "learning_rate": 1.1676361897380548e-08, + "loss": 0.186, + "step": 15901 + }, + { + "epoch": 4.231506120276743, + "grad_norm": 0.25960251688957214, + "learning_rate": 1.1668443681424511e-08, + "loss": 0.158, + "step": 15902 + }, + { + "epoch": 4.231772219265567, + "grad_norm": 0.28691405057907104, + "learning_rate": 1.1660527984863944e-08, + "loss": 0.1667, + "step": 15903 + }, + { + "epoch": 4.232038318254391, + "grad_norm": 0.3108721971511841, + "learning_rate": 1.1652614807924622e-08, + "loss": 0.1733, + "step": 15904 + }, + { + "epoch": 4.232304417243214, + "grad_norm": 0.47725388407707214, + "learning_rate": 1.1644704150832275e-08, + "loss": 0.1976, + "step": 15905 + }, + { + "epoch": 4.232570516232038, + "grad_norm": 0.31490129232406616, + "learning_rate": 1.16367960138125e-08, + "loss": 0.1757, + "step": 15906 + }, + { + "epoch": 4.2328366152208625, + "grad_norm": 0.2886200249195099, + "learning_rate": 1.1628890397090863e-08, + "loss": 0.177, + "step": 15907 + }, + { + "epoch": 4.233102714209686, + "grad_norm": 0.2902263402938843, + "learning_rate": 1.1620987300892848e-08, + "loss": 0.1914, + "step": 15908 + }, + { + "epoch": 4.23336881319851, + "grad_norm": 0.2679606080055237, + "learning_rate": 1.1613086725443888e-08, + "loss": 0.1808, + "step": 15909 + }, + { + "epoch": 4.233634912187334, + "grad_norm": 0.2774191200733185, + "learning_rate": 1.160518867096929e-08, + "loss": 0.1859, + "step": 15910 + }, + { + "epoch": 4.233901011176157, + "grad_norm": 0.26683440804481506, + "learning_rate": 1.1597293137694364e-08, + "loss": 0.1681, + "step": 15911 + }, + { + "epoch": 4.234167110164981, + "grad_norm": 0.36370712518692017, + "learning_rate": 1.1589400125844251e-08, + "loss": 0.1912, + "step": 15912 + }, + { + "epoch": 4.2344332091538055, + "grad_norm": 0.27526384592056274, + "learning_rate": 1.1581509635644127e-08, + "loss": 0.1664, + "step": 15913 + }, + { + "epoch": 4.234699308142629, + "grad_norm": 0.2669236660003662, + "learning_rate": 1.1573621667319e-08, + "loss": 0.1707, + "step": 15914 + }, + { + "epoch": 4.234965407131453, + "grad_norm": 0.42709749937057495, + "learning_rate": 1.1565736221093881e-08, + "loss": 0.1967, + "step": 15915 + }, + { + "epoch": 4.235231506120277, + "grad_norm": 0.2745107412338257, + "learning_rate": 1.1557853297193665e-08, + "loss": 0.175, + "step": 15916 + }, + { + "epoch": 4.2354976051091, + "grad_norm": 0.43022897839546204, + "learning_rate": 1.1549972895843218e-08, + "loss": 0.1803, + "step": 15917 + }, + { + "epoch": 4.235763704097924, + "grad_norm": 0.30131104588508606, + "learning_rate": 1.1542095017267273e-08, + "loss": 0.1668, + "step": 15918 + }, + { + "epoch": 4.2360298030867485, + "grad_norm": 0.3718286156654358, + "learning_rate": 1.1534219661690548e-08, + "loss": 0.1903, + "step": 15919 + }, + { + "epoch": 4.236295902075572, + "grad_norm": 0.24303264915943146, + "learning_rate": 1.1526346829337618e-08, + "loss": 0.154, + "step": 15920 + }, + { + "epoch": 4.236562001064396, + "grad_norm": 0.2713077664375305, + "learning_rate": 1.1518476520433074e-08, + "loss": 0.1691, + "step": 15921 + }, + { + "epoch": 4.23682810005322, + "grad_norm": 0.27217569947242737, + "learning_rate": 1.1510608735201376e-08, + "loss": 0.1633, + "step": 15922 + }, + { + "epoch": 4.237094199042044, + "grad_norm": 0.29144731163978577, + "learning_rate": 1.1502743473866949e-08, + "loss": 0.1653, + "step": 15923 + }, + { + "epoch": 4.237360298030867, + "grad_norm": 0.2838473916053772, + "learning_rate": 1.1494880736654123e-08, + "loss": 0.1787, + "step": 15924 + }, + { + "epoch": 4.2376263970196915, + "grad_norm": 0.2611929476261139, + "learning_rate": 1.1487020523787127e-08, + "loss": 0.1641, + "step": 15925 + }, + { + "epoch": 4.237892496008516, + "grad_norm": 0.28759458661079407, + "learning_rate": 1.1479162835490198e-08, + "loss": 0.1672, + "step": 15926 + }, + { + "epoch": 4.238158594997339, + "grad_norm": 0.2661687731742859, + "learning_rate": 1.1471307671987407e-08, + "loss": 0.1648, + "step": 15927 + }, + { + "epoch": 4.238424693986163, + "grad_norm": 0.2570021152496338, + "learning_rate": 1.1463455033502844e-08, + "loss": 0.1537, + "step": 15928 + }, + { + "epoch": 4.238690792974987, + "grad_norm": 0.33759692311286926, + "learning_rate": 1.145560492026043e-08, + "loss": 0.1771, + "step": 15929 + }, + { + "epoch": 4.23895689196381, + "grad_norm": 0.2822782099246979, + "learning_rate": 1.1447757332484099e-08, + "loss": 0.1851, + "step": 15930 + }, + { + "epoch": 4.239222990952634, + "grad_norm": 0.27831050753593445, + "learning_rate": 1.1439912270397678e-08, + "loss": 0.1881, + "step": 15931 + }, + { + "epoch": 4.239489089941459, + "grad_norm": 0.44611677527427673, + "learning_rate": 1.1432069734224935e-08, + "loss": 0.1859, + "step": 15932 + }, + { + "epoch": 4.239755188930282, + "grad_norm": 0.2633802890777588, + "learning_rate": 1.1424229724189526e-08, + "loss": 0.1722, + "step": 15933 + }, + { + "epoch": 4.240021287919106, + "grad_norm": 0.3430756628513336, + "learning_rate": 1.1416392240515105e-08, + "loss": 0.1808, + "step": 15934 + }, + { + "epoch": 4.24028738690793, + "grad_norm": 0.28898948431015015, + "learning_rate": 1.1408557283425158e-08, + "loss": 0.1676, + "step": 15935 + }, + { + "epoch": 4.240553485896753, + "grad_norm": 0.4542010724544525, + "learning_rate": 1.1400724853143184e-08, + "loss": 0.1765, + "step": 15936 + }, + { + "epoch": 4.240819584885577, + "grad_norm": 0.41985300183296204, + "learning_rate": 1.1392894949892584e-08, + "loss": 0.1724, + "step": 15937 + }, + { + "epoch": 4.2410856838744015, + "grad_norm": 0.3445180654525757, + "learning_rate": 1.13850675738967e-08, + "loss": 0.1912, + "step": 15938 + }, + { + "epoch": 4.241351782863225, + "grad_norm": 0.34580203890800476, + "learning_rate": 1.1377242725378744e-08, + "loss": 0.171, + "step": 15939 + }, + { + "epoch": 4.241617881852049, + "grad_norm": 0.37464261054992676, + "learning_rate": 1.1369420404561925e-08, + "loss": 0.1627, + "step": 15940 + }, + { + "epoch": 4.241883980840873, + "grad_norm": 0.3167952299118042, + "learning_rate": 1.1361600611669331e-08, + "loss": 0.1743, + "step": 15941 + }, + { + "epoch": 4.242150079829696, + "grad_norm": 0.3284357190132141, + "learning_rate": 1.1353783346924006e-08, + "loss": 0.1735, + "step": 15942 + }, + { + "epoch": 4.24241617881852, + "grad_norm": 0.3144759237766266, + "learning_rate": 1.134596861054894e-08, + "loss": 0.1864, + "step": 15943 + }, + { + "epoch": 4.2426822778073445, + "grad_norm": 0.26263779401779175, + "learning_rate": 1.1338156402766986e-08, + "loss": 0.1632, + "step": 15944 + }, + { + "epoch": 4.242948376796168, + "grad_norm": 0.28066402673721313, + "learning_rate": 1.1330346723800977e-08, + "loss": 0.1874, + "step": 15945 + }, + { + "epoch": 4.243214475784992, + "grad_norm": 0.39256298542022705, + "learning_rate": 1.1322539573873668e-08, + "loss": 0.1843, + "step": 15946 + }, + { + "epoch": 4.243480574773816, + "grad_norm": 0.2693318724632263, + "learning_rate": 1.131473495320775e-08, + "loss": 0.1731, + "step": 15947 + }, + { + "epoch": 4.24374667376264, + "grad_norm": 0.28080663084983826, + "learning_rate": 1.1306932862025786e-08, + "loss": 0.1868, + "step": 15948 + }, + { + "epoch": 4.244012772751463, + "grad_norm": 0.33553802967071533, + "learning_rate": 1.1299133300550345e-08, + "loss": 0.1699, + "step": 15949 + }, + { + "epoch": 4.2442788717402875, + "grad_norm": 0.366877943277359, + "learning_rate": 1.129133626900386e-08, + "loss": 0.1633, + "step": 15950 + }, + { + "epoch": 4.244544970729112, + "grad_norm": 0.24878187477588654, + "learning_rate": 1.1283541767608729e-08, + "loss": 0.1625, + "step": 15951 + }, + { + "epoch": 4.244811069717935, + "grad_norm": 0.2984018325805664, + "learning_rate": 1.1275749796587264e-08, + "loss": 0.2028, + "step": 15952 + }, + { + "epoch": 4.245077168706759, + "grad_norm": 0.36535680294036865, + "learning_rate": 1.1267960356161731e-08, + "loss": 0.1664, + "step": 15953 + }, + { + "epoch": 4.245343267695583, + "grad_norm": 0.3423362970352173, + "learning_rate": 1.1260173446554255e-08, + "loss": 0.1781, + "step": 15954 + }, + { + "epoch": 4.245609366684406, + "grad_norm": 0.2935076653957367, + "learning_rate": 1.1252389067986989e-08, + "loss": 0.1787, + "step": 15955 + }, + { + "epoch": 4.2458754656732305, + "grad_norm": 0.26673775911331177, + "learning_rate": 1.12446072206819e-08, + "loss": 0.1707, + "step": 15956 + }, + { + "epoch": 4.246141564662055, + "grad_norm": 0.3141684830188751, + "learning_rate": 1.1236827904861013e-08, + "loss": 0.1728, + "step": 15957 + }, + { + "epoch": 4.246407663650878, + "grad_norm": 0.26387524604797363, + "learning_rate": 1.1229051120746114e-08, + "loss": 0.1652, + "step": 15958 + }, + { + "epoch": 4.246673762639702, + "grad_norm": 0.2619415521621704, + "learning_rate": 1.1221276868559082e-08, + "loss": 0.1613, + "step": 15959 + }, + { + "epoch": 4.246939861628526, + "grad_norm": 0.3633423149585724, + "learning_rate": 1.1213505148521673e-08, + "loss": 0.1762, + "step": 15960 + }, + { + "epoch": 4.247205960617349, + "grad_norm": 0.4113592505455017, + "learning_rate": 1.12057359608555e-08, + "loss": 0.1784, + "step": 15961 + }, + { + "epoch": 4.2474720596061735, + "grad_norm": 0.506325364112854, + "learning_rate": 1.1197969305782195e-08, + "loss": 0.1864, + "step": 15962 + }, + { + "epoch": 4.247738158594998, + "grad_norm": 0.2734331488609314, + "learning_rate": 1.1190205183523238e-08, + "loss": 0.1742, + "step": 15963 + }, + { + "epoch": 4.248004257583821, + "grad_norm": 0.2699424624443054, + "learning_rate": 1.1182443594300116e-08, + "loss": 0.1525, + "step": 15964 + }, + { + "epoch": 4.248270356572645, + "grad_norm": 0.26408225297927856, + "learning_rate": 1.1174684538334166e-08, + "loss": 0.1672, + "step": 15965 + }, + { + "epoch": 4.248536455561469, + "grad_norm": 0.28817662596702576, + "learning_rate": 1.1166928015846721e-08, + "loss": 0.1934, + "step": 15966 + }, + { + "epoch": 4.248802554550292, + "grad_norm": 0.28750255703926086, + "learning_rate": 1.1159174027059004e-08, + "loss": 0.1859, + "step": 15967 + }, + { + "epoch": 4.2490686535391164, + "grad_norm": 0.4072868525981903, + "learning_rate": 1.1151422572192204e-08, + "loss": 0.1946, + "step": 15968 + }, + { + "epoch": 4.249334752527941, + "grad_norm": 0.27624770998954773, + "learning_rate": 1.1143673651467355e-08, + "loss": 0.1603, + "step": 15969 + }, + { + "epoch": 4.249600851516764, + "grad_norm": 0.35973215103149414, + "learning_rate": 1.1135927265105516e-08, + "loss": 0.1964, + "step": 15970 + }, + { + "epoch": 4.249866950505588, + "grad_norm": 0.25826719403266907, + "learning_rate": 1.1128183413327596e-08, + "loss": 0.1431, + "step": 15971 + }, + { + "epoch": 4.250133049494412, + "grad_norm": 0.27713003754615784, + "learning_rate": 1.1120442096354476e-08, + "loss": 0.162, + "step": 15972 + }, + { + "epoch": 4.250399148483236, + "grad_norm": 0.4273363947868347, + "learning_rate": 1.1112703314406968e-08, + "loss": 0.1839, + "step": 15973 + }, + { + "epoch": 4.250665247472059, + "grad_norm": 0.28862476348876953, + "learning_rate": 1.1104967067705805e-08, + "loss": 0.1837, + "step": 15974 + }, + { + "epoch": 4.2509313464608836, + "grad_norm": 0.26880860328674316, + "learning_rate": 1.1097233356471602e-08, + "loss": 0.1659, + "step": 15975 + }, + { + "epoch": 4.251197445449708, + "grad_norm": 0.2636345326900482, + "learning_rate": 1.1089502180924992e-08, + "loss": 0.1683, + "step": 15976 + }, + { + "epoch": 4.251463544438531, + "grad_norm": 0.28871259093284607, + "learning_rate": 1.1081773541286421e-08, + "loss": 0.1638, + "step": 15977 + }, + { + "epoch": 4.251729643427355, + "grad_norm": 0.3066595494747162, + "learning_rate": 1.1074047437776368e-08, + "loss": 0.1609, + "step": 15978 + }, + { + "epoch": 4.251995742416179, + "grad_norm": 0.27341926097869873, + "learning_rate": 1.1066323870615201e-08, + "loss": 0.1675, + "step": 15979 + }, + { + "epoch": 4.252261841405002, + "grad_norm": 0.34779834747314453, + "learning_rate": 1.1058602840023191e-08, + "loss": 0.1625, + "step": 15980 + }, + { + "epoch": 4.2525279403938265, + "grad_norm": 0.27872300148010254, + "learning_rate": 1.1050884346220557e-08, + "loss": 0.1877, + "step": 15981 + }, + { + "epoch": 4.252794039382651, + "grad_norm": 0.38795286417007446, + "learning_rate": 1.1043168389427448e-08, + "loss": 0.1822, + "step": 15982 + }, + { + "epoch": 4.253060138371474, + "grad_norm": 0.29369044303894043, + "learning_rate": 1.1035454969863978e-08, + "loss": 0.1787, + "step": 15983 + }, + { + "epoch": 4.253326237360298, + "grad_norm": 0.30561819672584534, + "learning_rate": 1.1027744087750091e-08, + "loss": 0.1899, + "step": 15984 + }, + { + "epoch": 4.253592336349122, + "grad_norm": 0.2466038465499878, + "learning_rate": 1.1020035743305767e-08, + "loss": 0.1617, + "step": 15985 + }, + { + "epoch": 4.253858435337945, + "grad_norm": 0.4316011369228363, + "learning_rate": 1.101232993675082e-08, + "loss": 0.1805, + "step": 15986 + }, + { + "epoch": 4.2541245343267695, + "grad_norm": 0.4045720398426056, + "learning_rate": 1.100462666830505e-08, + "loss": 0.172, + "step": 15987 + }, + { + "epoch": 4.254390633315594, + "grad_norm": 0.5110570788383484, + "learning_rate": 1.0996925938188173e-08, + "loss": 0.1882, + "step": 15988 + }, + { + "epoch": 4.254656732304417, + "grad_norm": 0.28036826848983765, + "learning_rate": 1.0989227746619856e-08, + "loss": 0.1713, + "step": 15989 + }, + { + "epoch": 4.254922831293241, + "grad_norm": 0.3155253827571869, + "learning_rate": 1.0981532093819613e-08, + "loss": 0.1848, + "step": 15990 + }, + { + "epoch": 4.255188930282065, + "grad_norm": 0.2888508141040802, + "learning_rate": 1.0973838980007e-08, + "loss": 0.1847, + "step": 15991 + }, + { + "epoch": 4.255455029270888, + "grad_norm": 0.27899134159088135, + "learning_rate": 1.0966148405401388e-08, + "loss": 0.1837, + "step": 15992 + }, + { + "epoch": 4.2557211282597125, + "grad_norm": 0.3295454680919647, + "learning_rate": 1.0958460370222167e-08, + "loss": 0.1605, + "step": 15993 + }, + { + "epoch": 4.255987227248537, + "grad_norm": 0.3686673641204834, + "learning_rate": 1.0950774874688574e-08, + "loss": 0.1748, + "step": 15994 + }, + { + "epoch": 4.25625332623736, + "grad_norm": 0.2892071008682251, + "learning_rate": 1.0943091919019842e-08, + "loss": 0.189, + "step": 15995 + }, + { + "epoch": 4.256519425226184, + "grad_norm": 0.25689396262168884, + "learning_rate": 1.09354115034351e-08, + "loss": 0.1603, + "step": 15996 + }, + { + "epoch": 4.256785524215008, + "grad_norm": 0.2519969344139099, + "learning_rate": 1.0927733628153413e-08, + "loss": 0.1711, + "step": 15997 + }, + { + "epoch": 4.257051623203832, + "grad_norm": 0.3134240508079529, + "learning_rate": 1.0920058293393775e-08, + "loss": 0.1562, + "step": 15998 + }, + { + "epoch": 4.2573177221926555, + "grad_norm": 0.2652980387210846, + "learning_rate": 1.0912385499375087e-08, + "loss": 0.1636, + "step": 15999 + }, + { + "epoch": 4.25758382118148, + "grad_norm": 0.2722250819206238, + "learning_rate": 1.0904715246316209e-08, + "loss": 0.163, + "step": 16000 + }, + { + "epoch": 4.257849920170304, + "grad_norm": 0.2889958620071411, + "learning_rate": 1.0897047534435888e-08, + "loss": 0.1925, + "step": 16001 + }, + { + "epoch": 4.258116019159127, + "grad_norm": 0.288717657327652, + "learning_rate": 1.0889382363952837e-08, + "loss": 0.1794, + "step": 16002 + }, + { + "epoch": 4.258382118147951, + "grad_norm": 0.3589579164981842, + "learning_rate": 1.088171973508568e-08, + "loss": 0.1812, + "step": 16003 + }, + { + "epoch": 4.258648217136775, + "grad_norm": 0.31538456678390503, + "learning_rate": 1.0874059648053002e-08, + "loss": 0.1568, + "step": 16004 + }, + { + "epoch": 4.2589143161255985, + "grad_norm": 0.3276999592781067, + "learning_rate": 1.0866402103073224e-08, + "loss": 0.1675, + "step": 16005 + }, + { + "epoch": 4.259180415114423, + "grad_norm": 0.27546966075897217, + "learning_rate": 1.0858747100364807e-08, + "loss": 0.1652, + "step": 16006 + }, + { + "epoch": 4.259446514103247, + "grad_norm": 0.2690174877643585, + "learning_rate": 1.0851094640146052e-08, + "loss": 0.1606, + "step": 16007 + }, + { + "epoch": 4.25971261309207, + "grad_norm": 0.274763286113739, + "learning_rate": 1.0843444722635253e-08, + "loss": 0.1867, + "step": 16008 + }, + { + "epoch": 4.259978712080894, + "grad_norm": 0.37604326009750366, + "learning_rate": 1.0835797348050546e-08, + "loss": 0.1829, + "step": 16009 + }, + { + "epoch": 4.260244811069718, + "grad_norm": 0.42478811740875244, + "learning_rate": 1.0828152516610134e-08, + "loss": 0.1542, + "step": 16010 + }, + { + "epoch": 4.260510910058541, + "grad_norm": 0.2747623920440674, + "learning_rate": 1.0820510228531998e-08, + "loss": 0.1825, + "step": 16011 + }, + { + "epoch": 4.260777009047366, + "grad_norm": 0.2812627851963043, + "learning_rate": 1.0812870484034143e-08, + "loss": 0.181, + "step": 16012 + }, + { + "epoch": 4.26104310803619, + "grad_norm": 0.43352046608924866, + "learning_rate": 1.0805233283334448e-08, + "loss": 0.1587, + "step": 16013 + }, + { + "epoch": 4.261309207025013, + "grad_norm": 0.28949975967407227, + "learning_rate": 1.079759862665075e-08, + "loss": 0.1659, + "step": 16014 + }, + { + "epoch": 4.261575306013837, + "grad_norm": 0.26434993743896484, + "learning_rate": 1.0789966514200822e-08, + "loss": 0.1747, + "step": 16015 + }, + { + "epoch": 4.261841405002661, + "grad_norm": 0.2746541202068329, + "learning_rate": 1.0782336946202319e-08, + "loss": 0.1722, + "step": 16016 + }, + { + "epoch": 4.262107503991485, + "grad_norm": 0.3534722626209259, + "learning_rate": 1.077470992287286e-08, + "loss": 0.1785, + "step": 16017 + }, + { + "epoch": 4.2623736029803085, + "grad_norm": 0.27230072021484375, + "learning_rate": 1.076708544442999e-08, + "loss": 0.1754, + "step": 16018 + }, + { + "epoch": 4.262639701969133, + "grad_norm": 0.4001053273677826, + "learning_rate": 1.0759463511091193e-08, + "loss": 0.2056, + "step": 16019 + }, + { + "epoch": 4.262905800957956, + "grad_norm": 0.33187875151634216, + "learning_rate": 1.0751844123073817e-08, + "loss": 0.1844, + "step": 16020 + }, + { + "epoch": 4.26317189994678, + "grad_norm": 0.2672671377658844, + "learning_rate": 1.0744227280595241e-08, + "loss": 0.163, + "step": 16021 + }, + { + "epoch": 4.263437998935604, + "grad_norm": 0.294005423784256, + "learning_rate": 1.0736612983872651e-08, + "loss": 0.1967, + "step": 16022 + }, + { + "epoch": 4.263704097924428, + "grad_norm": 0.31789252161979675, + "learning_rate": 1.0729001233123247e-08, + "loss": 0.1805, + "step": 16023 + }, + { + "epoch": 4.2639701969132515, + "grad_norm": 0.2864533066749573, + "learning_rate": 1.0721392028564136e-08, + "loss": 0.1845, + "step": 16024 + }, + { + "epoch": 4.264236295902076, + "grad_norm": 0.3488040566444397, + "learning_rate": 1.0713785370412377e-08, + "loss": 0.1663, + "step": 16025 + }, + { + "epoch": 4.2645023948909, + "grad_norm": 0.29216042160987854, + "learning_rate": 1.0706181258884872e-08, + "loss": 0.1744, + "step": 16026 + }, + { + "epoch": 4.264768493879723, + "grad_norm": 0.3167957663536072, + "learning_rate": 1.069857969419854e-08, + "loss": 0.1796, + "step": 16027 + }, + { + "epoch": 4.265034592868547, + "grad_norm": 0.28000742197036743, + "learning_rate": 1.0690980676570182e-08, + "loss": 0.1701, + "step": 16028 + }, + { + "epoch": 4.265300691857371, + "grad_norm": 0.28699633479118347, + "learning_rate": 1.068338420621655e-08, + "loss": 0.1628, + "step": 16029 + }, + { + "epoch": 4.2655667908461945, + "grad_norm": 0.31123295426368713, + "learning_rate": 1.067579028335428e-08, + "loss": 0.1651, + "step": 16030 + }, + { + "epoch": 4.265832889835019, + "grad_norm": 0.5501151084899902, + "learning_rate": 1.0668198908199988e-08, + "loss": 0.1608, + "step": 16031 + }, + { + "epoch": 4.266098988823843, + "grad_norm": 0.27003979682922363, + "learning_rate": 1.06606100809702e-08, + "loss": 0.169, + "step": 16032 + }, + { + "epoch": 4.266365087812666, + "grad_norm": 0.3882785141468048, + "learning_rate": 1.0653023801881355e-08, + "loss": 0.1829, + "step": 16033 + }, + { + "epoch": 4.26663118680149, + "grad_norm": 0.3091522455215454, + "learning_rate": 1.0645440071149848e-08, + "loss": 0.1664, + "step": 16034 + }, + { + "epoch": 4.266897285790314, + "grad_norm": 0.32504385709762573, + "learning_rate": 1.0637858888991957e-08, + "loss": 0.1781, + "step": 16035 + }, + { + "epoch": 4.2671633847791375, + "grad_norm": 0.3192850649356842, + "learning_rate": 1.0630280255623936e-08, + "loss": 0.1847, + "step": 16036 + }, + { + "epoch": 4.267429483767962, + "grad_norm": 0.3265872299671173, + "learning_rate": 1.062270417126191e-08, + "loss": 0.1755, + "step": 16037 + }, + { + "epoch": 4.267695582756786, + "grad_norm": 0.2806554138660431, + "learning_rate": 1.0615130636121983e-08, + "loss": 0.1548, + "step": 16038 + }, + { + "epoch": 4.267961681745609, + "grad_norm": 0.29496049880981445, + "learning_rate": 1.0607559650420172e-08, + "loss": 0.1859, + "step": 16039 + }, + { + "epoch": 4.268227780734433, + "grad_norm": 0.2917378842830658, + "learning_rate": 1.059999121437244e-08, + "loss": 0.1791, + "step": 16040 + }, + { + "epoch": 4.268493879723257, + "grad_norm": 0.2725639343261719, + "learning_rate": 1.0592425328194598e-08, + "loss": 0.1803, + "step": 16041 + }, + { + "epoch": 4.268759978712081, + "grad_norm": 0.25193360447883606, + "learning_rate": 1.0584861992102489e-08, + "loss": 0.1593, + "step": 16042 + }, + { + "epoch": 4.269026077700905, + "grad_norm": 0.30976277589797974, + "learning_rate": 1.0577301206311806e-08, + "loss": 0.1887, + "step": 16043 + }, + { + "epoch": 4.269292176689729, + "grad_norm": 0.45220983028411865, + "learning_rate": 1.056974297103822e-08, + "loss": 0.1859, + "step": 16044 + }, + { + "epoch": 4.269558275678552, + "grad_norm": 0.2785129249095917, + "learning_rate": 1.0562187286497281e-08, + "loss": 0.1675, + "step": 16045 + }, + { + "epoch": 4.269824374667376, + "grad_norm": 0.4326590597629547, + "learning_rate": 1.0554634152904507e-08, + "loss": 0.2036, + "step": 16046 + }, + { + "epoch": 4.2700904736562, + "grad_norm": 0.3081009089946747, + "learning_rate": 1.0547083570475324e-08, + "loss": 0.1769, + "step": 16047 + }, + { + "epoch": 4.270356572645024, + "grad_norm": 0.3240946829319, + "learning_rate": 1.0539535539425115e-08, + "loss": 0.1749, + "step": 16048 + }, + { + "epoch": 4.270622671633848, + "grad_norm": 0.29518091678619385, + "learning_rate": 1.0531990059969132e-08, + "loss": 0.1854, + "step": 16049 + }, + { + "epoch": 4.270888770622672, + "grad_norm": 0.38966771960258484, + "learning_rate": 1.0524447132322611e-08, + "loss": 0.1687, + "step": 16050 + }, + { + "epoch": 4.271154869611496, + "grad_norm": 0.2942713797092438, + "learning_rate": 1.0516906756700673e-08, + "loss": 0.1788, + "step": 16051 + }, + { + "epoch": 4.271420968600319, + "grad_norm": 0.3336140215396881, + "learning_rate": 1.0509368933318374e-08, + "loss": 0.1683, + "step": 16052 + }, + { + "epoch": 4.271687067589143, + "grad_norm": 0.26162219047546387, + "learning_rate": 1.0501833662390746e-08, + "loss": 0.1646, + "step": 16053 + }, + { + "epoch": 4.271953166577967, + "grad_norm": 0.2653477191925049, + "learning_rate": 1.0494300944132684e-08, + "loss": 0.1618, + "step": 16054 + }, + { + "epoch": 4.2722192655667905, + "grad_norm": 0.35513579845428467, + "learning_rate": 1.0486770778759057e-08, + "loss": 0.1692, + "step": 16055 + }, + { + "epoch": 4.272485364555615, + "grad_norm": 0.4700751304626465, + "learning_rate": 1.0479243166484607e-08, + "loss": 0.1744, + "step": 16056 + }, + { + "epoch": 4.272751463544439, + "grad_norm": 0.2791737914085388, + "learning_rate": 1.0471718107524085e-08, + "loss": 0.1602, + "step": 16057 + }, + { + "epoch": 4.273017562533262, + "grad_norm": 0.2805144488811493, + "learning_rate": 1.0464195602092074e-08, + "loss": 0.164, + "step": 16058 + }, + { + "epoch": 4.273283661522086, + "grad_norm": 0.2753397524356842, + "learning_rate": 1.0456675650403157e-08, + "loss": 0.1793, + "step": 16059 + }, + { + "epoch": 4.27354976051091, + "grad_norm": 0.26017919182777405, + "learning_rate": 1.0449158252671787e-08, + "loss": 0.162, + "step": 16060 + }, + { + "epoch": 4.2738158594997335, + "grad_norm": 0.2898653745651245, + "learning_rate": 1.0441643409112422e-08, + "loss": 0.1764, + "step": 16061 + }, + { + "epoch": 4.274081958488558, + "grad_norm": 0.46456989645957947, + "learning_rate": 1.043413111993937e-08, + "loss": 0.2073, + "step": 16062 + }, + { + "epoch": 4.274348057477382, + "grad_norm": 0.2620698809623718, + "learning_rate": 1.042662138536693e-08, + "loss": 0.1739, + "step": 16063 + }, + { + "epoch": 4.274614156466205, + "grad_norm": 0.2680091857910156, + "learning_rate": 1.0419114205609236e-08, + "loss": 0.1685, + "step": 16064 + }, + { + "epoch": 4.274880255455029, + "grad_norm": 0.285478800535202, + "learning_rate": 1.0411609580880477e-08, + "loss": 0.1839, + "step": 16065 + }, + { + "epoch": 4.275146354443853, + "grad_norm": 0.2838967442512512, + "learning_rate": 1.0404107511394633e-08, + "loss": 0.1639, + "step": 16066 + }, + { + "epoch": 4.275412453432677, + "grad_norm": 0.30255356431007385, + "learning_rate": 1.0396607997365714e-08, + "loss": 0.1869, + "step": 16067 + }, + { + "epoch": 4.275678552421501, + "grad_norm": 0.33221301436424255, + "learning_rate": 1.0389111039007614e-08, + "loss": 0.1853, + "step": 16068 + }, + { + "epoch": 4.275944651410325, + "grad_norm": 0.32808202505111694, + "learning_rate": 1.0381616636534174e-08, + "loss": 0.1804, + "step": 16069 + }, + { + "epoch": 4.276210750399149, + "grad_norm": 0.34910210967063904, + "learning_rate": 1.0374124790159156e-08, + "loss": 0.1647, + "step": 16070 + }, + { + "epoch": 4.276476849387972, + "grad_norm": 0.343354731798172, + "learning_rate": 1.036663550009621e-08, + "loss": 0.1756, + "step": 16071 + }, + { + "epoch": 4.276742948376796, + "grad_norm": 0.37211787700653076, + "learning_rate": 1.0359148766558979e-08, + "loss": 0.1933, + "step": 16072 + }, + { + "epoch": 4.27700904736562, + "grad_norm": 0.29287660121917725, + "learning_rate": 1.0351664589760966e-08, + "loss": 0.1888, + "step": 16073 + }, + { + "epoch": 4.277275146354444, + "grad_norm": 0.3388165533542633, + "learning_rate": 1.034418296991565e-08, + "loss": 0.202, + "step": 16074 + }, + { + "epoch": 4.277541245343268, + "grad_norm": 0.2736137807369232, + "learning_rate": 1.0336703907236433e-08, + "loss": 0.1725, + "step": 16075 + }, + { + "epoch": 4.277807344332092, + "grad_norm": 0.2762967646121979, + "learning_rate": 1.0329227401936636e-08, + "loss": 0.1736, + "step": 16076 + }, + { + "epoch": 4.278073443320915, + "grad_norm": 0.27276790142059326, + "learning_rate": 1.0321753454229477e-08, + "loss": 0.1627, + "step": 16077 + }, + { + "epoch": 4.278339542309739, + "grad_norm": 0.26520809531211853, + "learning_rate": 1.0314282064328162e-08, + "loss": 0.1691, + "step": 16078 + }, + { + "epoch": 4.278605641298563, + "grad_norm": 0.2829332649707794, + "learning_rate": 1.0306813232445755e-08, + "loss": 0.1864, + "step": 16079 + }, + { + "epoch": 4.278871740287387, + "grad_norm": 0.40127769112586975, + "learning_rate": 1.0299346958795318e-08, + "loss": 0.1984, + "step": 16080 + }, + { + "epoch": 4.279137839276211, + "grad_norm": 0.2791908383369446, + "learning_rate": 1.0291883243589773e-08, + "loss": 0.1942, + "step": 16081 + }, + { + "epoch": 4.279403938265035, + "grad_norm": 0.2642214894294739, + "learning_rate": 1.028442208704201e-08, + "loss": 0.171, + "step": 16082 + }, + { + "epoch": 4.279670037253858, + "grad_norm": 0.2858228087425232, + "learning_rate": 1.0276963489364843e-08, + "loss": 0.185, + "step": 16083 + }, + { + "epoch": 4.279936136242682, + "grad_norm": 0.42551589012145996, + "learning_rate": 1.0269507450771008e-08, + "loss": 0.1773, + "step": 16084 + }, + { + "epoch": 4.280202235231506, + "grad_norm": 0.26824164390563965, + "learning_rate": 1.0262053971473161e-08, + "loss": 0.1696, + "step": 16085 + }, + { + "epoch": 4.28046833422033, + "grad_norm": 0.3495870530605316, + "learning_rate": 1.0254603051683896e-08, + "loss": 0.1711, + "step": 16086 + }, + { + "epoch": 4.280734433209154, + "grad_norm": 0.30148592591285706, + "learning_rate": 1.0247154691615711e-08, + "loss": 0.1707, + "step": 16087 + }, + { + "epoch": 4.281000532197978, + "grad_norm": 0.33212757110595703, + "learning_rate": 1.0239708891481058e-08, + "loss": 0.1823, + "step": 16088 + }, + { + "epoch": 4.281266631186801, + "grad_norm": 0.24763372540473938, + "learning_rate": 1.0232265651492311e-08, + "loss": 0.1659, + "step": 16089 + }, + { + "epoch": 4.281532730175625, + "grad_norm": 0.2722298204898834, + "learning_rate": 1.0224824971861767e-08, + "loss": 0.1673, + "step": 16090 + }, + { + "epoch": 4.281798829164449, + "grad_norm": 0.3553701937198639, + "learning_rate": 1.0217386852801669e-08, + "loss": 0.1662, + "step": 16091 + }, + { + "epoch": 4.282064928153273, + "grad_norm": 0.36396467685699463, + "learning_rate": 1.020995129452411e-08, + "loss": 0.2095, + "step": 16092 + }, + { + "epoch": 4.282331027142097, + "grad_norm": 0.26798731088638306, + "learning_rate": 1.0202518297241236e-08, + "loss": 0.1626, + "step": 16093 + }, + { + "epoch": 4.282597126130921, + "grad_norm": 0.2803994119167328, + "learning_rate": 1.0195087861164996e-08, + "loss": 0.1708, + "step": 16094 + }, + { + "epoch": 4.282863225119745, + "grad_norm": 0.28467893600463867, + "learning_rate": 1.0187659986507358e-08, + "loss": 0.1795, + "step": 16095 + }, + { + "epoch": 4.283129324108568, + "grad_norm": 0.3414987325668335, + "learning_rate": 1.0180234673480147e-08, + "loss": 0.1885, + "step": 16096 + }, + { + "epoch": 4.283395423097392, + "grad_norm": 0.26967236399650574, + "learning_rate": 1.0172811922295165e-08, + "loss": 0.1507, + "step": 16097 + }, + { + "epoch": 4.283661522086216, + "grad_norm": 0.2599624991416931, + "learning_rate": 1.0165391733164119e-08, + "loss": 0.1741, + "step": 16098 + }, + { + "epoch": 4.28392762107504, + "grad_norm": 0.5413193106651306, + "learning_rate": 1.0157974106298683e-08, + "loss": 0.1763, + "step": 16099 + }, + { + "epoch": 4.284193720063864, + "grad_norm": 0.2684400677680969, + "learning_rate": 1.0150559041910367e-08, + "loss": 0.1786, + "step": 16100 + }, + { + "epoch": 4.284459819052688, + "grad_norm": 0.3161429166793823, + "learning_rate": 1.0143146540210713e-08, + "loss": 0.173, + "step": 16101 + }, + { + "epoch": 4.284725918041511, + "grad_norm": 0.3779224455356598, + "learning_rate": 1.0135736601411093e-08, + "loss": 0.1722, + "step": 16102 + }, + { + "epoch": 4.284992017030335, + "grad_norm": 0.32453617453575134, + "learning_rate": 1.0128329225722887e-08, + "loss": 0.1581, + "step": 16103 + }, + { + "epoch": 4.285258116019159, + "grad_norm": 0.3448435664176941, + "learning_rate": 1.0120924413357356e-08, + "loss": 0.1865, + "step": 16104 + }, + { + "epoch": 4.285524215007983, + "grad_norm": 0.28130635619163513, + "learning_rate": 1.0113522164525701e-08, + "loss": 0.1799, + "step": 16105 + }, + { + "epoch": 4.285790313996807, + "grad_norm": 0.26456940174102783, + "learning_rate": 1.0106122479439072e-08, + "loss": 0.1724, + "step": 16106 + }, + { + "epoch": 4.286056412985631, + "grad_norm": 0.26933836936950684, + "learning_rate": 1.009872535830848e-08, + "loss": 0.1695, + "step": 16107 + }, + { + "epoch": 4.286322511974454, + "grad_norm": 0.321634441614151, + "learning_rate": 1.0091330801344955e-08, + "loss": 0.1757, + "step": 16108 + }, + { + "epoch": 4.286588610963278, + "grad_norm": 0.26051172614097595, + "learning_rate": 1.0083938808759363e-08, + "loss": 0.1578, + "step": 16109 + }, + { + "epoch": 4.286854709952102, + "grad_norm": 0.35976284742355347, + "learning_rate": 1.0076549380762567e-08, + "loss": 0.2017, + "step": 16110 + }, + { + "epoch": 4.287120808940926, + "grad_norm": 0.27960118651390076, + "learning_rate": 1.0069162517565277e-08, + "loss": 0.1768, + "step": 16111 + }, + { + "epoch": 4.28738690792975, + "grad_norm": 0.273406445980072, + "learning_rate": 1.0061778219378259e-08, + "loss": 0.1715, + "step": 16112 + }, + { + "epoch": 4.287653006918574, + "grad_norm": 0.277610182762146, + "learning_rate": 1.0054396486412064e-08, + "loss": 0.1707, + "step": 16113 + }, + { + "epoch": 4.287919105907397, + "grad_norm": 0.2860173285007477, + "learning_rate": 1.0047017318877282e-08, + "loss": 0.1872, + "step": 16114 + }, + { + "epoch": 4.288185204896221, + "grad_norm": 0.27385857701301575, + "learning_rate": 1.0039640716984343e-08, + "loss": 0.1747, + "step": 16115 + }, + { + "epoch": 4.288451303885045, + "grad_norm": 0.2697862684726715, + "learning_rate": 1.003226668094368e-08, + "loss": 0.1696, + "step": 16116 + }, + { + "epoch": 4.2887174028738695, + "grad_norm": 0.2953225076198578, + "learning_rate": 1.0024895210965578e-08, + "loss": 0.1868, + "step": 16117 + }, + { + "epoch": 4.288983501862693, + "grad_norm": 0.262404203414917, + "learning_rate": 1.0017526307260294e-08, + "loss": 0.1597, + "step": 16118 + }, + { + "epoch": 4.289249600851517, + "grad_norm": 0.37722083926200867, + "learning_rate": 1.0010159970038024e-08, + "loss": 0.1806, + "step": 16119 + }, + { + "epoch": 4.289515699840341, + "grad_norm": 0.3825755715370178, + "learning_rate": 1.0002796199508868e-08, + "loss": 0.1787, + "step": 16120 + }, + { + "epoch": 4.289781798829164, + "grad_norm": 0.25965216755867004, + "learning_rate": 9.995434995882835e-09, + "loss": 0.1649, + "step": 16121 + }, + { + "epoch": 4.290047897817988, + "grad_norm": 0.2707470953464508, + "learning_rate": 9.988076359369912e-09, + "loss": 0.1697, + "step": 16122 + }, + { + "epoch": 4.2903139968068125, + "grad_norm": 0.29214537143707275, + "learning_rate": 9.980720290179945e-09, + "loss": 0.1823, + "step": 16123 + }, + { + "epoch": 4.290580095795636, + "grad_norm": 0.3721027672290802, + "learning_rate": 9.973366788522752e-09, + "loss": 0.1754, + "step": 16124 + }, + { + "epoch": 4.29084619478446, + "grad_norm": 0.2607021629810333, + "learning_rate": 9.966015854608101e-09, + "loss": 0.1551, + "step": 16125 + }, + { + "epoch": 4.291112293773284, + "grad_norm": 0.37463125586509705, + "learning_rate": 9.958667488645623e-09, + "loss": 0.181, + "step": 16126 + }, + { + "epoch": 4.291378392762107, + "grad_norm": 0.279496967792511, + "learning_rate": 9.951321690844938e-09, + "loss": 0.1943, + "step": 16127 + }, + { + "epoch": 4.291644491750931, + "grad_norm": 0.3165995478630066, + "learning_rate": 9.943978461415536e-09, + "loss": 0.1705, + "step": 16128 + }, + { + "epoch": 4.2919105907397554, + "grad_norm": 0.3764989674091339, + "learning_rate": 9.936637800566882e-09, + "loss": 0.1913, + "step": 16129 + }, + { + "epoch": 4.292176689728579, + "grad_norm": 0.3849409520626068, + "learning_rate": 9.92929970850832e-09, + "loss": 0.1912, + "step": 16130 + }, + { + "epoch": 4.292442788717403, + "grad_norm": 0.3189812898635864, + "learning_rate": 9.921964185449183e-09, + "loss": 0.1795, + "step": 16131 + }, + { + "epoch": 4.292708887706227, + "grad_norm": 0.3599945306777954, + "learning_rate": 9.914631231598658e-09, + "loss": 0.192, + "step": 16132 + }, + { + "epoch": 4.29297498669505, + "grad_norm": 0.3052372336387634, + "learning_rate": 9.9073008471659e-09, + "loss": 0.1773, + "step": 16133 + }, + { + "epoch": 4.293241085683874, + "grad_norm": 0.2725208103656769, + "learning_rate": 9.89997303236001e-09, + "loss": 0.1774, + "step": 16134 + }, + { + "epoch": 4.293507184672698, + "grad_norm": 0.2554076611995697, + "learning_rate": 9.892647787389995e-09, + "loss": 0.1562, + "step": 16135 + }, + { + "epoch": 4.2937732836615226, + "grad_norm": 0.2723091244697571, + "learning_rate": 9.88532511246476e-09, + "loss": 0.1589, + "step": 16136 + }, + { + "epoch": 4.294039382650346, + "grad_norm": 0.36741799116134644, + "learning_rate": 9.878005007793189e-09, + "loss": 0.1652, + "step": 16137 + }, + { + "epoch": 4.29430548163917, + "grad_norm": 0.3907027542591095, + "learning_rate": 9.87068747358404e-09, + "loss": 0.1753, + "step": 16138 + }, + { + "epoch": 4.294571580627993, + "grad_norm": 0.299258291721344, + "learning_rate": 9.86337251004602e-09, + "loss": 0.1923, + "step": 16139 + }, + { + "epoch": 4.294837679616817, + "grad_norm": 0.31045207381248474, + "learning_rate": 9.856060117387798e-09, + "loss": 0.1763, + "step": 16140 + }, + { + "epoch": 4.295103778605641, + "grad_norm": 0.36034107208251953, + "learning_rate": 9.848750295817931e-09, + "loss": 0.1839, + "step": 16141 + }, + { + "epoch": 4.2953698775944655, + "grad_norm": 0.4036286473274231, + "learning_rate": 9.841443045544883e-09, + "loss": 0.2083, + "step": 16142 + }, + { + "epoch": 4.295635976583289, + "grad_norm": 0.3106549084186554, + "learning_rate": 9.8341383667771e-09, + "loss": 0.1685, + "step": 16143 + }, + { + "epoch": 4.295902075572113, + "grad_norm": 0.2829609811306, + "learning_rate": 9.826836259722927e-09, + "loss": 0.1685, + "step": 16144 + }, + { + "epoch": 4.296168174560937, + "grad_norm": 0.28945690393447876, + "learning_rate": 9.819536724590605e-09, + "loss": 0.1832, + "step": 16145 + }, + { + "epoch": 4.29643427354976, + "grad_norm": 0.2779707610607147, + "learning_rate": 9.812239761588382e-09, + "loss": 0.1802, + "step": 16146 + }, + { + "epoch": 4.296700372538584, + "grad_norm": 0.2829013466835022, + "learning_rate": 9.804945370924322e-09, + "loss": 0.1891, + "step": 16147 + }, + { + "epoch": 4.2969664715274085, + "grad_norm": 0.3725215494632721, + "learning_rate": 9.797653552806507e-09, + "loss": 0.1505, + "step": 16148 + }, + { + "epoch": 4.297232570516232, + "grad_norm": 0.2977980077266693, + "learning_rate": 9.79036430744291e-09, + "loss": 0.1783, + "step": 16149 + }, + { + "epoch": 4.297498669505056, + "grad_norm": 0.2621549963951111, + "learning_rate": 9.783077635041459e-09, + "loss": 0.1644, + "step": 16150 + }, + { + "epoch": 4.29776476849388, + "grad_norm": 0.28024473786354065, + "learning_rate": 9.77579353580994e-09, + "loss": 0.1701, + "step": 16151 + }, + { + "epoch": 4.298030867482703, + "grad_norm": 0.26429763436317444, + "learning_rate": 9.768512009956164e-09, + "loss": 0.1612, + "step": 16152 + }, + { + "epoch": 4.298296966471527, + "grad_norm": 0.3495919704437256, + "learning_rate": 9.761233057687757e-09, + "loss": 0.171, + "step": 16153 + }, + { + "epoch": 4.2985630654603515, + "grad_norm": 0.2546439468860626, + "learning_rate": 9.753956679212361e-09, + "loss": 0.1621, + "step": 16154 + }, + { + "epoch": 4.298829164449175, + "grad_norm": 0.43546783924102783, + "learning_rate": 9.7466828747375e-09, + "loss": 0.1878, + "step": 16155 + }, + { + "epoch": 4.299095263437999, + "grad_norm": 0.2742435038089752, + "learning_rate": 9.739411644470675e-09, + "loss": 0.1645, + "step": 16156 + }, + { + "epoch": 4.299361362426823, + "grad_norm": 0.4050188362598419, + "learning_rate": 9.732142988619229e-09, + "loss": 0.1957, + "step": 16157 + }, + { + "epoch": 4.299627461415646, + "grad_norm": 0.3073154389858246, + "learning_rate": 9.724876907390511e-09, + "loss": 0.1905, + "step": 16158 + }, + { + "epoch": 4.29989356040447, + "grad_norm": 0.26890015602111816, + "learning_rate": 9.717613400991742e-09, + "loss": 0.1634, + "step": 16159 + }, + { + "epoch": 4.3001596593932945, + "grad_norm": 0.2971723675727844, + "learning_rate": 9.7103524696301e-09, + "loss": 0.1983, + "step": 16160 + }, + { + "epoch": 4.300425758382119, + "grad_norm": 0.3610592782497406, + "learning_rate": 9.703094113512678e-09, + "loss": 0.1764, + "step": 16161 + }, + { + "epoch": 4.300691857370942, + "grad_norm": 0.41182073950767517, + "learning_rate": 9.695838332846506e-09, + "loss": 0.1796, + "step": 16162 + }, + { + "epoch": 4.300957956359766, + "grad_norm": 0.2761932611465454, + "learning_rate": 9.688585127838545e-09, + "loss": 0.1717, + "step": 16163 + }, + { + "epoch": 4.301224055348589, + "grad_norm": 0.3732932507991791, + "learning_rate": 9.681334498695649e-09, + "loss": 0.1954, + "step": 16164 + }, + { + "epoch": 4.301490154337413, + "grad_norm": 0.2598327696323395, + "learning_rate": 9.674086445624641e-09, + "loss": 0.1611, + "step": 16165 + }, + { + "epoch": 4.3017562533262375, + "grad_norm": 0.30495452880859375, + "learning_rate": 9.666840968832224e-09, + "loss": 0.1857, + "step": 16166 + }, + { + "epoch": 4.302022352315062, + "grad_norm": 0.2919375002384186, + "learning_rate": 9.659598068525077e-09, + "loss": 0.1805, + "step": 16167 + }, + { + "epoch": 4.302288451303885, + "grad_norm": 0.2778889834880829, + "learning_rate": 9.652357744909767e-09, + "loss": 0.1777, + "step": 16168 + }, + { + "epoch": 4.302554550292709, + "grad_norm": 0.34834936261177063, + "learning_rate": 9.645119998192807e-09, + "loss": 0.2015, + "step": 16169 + }, + { + "epoch": 4.302820649281533, + "grad_norm": 0.3007170855998993, + "learning_rate": 9.637884828580622e-09, + "loss": 0.1713, + "step": 16170 + }, + { + "epoch": 4.303086748270356, + "grad_norm": 0.2654810845851898, + "learning_rate": 9.630652236279624e-09, + "loss": 0.1545, + "step": 16171 + }, + { + "epoch": 4.30335284725918, + "grad_norm": 0.29870137572288513, + "learning_rate": 9.623422221496035e-09, + "loss": 0.1786, + "step": 16172 + }, + { + "epoch": 4.303618946248005, + "grad_norm": 0.3333531618118286, + "learning_rate": 9.616194784436116e-09, + "loss": 0.181, + "step": 16173 + }, + { + "epoch": 4.303885045236828, + "grad_norm": 0.2819097638130188, + "learning_rate": 9.608969925305977e-09, + "loss": 0.1613, + "step": 16174 + }, + { + "epoch": 4.304151144225652, + "grad_norm": 0.46447131037712097, + "learning_rate": 9.601747644311698e-09, + "loss": 0.1865, + "step": 16175 + }, + { + "epoch": 4.304417243214476, + "grad_norm": 0.29483625292778015, + "learning_rate": 9.59452794165927e-09, + "loss": 0.1805, + "step": 16176 + }, + { + "epoch": 4.304683342203299, + "grad_norm": 0.339120477437973, + "learning_rate": 9.587310817554639e-09, + "loss": 0.1768, + "step": 16177 + }, + { + "epoch": 4.304949441192123, + "grad_norm": 0.3656764030456543, + "learning_rate": 9.580096272203608e-09, + "loss": 0.1756, + "step": 16178 + }, + { + "epoch": 4.3052155401809475, + "grad_norm": 0.30393803119659424, + "learning_rate": 9.57288430581198e-09, + "loss": 0.176, + "step": 16179 + }, + { + "epoch": 4.305481639169771, + "grad_norm": 0.28530874848365784, + "learning_rate": 9.565674918585465e-09, + "loss": 0.1639, + "step": 16180 + }, + { + "epoch": 4.305747738158595, + "grad_norm": 0.2880721092224121, + "learning_rate": 9.558468110729645e-09, + "loss": 0.1785, + "step": 16181 + }, + { + "epoch": 4.306013837147419, + "grad_norm": 0.47498923540115356, + "learning_rate": 9.551263882450133e-09, + "loss": 0.18, + "step": 16182 + }, + { + "epoch": 4.306279936136242, + "grad_norm": 0.2668202519416809, + "learning_rate": 9.544062233952355e-09, + "loss": 0.1637, + "step": 16183 + }, + { + "epoch": 4.306546035125066, + "grad_norm": 0.36471229791641235, + "learning_rate": 9.536863165441733e-09, + "loss": 0.1701, + "step": 16184 + }, + { + "epoch": 4.3068121341138905, + "grad_norm": 0.2553505301475525, + "learning_rate": 9.529666677123593e-09, + "loss": 0.1679, + "step": 16185 + }, + { + "epoch": 4.307078233102715, + "grad_norm": 0.2555933892726898, + "learning_rate": 9.522472769203238e-09, + "loss": 0.1648, + "step": 16186 + }, + { + "epoch": 4.307344332091538, + "grad_norm": 0.2723802626132965, + "learning_rate": 9.515281441885792e-09, + "loss": 0.1582, + "step": 16187 + }, + { + "epoch": 4.307610431080362, + "grad_norm": 0.42283880710601807, + "learning_rate": 9.508092695376424e-09, + "loss": 0.1761, + "step": 16188 + }, + { + "epoch": 4.307876530069186, + "grad_norm": 0.37093204259872437, + "learning_rate": 9.500906529880115e-09, + "loss": 0.1699, + "step": 16189 + }, + { + "epoch": 4.308142629058009, + "grad_norm": 0.31328079104423523, + "learning_rate": 9.493722945601845e-09, + "loss": 0.1871, + "step": 16190 + }, + { + "epoch": 4.3084087280468335, + "grad_norm": 0.38108009099960327, + "learning_rate": 9.486541942746529e-09, + "loss": 0.1841, + "step": 16191 + }, + { + "epoch": 4.308674827035658, + "grad_norm": 0.2669636607170105, + "learning_rate": 9.47936352151898e-09, + "loss": 0.1824, + "step": 16192 + }, + { + "epoch": 4.308940926024481, + "grad_norm": 0.27939727902412415, + "learning_rate": 9.472187682123911e-09, + "loss": 0.1682, + "step": 16193 + }, + { + "epoch": 4.309207025013305, + "grad_norm": 0.24886855483055115, + "learning_rate": 9.465014424766026e-09, + "loss": 0.1582, + "step": 16194 + }, + { + "epoch": 4.309473124002129, + "grad_norm": 0.353879451751709, + "learning_rate": 9.457843749649896e-09, + "loss": 0.1898, + "step": 16195 + }, + { + "epoch": 4.309739222990952, + "grad_norm": 0.28920599818229675, + "learning_rate": 9.450675656980067e-09, + "loss": 0.1707, + "step": 16196 + }, + { + "epoch": 4.3100053219797765, + "grad_norm": 0.2622043788433075, + "learning_rate": 9.443510146960954e-09, + "loss": 0.1639, + "step": 16197 + }, + { + "epoch": 4.310271420968601, + "grad_norm": 0.26043790578842163, + "learning_rate": 9.436347219796947e-09, + "loss": 0.1662, + "step": 16198 + }, + { + "epoch": 4.310537519957424, + "grad_norm": 0.3533761203289032, + "learning_rate": 9.429186875692363e-09, + "loss": 0.1746, + "step": 16199 + }, + { + "epoch": 4.310803618946248, + "grad_norm": 0.2974061071872711, + "learning_rate": 9.422029114851403e-09, + "loss": 0.1619, + "step": 16200 + }, + { + "epoch": 4.311069717935072, + "grad_norm": 0.257464200258255, + "learning_rate": 9.414873937478263e-09, + "loss": 0.1634, + "step": 16201 + }, + { + "epoch": 4.311335816923895, + "grad_norm": 0.4241461455821991, + "learning_rate": 9.407721343776975e-09, + "loss": 0.1892, + "step": 16202 + }, + { + "epoch": 4.3116019159127195, + "grad_norm": 0.2737126648426056, + "learning_rate": 9.40057133395159e-09, + "loss": 0.1739, + "step": 16203 + }, + { + "epoch": 4.311868014901544, + "grad_norm": 0.36036428809165955, + "learning_rate": 9.393423908206e-09, + "loss": 0.1779, + "step": 16204 + }, + { + "epoch": 4.312134113890367, + "grad_norm": 0.2841172516345978, + "learning_rate": 9.386279066744074e-09, + "loss": 0.1849, + "step": 16205 + }, + { + "epoch": 4.312400212879191, + "grad_norm": 0.38044115900993347, + "learning_rate": 9.379136809769605e-09, + "loss": 0.1824, + "step": 16206 + }, + { + "epoch": 4.312666311868015, + "grad_norm": 0.2824699580669403, + "learning_rate": 9.371997137486332e-09, + "loss": 0.1881, + "step": 16207 + }, + { + "epoch": 4.312932410856838, + "grad_norm": 0.3525492548942566, + "learning_rate": 9.364860050097845e-09, + "loss": 0.1734, + "step": 16208 + }, + { + "epoch": 4.313198509845662, + "grad_norm": 0.40417516231536865, + "learning_rate": 9.357725547807738e-09, + "loss": 0.1875, + "step": 16209 + }, + { + "epoch": 4.313464608834487, + "grad_norm": 0.3610212504863739, + "learning_rate": 9.35059363081948e-09, + "loss": 0.1875, + "step": 16210 + }, + { + "epoch": 4.313730707823311, + "grad_norm": 0.31003451347351074, + "learning_rate": 9.343464299336524e-09, + "loss": 0.172, + "step": 16211 + }, + { + "epoch": 4.313996806812134, + "grad_norm": 0.33694353699684143, + "learning_rate": 9.336337553562145e-09, + "loss": 0.1855, + "step": 16212 + }, + { + "epoch": 4.314262905800958, + "grad_norm": 0.335314005613327, + "learning_rate": 9.329213393699687e-09, + "loss": 0.1784, + "step": 16213 + }, + { + "epoch": 4.314529004789782, + "grad_norm": 0.27078038454055786, + "learning_rate": 9.322091819952305e-09, + "loss": 0.1662, + "step": 16214 + }, + { + "epoch": 4.314795103778605, + "grad_norm": 0.36414387822151184, + "learning_rate": 9.314972832523128e-09, + "loss": 0.2043, + "step": 16215 + }, + { + "epoch": 4.3150612027674295, + "grad_norm": 0.43098151683807373, + "learning_rate": 9.307856431615235e-09, + "loss": 0.1947, + "step": 16216 + }, + { + "epoch": 4.315327301756254, + "grad_norm": 0.2717766761779785, + "learning_rate": 9.300742617431545e-09, + "loss": 0.1635, + "step": 16217 + }, + { + "epoch": 4.315593400745077, + "grad_norm": 0.5044987797737122, + "learning_rate": 9.293631390175005e-09, + "loss": 0.1785, + "step": 16218 + }, + { + "epoch": 4.315859499733901, + "grad_norm": 0.27194300293922424, + "learning_rate": 9.286522750048398e-09, + "loss": 0.1581, + "step": 16219 + }, + { + "epoch": 4.316125598722725, + "grad_norm": 0.3619849681854248, + "learning_rate": 9.279416697254505e-09, + "loss": 0.1682, + "step": 16220 + }, + { + "epoch": 4.316391697711548, + "grad_norm": 0.3533584475517273, + "learning_rate": 9.272313231995999e-09, + "loss": 0.169, + "step": 16221 + }, + { + "epoch": 4.3166577967003725, + "grad_norm": 0.2784755229949951, + "learning_rate": 9.265212354475505e-09, + "loss": 0.1832, + "step": 16222 + }, + { + "epoch": 4.316923895689197, + "grad_norm": 0.27538472414016724, + "learning_rate": 9.258114064895528e-09, + "loss": 0.1669, + "step": 16223 + }, + { + "epoch": 4.31718999467802, + "grad_norm": 0.29923632740974426, + "learning_rate": 9.251018363458541e-09, + "loss": 0.1823, + "step": 16224 + }, + { + "epoch": 4.317456093666844, + "grad_norm": 0.4420282244682312, + "learning_rate": 9.243925250366903e-09, + "loss": 0.1892, + "step": 16225 + }, + { + "epoch": 4.317722192655668, + "grad_norm": 0.34137922525405884, + "learning_rate": 9.236834725822951e-09, + "loss": 0.1688, + "step": 16226 + }, + { + "epoch": 4.317988291644491, + "grad_norm": 0.29544854164123535, + "learning_rate": 9.229746790028903e-09, + "loss": 0.1698, + "step": 16227 + }, + { + "epoch": 4.3182543906333155, + "grad_norm": 0.26553982496261597, + "learning_rate": 9.222661443186951e-09, + "loss": 0.1718, + "step": 16228 + }, + { + "epoch": 4.31852048962214, + "grad_norm": 0.2716725468635559, + "learning_rate": 9.215578685499148e-09, + "loss": 0.1614, + "step": 16229 + }, + { + "epoch": 4.318786588610963, + "grad_norm": 0.25146710872650146, + "learning_rate": 9.20849851716754e-09, + "loss": 0.1642, + "step": 16230 + }, + { + "epoch": 4.319052687599787, + "grad_norm": 0.39204639196395874, + "learning_rate": 9.201420938394034e-09, + "loss": 0.1915, + "step": 16231 + }, + { + "epoch": 4.319318786588611, + "grad_norm": 0.39044108986854553, + "learning_rate": 9.194345949380532e-09, + "loss": 0.174, + "step": 16232 + }, + { + "epoch": 4.319584885577434, + "grad_norm": 0.2945111393928528, + "learning_rate": 9.1872735503288e-09, + "loss": 0.1847, + "step": 16233 + }, + { + "epoch": 4.3198509845662585, + "grad_norm": 0.6100487112998962, + "learning_rate": 9.180203741440551e-09, + "loss": 0.1875, + "step": 16234 + }, + { + "epoch": 4.320117083555083, + "grad_norm": 0.27814263105392456, + "learning_rate": 9.173136522917457e-09, + "loss": 0.1739, + "step": 16235 + }, + { + "epoch": 4.320383182543907, + "grad_norm": 0.3704608380794525, + "learning_rate": 9.16607189496107e-09, + "loss": 0.1766, + "step": 16236 + }, + { + "epoch": 4.32064928153273, + "grad_norm": 0.2902313768863678, + "learning_rate": 9.159009857772925e-09, + "loss": 0.1688, + "step": 16237 + }, + { + "epoch": 4.320915380521554, + "grad_norm": 0.27123165130615234, + "learning_rate": 9.151950411554387e-09, + "loss": 0.1815, + "step": 16238 + }, + { + "epoch": 4.321181479510378, + "grad_norm": 0.2935780882835388, + "learning_rate": 9.144893556506861e-09, + "loss": 0.191, + "step": 16239 + }, + { + "epoch": 4.3214475784992015, + "grad_norm": 0.2760672867298126, + "learning_rate": 9.137839292831572e-09, + "loss": 0.1737, + "step": 16240 + }, + { + "epoch": 4.321713677488026, + "grad_norm": 0.3700399696826935, + "learning_rate": 9.130787620729753e-09, + "loss": 0.1765, + "step": 16241 + }, + { + "epoch": 4.32197977647685, + "grad_norm": 0.24360330402851105, + "learning_rate": 9.123738540402526e-09, + "loss": 0.1444, + "step": 16242 + }, + { + "epoch": 4.322245875465673, + "grad_norm": 0.28452861309051514, + "learning_rate": 9.116692052050966e-09, + "loss": 0.1601, + "step": 16243 + }, + { + "epoch": 4.322511974454497, + "grad_norm": 0.3577418625354767, + "learning_rate": 9.109648155876005e-09, + "loss": 0.1912, + "step": 16244 + }, + { + "epoch": 4.322778073443321, + "grad_norm": 0.29066482186317444, + "learning_rate": 9.102606852078597e-09, + "loss": 0.1788, + "step": 16245 + }, + { + "epoch": 4.3230441724321444, + "grad_norm": 0.28132832050323486, + "learning_rate": 9.095568140859545e-09, + "loss": 0.1657, + "step": 16246 + }, + { + "epoch": 4.323310271420969, + "grad_norm": 0.2795443534851074, + "learning_rate": 9.088532022419626e-09, + "loss": 0.1773, + "step": 16247 + }, + { + "epoch": 4.323576370409793, + "grad_norm": 0.26277458667755127, + "learning_rate": 9.081498496959506e-09, + "loss": 0.157, + "step": 16248 + }, + { + "epoch": 4.323842469398616, + "grad_norm": 0.281564861536026, + "learning_rate": 9.074467564679789e-09, + "loss": 0.1733, + "step": 16249 + }, + { + "epoch": 4.32410856838744, + "grad_norm": 0.5773699283599854, + "learning_rate": 9.067439225781038e-09, + "loss": 0.1789, + "step": 16250 + }, + { + "epoch": 4.324374667376264, + "grad_norm": 0.2850068509578705, + "learning_rate": 9.060413480463714e-09, + "loss": 0.1841, + "step": 16251 + }, + { + "epoch": 4.324640766365087, + "grad_norm": 0.275190532207489, + "learning_rate": 9.05339032892819e-09, + "loss": 0.1726, + "step": 16252 + }, + { + "epoch": 4.3249068653539116, + "grad_norm": 0.37091779708862305, + "learning_rate": 9.046369771374774e-09, + "loss": 0.1907, + "step": 16253 + }, + { + "epoch": 4.325172964342736, + "grad_norm": 0.2834274172782898, + "learning_rate": 9.039351808003737e-09, + "loss": 0.1928, + "step": 16254 + }, + { + "epoch": 4.32543906333156, + "grad_norm": 0.2641501724720001, + "learning_rate": 9.03233643901522e-09, + "loss": 0.1631, + "step": 16255 + }, + { + "epoch": 4.325705162320383, + "grad_norm": 0.27435508370399475, + "learning_rate": 9.02532366460932e-09, + "loss": 0.1719, + "step": 16256 + }, + { + "epoch": 4.325971261309207, + "grad_norm": 0.2705192267894745, + "learning_rate": 9.01831348498605e-09, + "loss": 0.1768, + "step": 16257 + }, + { + "epoch": 4.32623736029803, + "grad_norm": 0.37111279368400574, + "learning_rate": 9.01130590034539e-09, + "loss": 0.1796, + "step": 16258 + }, + { + "epoch": 4.3265034592868545, + "grad_norm": 0.26830899715423584, + "learning_rate": 9.004300910887153e-09, + "loss": 0.185, + "step": 16259 + }, + { + "epoch": 4.326769558275679, + "grad_norm": 0.2780144512653351, + "learning_rate": 8.997298516811192e-09, + "loss": 0.1816, + "step": 16260 + }, + { + "epoch": 4.327035657264503, + "grad_norm": 0.28327855467796326, + "learning_rate": 8.99029871831719e-09, + "loss": 0.1767, + "step": 16261 + }, + { + "epoch": 4.327301756253326, + "grad_norm": 0.3169264793395996, + "learning_rate": 8.983301515604824e-09, + "loss": 0.1708, + "step": 16262 + }, + { + "epoch": 4.32756785524215, + "grad_norm": 0.27558138966560364, + "learning_rate": 8.97630690887362e-09, + "loss": 0.1719, + "step": 16263 + }, + { + "epoch": 4.327833954230974, + "grad_norm": 0.2517476975917816, + "learning_rate": 8.969314898323143e-09, + "loss": 0.165, + "step": 16264 + }, + { + "epoch": 4.3281000532197975, + "grad_norm": 0.27744394540786743, + "learning_rate": 8.962325484152767e-09, + "loss": 0.1723, + "step": 16265 + }, + { + "epoch": 4.328366152208622, + "grad_norm": 0.3191203773021698, + "learning_rate": 8.955338666561896e-09, + "loss": 0.1767, + "step": 16266 + }, + { + "epoch": 4.328632251197446, + "grad_norm": 0.4060564935207367, + "learning_rate": 8.94835444574975e-09, + "loss": 0.1695, + "step": 16267 + }, + { + "epoch": 4.328898350186269, + "grad_norm": 0.43341630697250366, + "learning_rate": 8.941372821915583e-09, + "loss": 0.1728, + "step": 16268 + }, + { + "epoch": 4.329164449175093, + "grad_norm": 0.2827399969100952, + "learning_rate": 8.934393795258477e-09, + "loss": 0.171, + "step": 16269 + }, + { + "epoch": 4.329430548163917, + "grad_norm": 0.29598936438560486, + "learning_rate": 8.92741736597753e-09, + "loss": 0.1869, + "step": 16270 + }, + { + "epoch": 4.3296966471527405, + "grad_norm": 0.2748754322528839, + "learning_rate": 8.920443534271693e-09, + "loss": 0.1654, + "step": 16271 + }, + { + "epoch": 4.329962746141565, + "grad_norm": 0.27310627698898315, + "learning_rate": 8.913472300339897e-09, + "loss": 0.1687, + "step": 16272 + }, + { + "epoch": 4.330228845130389, + "grad_norm": 0.4469538629055023, + "learning_rate": 8.906503664380981e-09, + "loss": 0.2024, + "step": 16273 + }, + { + "epoch": 4.330494944119212, + "grad_norm": 0.3423615097999573, + "learning_rate": 8.899537626593678e-09, + "loss": 0.179, + "step": 16274 + }, + { + "epoch": 4.330761043108036, + "grad_norm": 0.29600754380226135, + "learning_rate": 8.892574187176704e-09, + "loss": 0.1744, + "step": 16275 + }, + { + "epoch": 4.33102714209686, + "grad_norm": 0.2842199504375458, + "learning_rate": 8.885613346328636e-09, + "loss": 0.1816, + "step": 16276 + }, + { + "epoch": 4.3312932410856835, + "grad_norm": 0.3402622640132904, + "learning_rate": 8.878655104248023e-09, + "loss": 0.1775, + "step": 16277 + }, + { + "epoch": 4.331559340074508, + "grad_norm": 0.27842849493026733, + "learning_rate": 8.871699461133342e-09, + "loss": 0.1612, + "step": 16278 + }, + { + "epoch": 4.331825439063332, + "grad_norm": 0.31754544377326965, + "learning_rate": 8.864746417182977e-09, + "loss": 0.1746, + "step": 16279 + }, + { + "epoch": 4.332091538052156, + "grad_norm": 0.35233238339424133, + "learning_rate": 8.857795972595238e-09, + "loss": 0.184, + "step": 16280 + }, + { + "epoch": 4.332357637040979, + "grad_norm": 0.3937876224517822, + "learning_rate": 8.850848127568379e-09, + "loss": 0.1918, + "step": 16281 + }, + { + "epoch": 4.332623736029803, + "grad_norm": 0.28815966844558716, + "learning_rate": 8.843902882300535e-09, + "loss": 0.1689, + "step": 16282 + }, + { + "epoch": 4.332889835018627, + "grad_norm": 0.2816939949989319, + "learning_rate": 8.836960236989843e-09, + "loss": 0.1556, + "step": 16283 + }, + { + "epoch": 4.333155934007451, + "grad_norm": 0.4423615038394928, + "learning_rate": 8.830020191834275e-09, + "loss": 0.1962, + "step": 16284 + }, + { + "epoch": 4.333422032996275, + "grad_norm": 0.27950385212898254, + "learning_rate": 8.823082747031808e-09, + "loss": 0.1712, + "step": 16285 + }, + { + "epoch": 4.333688131985099, + "grad_norm": 0.2881005108356476, + "learning_rate": 8.816147902780292e-09, + "loss": 0.1671, + "step": 16286 + }, + { + "epoch": 4.333954230973922, + "grad_norm": 0.2887134552001953, + "learning_rate": 8.809215659277547e-09, + "loss": 0.1667, + "step": 16287 + }, + { + "epoch": 4.334220329962746, + "grad_norm": 0.27242612838745117, + "learning_rate": 8.802286016721261e-09, + "loss": 0.1844, + "step": 16288 + }, + { + "epoch": 4.33448642895157, + "grad_norm": 0.2845108211040497, + "learning_rate": 8.795358975309108e-09, + "loss": 0.1903, + "step": 16289 + }, + { + "epoch": 4.334752527940394, + "grad_norm": 0.2909080386161804, + "learning_rate": 8.788434535238665e-09, + "loss": 0.187, + "step": 16290 + }, + { + "epoch": 4.335018626929218, + "grad_norm": 0.4586591124534607, + "learning_rate": 8.781512696707405e-09, + "loss": 0.1664, + "step": 16291 + }, + { + "epoch": 4.335284725918042, + "grad_norm": 0.4107954800128937, + "learning_rate": 8.774593459912771e-09, + "loss": 0.1781, + "step": 16292 + }, + { + "epoch": 4.335550824906865, + "grad_norm": 0.2729905843734741, + "learning_rate": 8.767676825052105e-09, + "loss": 0.1625, + "step": 16293 + }, + { + "epoch": 4.335816923895689, + "grad_norm": 0.32047605514526367, + "learning_rate": 8.760762792322719e-09, + "loss": 0.1774, + "step": 16294 + }, + { + "epoch": 4.336083022884513, + "grad_norm": 0.269726037979126, + "learning_rate": 8.753851361921749e-09, + "loss": 0.1883, + "step": 16295 + }, + { + "epoch": 4.3363491218733365, + "grad_norm": 0.3443053662776947, + "learning_rate": 8.746942534046387e-09, + "loss": 0.1829, + "step": 16296 + }, + { + "epoch": 4.336615220862161, + "grad_norm": 0.39440739154815674, + "learning_rate": 8.740036308893638e-09, + "loss": 0.1719, + "step": 16297 + }, + { + "epoch": 4.336881319850985, + "grad_norm": 0.38707807660102844, + "learning_rate": 8.733132686660527e-09, + "loss": 0.1942, + "step": 16298 + }, + { + "epoch": 4.337147418839808, + "grad_norm": 0.2758655846118927, + "learning_rate": 8.726231667543915e-09, + "loss": 0.1647, + "step": 16299 + }, + { + "epoch": 4.337413517828632, + "grad_norm": 0.26080378890037537, + "learning_rate": 8.719333251740646e-09, + "loss": 0.1723, + "step": 16300 + }, + { + "epoch": 4.337679616817456, + "grad_norm": 0.2647104263305664, + "learning_rate": 8.712437439447483e-09, + "loss": 0.1534, + "step": 16301 + }, + { + "epoch": 4.3379457158062795, + "grad_norm": 0.32557451725006104, + "learning_rate": 8.70554423086114e-09, + "loss": 0.1791, + "step": 16302 + }, + { + "epoch": 4.338211814795104, + "grad_norm": 0.41259342432022095, + "learning_rate": 8.698653626178165e-09, + "loss": 0.1747, + "step": 16303 + }, + { + "epoch": 4.338477913783928, + "grad_norm": 0.27228614687919617, + "learning_rate": 8.69176562559515e-09, + "loss": 0.1656, + "step": 16304 + }, + { + "epoch": 4.338744012772752, + "grad_norm": 0.44156622886657715, + "learning_rate": 8.6848802293085e-09, + "loss": 0.1872, + "step": 16305 + }, + { + "epoch": 4.339010111761575, + "grad_norm": 0.3691898286342621, + "learning_rate": 8.677997437514629e-09, + "loss": 0.1998, + "step": 16306 + }, + { + "epoch": 4.339276210750399, + "grad_norm": 0.3026661276817322, + "learning_rate": 8.671117250409844e-09, + "loss": 0.1786, + "step": 16307 + }, + { + "epoch": 4.339542309739223, + "grad_norm": 0.37445998191833496, + "learning_rate": 8.664239668190387e-09, + "loss": 0.1854, + "step": 16308 + }, + { + "epoch": 4.339808408728047, + "grad_norm": 0.29350775480270386, + "learning_rate": 8.657364691052438e-09, + "loss": 0.177, + "step": 16309 + }, + { + "epoch": 4.340074507716871, + "grad_norm": 0.2616989016532898, + "learning_rate": 8.650492319192049e-09, + "loss": 0.1693, + "step": 16310 + }, + { + "epoch": 4.340340606705695, + "grad_norm": 0.2853144407272339, + "learning_rate": 8.643622552805263e-09, + "loss": 0.1799, + "step": 16311 + }, + { + "epoch": 4.340606705694518, + "grad_norm": 0.29968932271003723, + "learning_rate": 8.63675539208799e-09, + "loss": 0.1763, + "step": 16312 + }, + { + "epoch": 4.340872804683342, + "grad_norm": 0.27294138073921204, + "learning_rate": 8.629890837236131e-09, + "loss": 0.1886, + "step": 16313 + }, + { + "epoch": 4.341138903672166, + "grad_norm": 0.33329424262046814, + "learning_rate": 8.623028888445417e-09, + "loss": 0.1941, + "step": 16314 + }, + { + "epoch": 4.34140500266099, + "grad_norm": 0.34930112957954407, + "learning_rate": 8.616169545911656e-09, + "loss": 0.1771, + "step": 16315 + }, + { + "epoch": 4.341671101649814, + "grad_norm": 0.3597320318222046, + "learning_rate": 8.609312809830405e-09, + "loss": 0.172, + "step": 16316 + }, + { + "epoch": 4.341937200638638, + "grad_norm": 0.35454559326171875, + "learning_rate": 8.602458680397296e-09, + "loss": 0.1642, + "step": 16317 + }, + { + "epoch": 4.342203299627461, + "grad_norm": 0.2756431996822357, + "learning_rate": 8.595607157807772e-09, + "loss": 0.1621, + "step": 16318 + }, + { + "epoch": 4.342469398616285, + "grad_norm": 0.28381776809692383, + "learning_rate": 8.5887582422573e-09, + "loss": 0.168, + "step": 16319 + }, + { + "epoch": 4.342735497605109, + "grad_norm": 0.3513348400592804, + "learning_rate": 8.581911933941166e-09, + "loss": 0.2042, + "step": 16320 + }, + { + "epoch": 4.343001596593933, + "grad_norm": 0.27930590510368347, + "learning_rate": 8.575068233054694e-09, + "loss": 0.1668, + "step": 16321 + }, + { + "epoch": 4.343267695582757, + "grad_norm": 0.2849315404891968, + "learning_rate": 8.56822713979305e-09, + "loss": 0.1868, + "step": 16322 + }, + { + "epoch": 4.343533794571581, + "grad_norm": 0.3188376724720001, + "learning_rate": 8.561388654351376e-09, + "loss": 0.1768, + "step": 16323 + }, + { + "epoch": 4.343799893560404, + "grad_norm": 0.31168439984321594, + "learning_rate": 8.554552776924706e-09, + "loss": 0.1954, + "step": 16324 + }, + { + "epoch": 4.344065992549228, + "grad_norm": 0.35546183586120605, + "learning_rate": 8.547719507708006e-09, + "loss": 0.1803, + "step": 16325 + }, + { + "epoch": 4.344332091538052, + "grad_norm": 0.3221019208431244, + "learning_rate": 8.54088884689621e-09, + "loss": 0.1911, + "step": 16326 + }, + { + "epoch": 4.344598190526876, + "grad_norm": 0.2828027606010437, + "learning_rate": 8.534060794684094e-09, + "loss": 0.1604, + "step": 16327 + }, + { + "epoch": 4.3448642895157, + "grad_norm": 0.34482884407043457, + "learning_rate": 8.527235351266437e-09, + "loss": 0.1616, + "step": 16328 + }, + { + "epoch": 4.345130388504524, + "grad_norm": 0.2926430106163025, + "learning_rate": 8.520412516837915e-09, + "loss": 0.1826, + "step": 16329 + }, + { + "epoch": 4.345396487493348, + "grad_norm": 0.4714832901954651, + "learning_rate": 8.51359229159313e-09, + "loss": 0.1948, + "step": 16330 + }, + { + "epoch": 4.345662586482171, + "grad_norm": 0.32707279920578003, + "learning_rate": 8.50677467572659e-09, + "loss": 0.1843, + "step": 16331 + }, + { + "epoch": 4.345928685470995, + "grad_norm": 0.2622414827346802, + "learning_rate": 8.499959669432777e-09, + "loss": 0.1833, + "step": 16332 + }, + { + "epoch": 4.346194784459819, + "grad_norm": 0.25529608130455017, + "learning_rate": 8.493147272906031e-09, + "loss": 0.1718, + "step": 16333 + }, + { + "epoch": 4.346460883448643, + "grad_norm": 0.27572402358055115, + "learning_rate": 8.4863374863407e-09, + "loss": 0.1857, + "step": 16334 + }, + { + "epoch": 4.346726982437467, + "grad_norm": 0.29316946864128113, + "learning_rate": 8.47953030993096e-09, + "loss": 0.1618, + "step": 16335 + }, + { + "epoch": 4.346993081426291, + "grad_norm": 0.2712320387363434, + "learning_rate": 8.472725743871e-09, + "loss": 0.1582, + "step": 16336 + }, + { + "epoch": 4.347259180415114, + "grad_norm": 0.2899205684661865, + "learning_rate": 8.4659237883549e-09, + "loss": 0.1713, + "step": 16337 + }, + { + "epoch": 4.347525279403938, + "grad_norm": 0.3027266263961792, + "learning_rate": 8.45912444357667e-09, + "loss": 0.1901, + "step": 16338 + }, + { + "epoch": 4.347791378392762, + "grad_norm": 0.24872706830501556, + "learning_rate": 8.452327709730212e-09, + "loss": 0.1541, + "step": 16339 + }, + { + "epoch": 4.348057477381586, + "grad_norm": 0.28437334299087524, + "learning_rate": 8.445533587009423e-09, + "loss": 0.185, + "step": 16340 + }, + { + "epoch": 4.34832357637041, + "grad_norm": 0.25321727991104126, + "learning_rate": 8.438742075608041e-09, + "loss": 0.1541, + "step": 16341 + }, + { + "epoch": 4.348589675359234, + "grad_norm": 0.260646790266037, + "learning_rate": 8.431953175719796e-09, + "loss": 0.1703, + "step": 16342 + }, + { + "epoch": 4.348855774348057, + "grad_norm": 0.27386730909347534, + "learning_rate": 8.425166887538327e-09, + "loss": 0.1901, + "step": 16343 + }, + { + "epoch": 4.349121873336881, + "grad_norm": 0.39013490080833435, + "learning_rate": 8.418383211257185e-09, + "loss": 0.1714, + "step": 16344 + }, + { + "epoch": 4.349387972325705, + "grad_norm": 0.27668896317481995, + "learning_rate": 8.411602147069886e-09, + "loss": 0.1707, + "step": 16345 + }, + { + "epoch": 4.349654071314529, + "grad_norm": 0.36838778853416443, + "learning_rate": 8.404823695169771e-09, + "loss": 0.2067, + "step": 16346 + }, + { + "epoch": 4.349920170303353, + "grad_norm": 0.377760112285614, + "learning_rate": 8.398047855750245e-09, + "loss": 0.1749, + "step": 16347 + }, + { + "epoch": 4.350186269292177, + "grad_norm": 0.379658043384552, + "learning_rate": 8.39127462900452e-09, + "loss": 0.1805, + "step": 16348 + }, + { + "epoch": 4.350452368281, + "grad_norm": 0.2958917021751404, + "learning_rate": 8.384504015125816e-09, + "loss": 0.18, + "step": 16349 + }, + { + "epoch": 4.350718467269824, + "grad_norm": 0.34432336688041687, + "learning_rate": 8.377736014307213e-09, + "loss": 0.191, + "step": 16350 + }, + { + "epoch": 4.350984566258648, + "grad_norm": 0.2794024646282196, + "learning_rate": 8.37097062674177e-09, + "loss": 0.1631, + "step": 16351 + }, + { + "epoch": 4.351250665247472, + "grad_norm": 0.2685023546218872, + "learning_rate": 8.36420785262243e-09, + "loss": 0.1615, + "step": 16352 + }, + { + "epoch": 4.351516764236296, + "grad_norm": 0.28673094511032104, + "learning_rate": 8.357447692142117e-09, + "loss": 0.1751, + "step": 16353 + }, + { + "epoch": 4.35178286322512, + "grad_norm": 0.35950595140457153, + "learning_rate": 8.35069014549361e-09, + "loss": 0.2064, + "step": 16354 + }, + { + "epoch": 4.352048962213944, + "grad_norm": 0.27974626421928406, + "learning_rate": 8.343935212869679e-09, + "loss": 0.1638, + "step": 16355 + }, + { + "epoch": 4.352315061202767, + "grad_norm": 0.28196731209754944, + "learning_rate": 8.337182894462946e-09, + "loss": 0.1739, + "step": 16356 + }, + { + "epoch": 4.352581160191591, + "grad_norm": 0.28763899207115173, + "learning_rate": 8.330433190466024e-09, + "loss": 0.1811, + "step": 16357 + }, + { + "epoch": 4.3528472591804155, + "grad_norm": 0.27946120500564575, + "learning_rate": 8.323686101071437e-09, + "loss": 0.1733, + "step": 16358 + }, + { + "epoch": 4.353113358169239, + "grad_norm": 0.3641919493675232, + "learning_rate": 8.31694162647163e-09, + "loss": 0.1681, + "step": 16359 + }, + { + "epoch": 4.353379457158063, + "grad_norm": 0.41186872124671936, + "learning_rate": 8.310199766858938e-09, + "loss": 0.1699, + "step": 16360 + }, + { + "epoch": 4.353645556146887, + "grad_norm": 0.2702215015888214, + "learning_rate": 8.303460522425677e-09, + "loss": 0.19, + "step": 16361 + }, + { + "epoch": 4.35391165513571, + "grad_norm": 0.3733658790588379, + "learning_rate": 8.296723893364065e-09, + "loss": 0.1772, + "step": 16362 + }, + { + "epoch": 4.354177754124534, + "grad_norm": 0.27971285581588745, + "learning_rate": 8.289989879866222e-09, + "loss": 0.1888, + "step": 16363 + }, + { + "epoch": 4.3544438531133585, + "grad_norm": 0.30787399411201477, + "learning_rate": 8.283258482124257e-09, + "loss": 0.2061, + "step": 16364 + }, + { + "epoch": 4.354709952102182, + "grad_norm": 0.28785932064056396, + "learning_rate": 8.276529700330104e-09, + "loss": 0.1838, + "step": 16365 + }, + { + "epoch": 4.354976051091006, + "grad_norm": 0.278104305267334, + "learning_rate": 8.269803534675735e-09, + "loss": 0.1776, + "step": 16366 + }, + { + "epoch": 4.35524215007983, + "grad_norm": 0.28192347288131714, + "learning_rate": 8.263079985352972e-09, + "loss": 0.1784, + "step": 16367 + }, + { + "epoch": 4.355508249068653, + "grad_norm": 0.28923848271369934, + "learning_rate": 8.256359052553597e-09, + "loss": 0.1902, + "step": 16368 + }, + { + "epoch": 4.355774348057477, + "grad_norm": 0.5036178231239319, + "learning_rate": 8.249640736469277e-09, + "loss": 0.1569, + "step": 16369 + }, + { + "epoch": 4.3560404470463014, + "grad_norm": 0.34789031744003296, + "learning_rate": 8.24292503729167e-09, + "loss": 0.2015, + "step": 16370 + }, + { + "epoch": 4.356306546035125, + "grad_norm": 0.2852506935596466, + "learning_rate": 8.23621195521227e-09, + "loss": 0.1755, + "step": 16371 + }, + { + "epoch": 4.356572645023949, + "grad_norm": 0.2486182153224945, + "learning_rate": 8.229501490422585e-09, + "loss": 0.1554, + "step": 16372 + }, + { + "epoch": 4.356838744012773, + "grad_norm": 0.29580169916152954, + "learning_rate": 8.222793643114012e-09, + "loss": 0.178, + "step": 16373 + }, + { + "epoch": 4.357104843001597, + "grad_norm": 0.3543327748775482, + "learning_rate": 8.216088413477873e-09, + "loss": 0.1718, + "step": 16374 + }, + { + "epoch": 4.35737094199042, + "grad_norm": 0.2659834623336792, + "learning_rate": 8.209385801705393e-09, + "loss": 0.1682, + "step": 16375 + }, + { + "epoch": 4.357637040979244, + "grad_norm": 0.35652846097946167, + "learning_rate": 8.202685807987775e-09, + "loss": 0.1857, + "step": 16376 + }, + { + "epoch": 4.357903139968068, + "grad_norm": 0.2700451612472534, + "learning_rate": 8.195988432516077e-09, + "loss": 0.178, + "step": 16377 + }, + { + "epoch": 4.358169238956892, + "grad_norm": 0.26215195655822754, + "learning_rate": 8.189293675481345e-09, + "loss": 0.1678, + "step": 16378 + }, + { + "epoch": 4.358435337945716, + "grad_norm": 0.29411908984184265, + "learning_rate": 8.18260153707453e-09, + "loss": 0.1971, + "step": 16379 + }, + { + "epoch": 4.35870143693454, + "grad_norm": 0.2608281970024109, + "learning_rate": 8.175912017486497e-09, + "loss": 0.1569, + "step": 16380 + }, + { + "epoch": 4.358967535923363, + "grad_norm": 0.26885539293289185, + "learning_rate": 8.169225116908074e-09, + "loss": 0.1782, + "step": 16381 + }, + { + "epoch": 4.359233634912187, + "grad_norm": 0.41442394256591797, + "learning_rate": 8.16254083552993e-09, + "loss": 0.1825, + "step": 16382 + }, + { + "epoch": 4.3594997339010115, + "grad_norm": 0.27305325865745544, + "learning_rate": 8.155859173542767e-09, + "loss": 0.1742, + "step": 16383 + }, + { + "epoch": 4.359765832889835, + "grad_norm": 0.28375181555747986, + "learning_rate": 8.149180131137123e-09, + "loss": 0.1745, + "step": 16384 + }, + { + "epoch": 4.360031931878659, + "grad_norm": 0.27104461193084717, + "learning_rate": 8.142503708503524e-09, + "loss": 0.1779, + "step": 16385 + }, + { + "epoch": 4.360298030867483, + "grad_norm": 0.27366194128990173, + "learning_rate": 8.135829905832369e-09, + "loss": 0.1672, + "step": 16386 + }, + { + "epoch": 4.360564129856306, + "grad_norm": 0.2506803572177887, + "learning_rate": 8.129158723314012e-09, + "loss": 0.1603, + "step": 16387 + }, + { + "epoch": 4.36083022884513, + "grad_norm": 0.3075013756752014, + "learning_rate": 8.122490161138751e-09, + "loss": 0.1773, + "step": 16388 + }, + { + "epoch": 4.3610963278339545, + "grad_norm": 0.27403607964515686, + "learning_rate": 8.115824219496781e-09, + "loss": 0.1712, + "step": 16389 + }, + { + "epoch": 4.361362426822778, + "grad_norm": 0.3434012532234192, + "learning_rate": 8.109160898578216e-09, + "loss": 0.1714, + "step": 16390 + }, + { + "epoch": 4.361628525811602, + "grad_norm": 0.26902925968170166, + "learning_rate": 8.102500198573126e-09, + "loss": 0.1751, + "step": 16391 + }, + { + "epoch": 4.361894624800426, + "grad_norm": 0.39004436135292053, + "learning_rate": 8.09584211967146e-09, + "loss": 0.1879, + "step": 16392 + }, + { + "epoch": 4.362160723789249, + "grad_norm": 0.2939203679561615, + "learning_rate": 8.08918666206313e-09, + "loss": 0.1934, + "step": 16393 + }, + { + "epoch": 4.362426822778073, + "grad_norm": 0.2846727669239044, + "learning_rate": 8.082533825937964e-09, + "loss": 0.2047, + "step": 16394 + }, + { + "epoch": 4.3626929217668975, + "grad_norm": 0.2609437108039856, + "learning_rate": 8.075883611485745e-09, + "loss": 0.1555, + "step": 16395 + }, + { + "epoch": 4.362959020755721, + "grad_norm": 0.2607724964618683, + "learning_rate": 8.069236018896097e-09, + "loss": 0.1584, + "step": 16396 + }, + { + "epoch": 4.363225119744545, + "grad_norm": 0.27151015400886536, + "learning_rate": 8.06259104835868e-09, + "loss": 0.184, + "step": 16397 + }, + { + "epoch": 4.363491218733369, + "grad_norm": 0.7327278256416321, + "learning_rate": 8.055948700062965e-09, + "loss": 0.1984, + "step": 16398 + }, + { + "epoch": 4.363757317722193, + "grad_norm": 0.2909834384918213, + "learning_rate": 8.049308974198442e-09, + "loss": 0.186, + "step": 16399 + }, + { + "epoch": 4.364023416711016, + "grad_norm": 0.2709267735481262, + "learning_rate": 8.042671870954487e-09, + "loss": 0.166, + "step": 16400 + }, + { + "epoch": 4.3642895156998405, + "grad_norm": 0.30121347308158875, + "learning_rate": 8.036037390520378e-09, + "loss": 0.1762, + "step": 16401 + }, + { + "epoch": 4.364555614688665, + "grad_norm": 0.34802159667015076, + "learning_rate": 8.029405533085375e-09, + "loss": 0.1782, + "step": 16402 + }, + { + "epoch": 4.364821713677488, + "grad_norm": 0.3826828598976135, + "learning_rate": 8.022776298838607e-09, + "loss": 0.1858, + "step": 16403 + }, + { + "epoch": 4.365087812666312, + "grad_norm": 0.30903565883636475, + "learning_rate": 8.016149687969188e-09, + "loss": 0.1922, + "step": 16404 + }, + { + "epoch": 4.365353911655136, + "grad_norm": 0.30831852555274963, + "learning_rate": 8.009525700666087e-09, + "loss": 0.1696, + "step": 16405 + }, + { + "epoch": 4.365620010643959, + "grad_norm": 0.3053942918777466, + "learning_rate": 8.002904337118254e-09, + "loss": 0.1695, + "step": 16406 + }, + { + "epoch": 4.3658861096327835, + "grad_norm": 0.34017783403396606, + "learning_rate": 7.99628559751453e-09, + "loss": 0.1746, + "step": 16407 + }, + { + "epoch": 4.366152208621608, + "grad_norm": 0.2787730395793915, + "learning_rate": 7.989669482043692e-09, + "loss": 0.1754, + "step": 16408 + }, + { + "epoch": 4.366418307610431, + "grad_norm": 0.2823874354362488, + "learning_rate": 7.983055990894461e-09, + "loss": 0.1699, + "step": 16409 + }, + { + "epoch": 4.366684406599255, + "grad_norm": 0.2657434046268463, + "learning_rate": 7.976445124255471e-09, + "loss": 0.1627, + "step": 16410 + }, + { + "epoch": 4.366950505588079, + "grad_norm": 0.41780737042427063, + "learning_rate": 7.969836882315251e-09, + "loss": 0.1694, + "step": 16411 + }, + { + "epoch": 4.367216604576902, + "grad_norm": 0.2871538996696472, + "learning_rate": 7.963231265262326e-09, + "loss": 0.1887, + "step": 16412 + }, + { + "epoch": 4.367482703565726, + "grad_norm": 0.2780151069164276, + "learning_rate": 7.956628273285038e-09, + "loss": 0.1799, + "step": 16413 + }, + { + "epoch": 4.367748802554551, + "grad_norm": 0.3245771825313568, + "learning_rate": 7.950027906571789e-09, + "loss": 0.1768, + "step": 16414 + }, + { + "epoch": 4.368014901543374, + "grad_norm": 0.2540195882320404, + "learning_rate": 7.94343016531075e-09, + "loss": 0.1685, + "step": 16415 + }, + { + "epoch": 4.368281000532198, + "grad_norm": 0.2690606117248535, + "learning_rate": 7.936835049690171e-09, + "loss": 0.1652, + "step": 16416 + }, + { + "epoch": 4.368547099521022, + "grad_norm": 0.32814642786979675, + "learning_rate": 7.93024255989816e-09, + "loss": 0.1798, + "step": 16417 + }, + { + "epoch": 4.368813198509845, + "grad_norm": 0.5308439135551453, + "learning_rate": 7.923652696122696e-09, + "loss": 0.1639, + "step": 16418 + }, + { + "epoch": 4.369079297498669, + "grad_norm": 0.3315674960613251, + "learning_rate": 7.917065458551798e-09, + "loss": 0.1853, + "step": 16419 + }, + { + "epoch": 4.3693453964874935, + "grad_norm": 0.3822423815727234, + "learning_rate": 7.91048084737328e-09, + "loss": 0.1881, + "step": 16420 + }, + { + "epoch": 4.369611495476317, + "grad_norm": 0.3494519889354706, + "learning_rate": 7.903898862775016e-09, + "loss": 0.1922, + "step": 16421 + }, + { + "epoch": 4.369877594465141, + "grad_norm": 0.2782149016857147, + "learning_rate": 7.897319504944678e-09, + "loss": 0.1704, + "step": 16422 + }, + { + "epoch": 4.370143693453965, + "grad_norm": 0.31582358479499817, + "learning_rate": 7.890742774069959e-09, + "loss": 0.183, + "step": 16423 + }, + { + "epoch": 4.370409792442789, + "grad_norm": 0.3114619851112366, + "learning_rate": 7.88416867033842e-09, + "loss": 0.1715, + "step": 16424 + }, + { + "epoch": 4.370675891431612, + "grad_norm": 0.2908845841884613, + "learning_rate": 7.877597193937602e-09, + "loss": 0.1907, + "step": 16425 + }, + { + "epoch": 4.3709419904204365, + "grad_norm": 0.2723710834980011, + "learning_rate": 7.871028345054897e-09, + "loss": 0.1567, + "step": 16426 + }, + { + "epoch": 4.371208089409261, + "grad_norm": 0.3394947350025177, + "learning_rate": 7.86446212387769e-09, + "loss": 0.1761, + "step": 16427 + }, + { + "epoch": 4.371474188398084, + "grad_norm": 0.27672192454338074, + "learning_rate": 7.857898530593243e-09, + "loss": 0.1693, + "step": 16428 + }, + { + "epoch": 4.371740287386908, + "grad_norm": 0.26648300886154175, + "learning_rate": 7.851337565388772e-09, + "loss": 0.1539, + "step": 16429 + }, + { + "epoch": 4.372006386375732, + "grad_norm": 0.28766244649887085, + "learning_rate": 7.844779228451404e-09, + "loss": 0.1854, + "step": 16430 + }, + { + "epoch": 4.372272485364555, + "grad_norm": 0.30161571502685547, + "learning_rate": 7.838223519968212e-09, + "loss": 0.1809, + "step": 16431 + }, + { + "epoch": 4.3725385843533795, + "grad_norm": 0.38874489068984985, + "learning_rate": 7.831670440126159e-09, + "loss": 0.1769, + "step": 16432 + }, + { + "epoch": 4.372804683342204, + "grad_norm": 0.2758193016052246, + "learning_rate": 7.825119989112171e-09, + "loss": 0.1706, + "step": 16433 + }, + { + "epoch": 4.373070782331027, + "grad_norm": 0.34700849652290344, + "learning_rate": 7.818572167113057e-09, + "loss": 0.184, + "step": 16434 + }, + { + "epoch": 4.373336881319851, + "grad_norm": 0.3353129029273987, + "learning_rate": 7.812026974315588e-09, + "loss": 0.1712, + "step": 16435 + }, + { + "epoch": 4.373602980308675, + "grad_norm": 0.2901269495487213, + "learning_rate": 7.80548441090646e-09, + "loss": 0.1699, + "step": 16436 + }, + { + "epoch": 4.373869079297498, + "grad_norm": 0.367148220539093, + "learning_rate": 7.798944477072244e-09, + "loss": 0.1895, + "step": 16437 + }, + { + "epoch": 4.3741351782863225, + "grad_norm": 0.37275367975234985, + "learning_rate": 7.792407172999493e-09, + "loss": 0.1777, + "step": 16438 + }, + { + "epoch": 4.374401277275147, + "grad_norm": 0.3461313843727112, + "learning_rate": 7.785872498874669e-09, + "loss": 0.197, + "step": 16439 + }, + { + "epoch": 4.37466737626397, + "grad_norm": 0.2769300043582916, + "learning_rate": 7.779340454884164e-09, + "loss": 0.1542, + "step": 16440 + }, + { + "epoch": 4.374933475252794, + "grad_norm": 0.2765415608882904, + "learning_rate": 7.772811041214256e-09, + "loss": 0.1701, + "step": 16441 + }, + { + "epoch": 4.375199574241618, + "grad_norm": 0.25492334365844727, + "learning_rate": 7.766284258051204e-09, + "loss": 0.1577, + "step": 16442 + }, + { + "epoch": 4.375465673230441, + "grad_norm": 0.28520771861076355, + "learning_rate": 7.759760105581137e-09, + "loss": 0.1914, + "step": 16443 + }, + { + "epoch": 4.3757317722192655, + "grad_norm": 0.37397894263267517, + "learning_rate": 7.75323858399015e-09, + "loss": 0.1727, + "step": 16444 + }, + { + "epoch": 4.37599787120809, + "grad_norm": 0.30079224705696106, + "learning_rate": 7.746719693464254e-09, + "loss": 0.1925, + "step": 16445 + }, + { + "epoch": 4.376263970196913, + "grad_norm": 0.283338338136673, + "learning_rate": 7.740203434189407e-09, + "loss": 0.1831, + "step": 16446 + }, + { + "epoch": 4.376530069185737, + "grad_norm": 0.29214856028556824, + "learning_rate": 7.733689806351407e-09, + "loss": 0.1695, + "step": 16447 + }, + { + "epoch": 4.376796168174561, + "grad_norm": 0.2991253137588501, + "learning_rate": 7.727178810136092e-09, + "loss": 0.1977, + "step": 16448 + }, + { + "epoch": 4.377062267163385, + "grad_norm": 0.29120317101478577, + "learning_rate": 7.720670445729138e-09, + "loss": 0.1699, + "step": 16449 + }, + { + "epoch": 4.377328366152208, + "grad_norm": 0.34827354550361633, + "learning_rate": 7.714164713316185e-09, + "loss": 0.1875, + "step": 16450 + }, + { + "epoch": 4.377594465141033, + "grad_norm": 0.31380152702331543, + "learning_rate": 7.707661613082773e-09, + "loss": 0.1787, + "step": 16451 + }, + { + "epoch": 4.377860564129857, + "grad_norm": 0.3820015490055084, + "learning_rate": 7.701161145214408e-09, + "loss": 0.1703, + "step": 16452 + }, + { + "epoch": 4.37812666311868, + "grad_norm": 0.28327620029449463, + "learning_rate": 7.694663309896477e-09, + "loss": 0.1726, + "step": 16453 + }, + { + "epoch": 4.378392762107504, + "grad_norm": 0.2662612497806549, + "learning_rate": 7.68816810731433e-09, + "loss": 0.1705, + "step": 16454 + }, + { + "epoch": 4.378658861096328, + "grad_norm": 0.2770961821079254, + "learning_rate": 7.681675537653231e-09, + "loss": 0.1665, + "step": 16455 + }, + { + "epoch": 4.378924960085151, + "grad_norm": 0.29096874594688416, + "learning_rate": 7.675185601098321e-09, + "loss": 0.1745, + "step": 16456 + }, + { + "epoch": 4.3791910590739755, + "grad_norm": 0.3005278408527374, + "learning_rate": 7.668698297834753e-09, + "loss": 0.1842, + "step": 16457 + }, + { + "epoch": 4.3794571580628, + "grad_norm": 0.30450135469436646, + "learning_rate": 7.66221362804752e-09, + "loss": 0.1882, + "step": 16458 + }, + { + "epoch": 4.379723257051623, + "grad_norm": 0.2839408218860626, + "learning_rate": 7.6557315919216e-09, + "loss": 0.1753, + "step": 16459 + }, + { + "epoch": 4.379989356040447, + "grad_norm": 0.32079386711120605, + "learning_rate": 7.649252189641853e-09, + "loss": 0.191, + "step": 16460 + }, + { + "epoch": 4.380255455029271, + "grad_norm": 0.2809200882911682, + "learning_rate": 7.642775421393121e-09, + "loss": 0.1761, + "step": 16461 + }, + { + "epoch": 4.380521554018094, + "grad_norm": 0.3703310489654541, + "learning_rate": 7.636301287360103e-09, + "loss": 0.1633, + "step": 16462 + }, + { + "epoch": 4.3807876530069185, + "grad_norm": 0.28294432163238525, + "learning_rate": 7.629829787727482e-09, + "loss": 0.181, + "step": 16463 + }, + { + "epoch": 4.381053751995743, + "grad_norm": 0.25650647282600403, + "learning_rate": 7.62336092267979e-09, + "loss": 0.1523, + "step": 16464 + }, + { + "epoch": 4.381319850984566, + "grad_norm": 0.3172476589679718, + "learning_rate": 7.616894692401588e-09, + "loss": 0.1785, + "step": 16465 + }, + { + "epoch": 4.38158594997339, + "grad_norm": 0.2761670649051666, + "learning_rate": 7.610431097077252e-09, + "loss": 0.1662, + "step": 16466 + }, + { + "epoch": 4.381852048962214, + "grad_norm": 0.26851218938827515, + "learning_rate": 7.603970136891192e-09, + "loss": 0.1625, + "step": 16467 + }, + { + "epoch": 4.382118147951037, + "grad_norm": 0.2660169005393982, + "learning_rate": 7.597511812027646e-09, + "loss": 0.1739, + "step": 16468 + }, + { + "epoch": 4.3823842469398615, + "grad_norm": 0.494803786277771, + "learning_rate": 7.591056122670846e-09, + "loss": 0.1738, + "step": 16469 + }, + { + "epoch": 4.382650345928686, + "grad_norm": 0.2698863446712494, + "learning_rate": 7.584603069004903e-09, + "loss": 0.1838, + "step": 16470 + }, + { + "epoch": 4.382916444917509, + "grad_norm": 0.35875946283340454, + "learning_rate": 7.578152651213876e-09, + "loss": 0.1938, + "step": 16471 + }, + { + "epoch": 4.383182543906333, + "grad_norm": 0.35244476795196533, + "learning_rate": 7.571704869481754e-09, + "loss": 0.1797, + "step": 16472 + }, + { + "epoch": 4.383448642895157, + "grad_norm": 0.2768745422363281, + "learning_rate": 7.565259723992424e-09, + "loss": 0.1639, + "step": 16473 + }, + { + "epoch": 4.383714741883981, + "grad_norm": 0.29831886291503906, + "learning_rate": 7.558817214929725e-09, + "loss": 0.1824, + "step": 16474 + }, + { + "epoch": 4.3839808408728045, + "grad_norm": 0.33657926321029663, + "learning_rate": 7.5523773424774e-09, + "loss": 0.1963, + "step": 16475 + }, + { + "epoch": 4.384246939861629, + "grad_norm": 0.283930242061615, + "learning_rate": 7.545940106819149e-09, + "loss": 0.1868, + "step": 16476 + }, + { + "epoch": 4.384513038850453, + "grad_norm": 0.42302727699279785, + "learning_rate": 7.539505508138555e-09, + "loss": 0.1775, + "step": 16477 + }, + { + "epoch": 4.384779137839276, + "grad_norm": 0.36338508129119873, + "learning_rate": 7.533073546619174e-09, + "loss": 0.1805, + "step": 16478 + }, + { + "epoch": 4.3850452368281, + "grad_norm": 0.30021560192108154, + "learning_rate": 7.526644222444412e-09, + "loss": 0.1796, + "step": 16479 + }, + { + "epoch": 4.385311335816924, + "grad_norm": 0.29770421981811523, + "learning_rate": 7.520217535797669e-09, + "loss": 0.1672, + "step": 16480 + }, + { + "epoch": 4.3855774348057475, + "grad_norm": 0.26973074674606323, + "learning_rate": 7.513793486862252e-09, + "loss": 0.1705, + "step": 16481 + }, + { + "epoch": 4.385843533794572, + "grad_norm": 0.25547078251838684, + "learning_rate": 7.507372075821405e-09, + "loss": 0.1632, + "step": 16482 + }, + { + "epoch": 4.386109632783396, + "grad_norm": 0.26810839772224426, + "learning_rate": 7.500953302858237e-09, + "loss": 0.1745, + "step": 16483 + }, + { + "epoch": 4.386375731772219, + "grad_norm": 0.39860185980796814, + "learning_rate": 7.494537168155867e-09, + "loss": 0.2028, + "step": 16484 + }, + { + "epoch": 4.386641830761043, + "grad_norm": 0.3708235025405884, + "learning_rate": 7.488123671897274e-09, + "loss": 0.1402, + "step": 16485 + }, + { + "epoch": 4.386907929749867, + "grad_norm": 0.4081699252128601, + "learning_rate": 7.481712814265385e-09, + "loss": 0.1826, + "step": 16486 + }, + { + "epoch": 4.3871740287386904, + "grad_norm": 0.27851149439811707, + "learning_rate": 7.475304595443044e-09, + "loss": 0.1723, + "step": 16487 + }, + { + "epoch": 4.387440127727515, + "grad_norm": 0.3052467107772827, + "learning_rate": 7.468899015613039e-09, + "loss": 0.1762, + "step": 16488 + }, + { + "epoch": 4.387706226716339, + "grad_norm": 0.26930493116378784, + "learning_rate": 7.462496074958058e-09, + "loss": 0.1721, + "step": 16489 + }, + { + "epoch": 4.387972325705162, + "grad_norm": 0.25449174642562866, + "learning_rate": 7.456095773660743e-09, + "loss": 0.1765, + "step": 16490 + }, + { + "epoch": 4.388238424693986, + "grad_norm": 0.2525912821292877, + "learning_rate": 7.449698111903646e-09, + "loss": 0.1625, + "step": 16491 + }, + { + "epoch": 4.38850452368281, + "grad_norm": 0.2985861897468567, + "learning_rate": 7.443303089869213e-09, + "loss": 0.1745, + "step": 16492 + }, + { + "epoch": 4.388770622671634, + "grad_norm": 0.3881661891937256, + "learning_rate": 7.436910707739874e-09, + "loss": 0.1796, + "step": 16493 + }, + { + "epoch": 4.3890367216604576, + "grad_norm": 0.3862491846084595, + "learning_rate": 7.4305209656979284e-09, + "loss": 0.2042, + "step": 16494 + }, + { + "epoch": 4.389302820649282, + "grad_norm": 0.27973729372024536, + "learning_rate": 7.42413386392563e-09, + "loss": 0.1777, + "step": 16495 + }, + { + "epoch": 4.389568919638105, + "grad_norm": 0.37685710191726685, + "learning_rate": 7.417749402605167e-09, + "loss": 0.1783, + "step": 16496 + }, + { + "epoch": 4.389835018626929, + "grad_norm": 0.40801483392715454, + "learning_rate": 7.411367581918637e-09, + "loss": 0.188, + "step": 16497 + }, + { + "epoch": 4.390101117615753, + "grad_norm": 0.2950293719768524, + "learning_rate": 7.404988402048029e-09, + "loss": 0.1747, + "step": 16498 + }, + { + "epoch": 4.390367216604577, + "grad_norm": 0.26948702335357666, + "learning_rate": 7.398611863175341e-09, + "loss": 0.1745, + "step": 16499 + }, + { + "epoch": 4.3906333155934005, + "grad_norm": 0.2775484025478363, + "learning_rate": 7.392237965482395e-09, + "loss": 0.1618, + "step": 16500 + }, + { + "epoch": 4.390899414582225, + "grad_norm": 0.346221387386322, + "learning_rate": 7.385866709151034e-09, + "loss": 0.1952, + "step": 16501 + }, + { + "epoch": 4.391165513571049, + "grad_norm": 0.31706342101097107, + "learning_rate": 7.379498094362924e-09, + "loss": 0.1897, + "step": 16502 + }, + { + "epoch": 4.391431612559872, + "grad_norm": 0.531254231929779, + "learning_rate": 7.373132121299752e-09, + "loss": 0.1869, + "step": 16503 + }, + { + "epoch": 4.391697711548696, + "grad_norm": 0.40582534670829773, + "learning_rate": 7.366768790143086e-09, + "loss": 0.1963, + "step": 16504 + }, + { + "epoch": 4.39196381053752, + "grad_norm": 0.2978203296661377, + "learning_rate": 7.360408101074422e-09, + "loss": 0.183, + "step": 16505 + }, + { + "epoch": 4.3922299095263435, + "grad_norm": 0.3706844747066498, + "learning_rate": 7.354050054275152e-09, + "loss": 0.1705, + "step": 16506 + }, + { + "epoch": 4.392496008515168, + "grad_norm": 0.2538646459579468, + "learning_rate": 7.34769464992665e-09, + "loss": 0.177, + "step": 16507 + }, + { + "epoch": 4.392762107503992, + "grad_norm": 0.2976324260234833, + "learning_rate": 7.341341888210184e-09, + "loss": 0.1655, + "step": 16508 + }, + { + "epoch": 4.393028206492815, + "grad_norm": 0.44380059838294983, + "learning_rate": 7.33499176930692e-09, + "loss": 0.1767, + "step": 16509 + }, + { + "epoch": 4.393294305481639, + "grad_norm": 0.2650766670703888, + "learning_rate": 7.328644293398001e-09, + "loss": 0.161, + "step": 16510 + }, + { + "epoch": 4.393560404470463, + "grad_norm": 0.28467804193496704, + "learning_rate": 7.3222994606644604e-09, + "loss": 0.1724, + "step": 16511 + }, + { + "epoch": 4.3938265034592865, + "grad_norm": 0.2592025399208069, + "learning_rate": 7.3159572712872875e-09, + "loss": 0.164, + "step": 16512 + }, + { + "epoch": 4.394092602448111, + "grad_norm": 0.3560122847557068, + "learning_rate": 7.309617725447337e-09, + "loss": 0.165, + "step": 16513 + }, + { + "epoch": 4.394358701436935, + "grad_norm": 0.27174821496009827, + "learning_rate": 7.303280823325464e-09, + "loss": 0.1801, + "step": 16514 + }, + { + "epoch": 4.394624800425758, + "grad_norm": 0.27917391061782837, + "learning_rate": 7.29694656510238e-09, + "loss": 0.1643, + "step": 16515 + }, + { + "epoch": 4.394890899414582, + "grad_norm": 0.26788365840911865, + "learning_rate": 7.290614950958774e-09, + "loss": 0.1649, + "step": 16516 + }, + { + "epoch": 4.395156998403406, + "grad_norm": 0.2760715186595917, + "learning_rate": 7.284285981075189e-09, + "loss": 0.1761, + "step": 16517 + }, + { + "epoch": 4.39542309739223, + "grad_norm": 0.3499169647693634, + "learning_rate": 7.277959655632204e-09, + "loss": 0.1906, + "step": 16518 + }, + { + "epoch": 4.395689196381054, + "grad_norm": 0.46420857310295105, + "learning_rate": 7.27163597481022e-09, + "loss": 0.2048, + "step": 16519 + }, + { + "epoch": 4.395955295369878, + "grad_norm": 0.4150988459587097, + "learning_rate": 7.265314938789624e-09, + "loss": 0.1785, + "step": 16520 + }, + { + "epoch": 4.396221394358702, + "grad_norm": 0.2865709662437439, + "learning_rate": 7.258996547750673e-09, + "loss": 0.1741, + "step": 16521 + }, + { + "epoch": 4.396487493347525, + "grad_norm": 0.3118354082107544, + "learning_rate": 7.252680801873623e-09, + "loss": 0.1769, + "step": 16522 + }, + { + "epoch": 4.396753592336349, + "grad_norm": 0.2921503186225891, + "learning_rate": 7.2463677013385624e-09, + "loss": 0.1698, + "step": 16523 + }, + { + "epoch": 4.397019691325173, + "grad_norm": 1.8420794010162354, + "learning_rate": 7.2400572463255924e-09, + "loss": 0.1739, + "step": 16524 + }, + { + "epoch": 4.397285790313997, + "grad_norm": 0.34040936827659607, + "learning_rate": 7.2337494370146805e-09, + "loss": 0.1673, + "step": 16525 + }, + { + "epoch": 4.397551889302821, + "grad_norm": 0.2569122016429901, + "learning_rate": 7.227444273585737e-09, + "loss": 0.1556, + "step": 16526 + }, + { + "epoch": 4.397817988291645, + "grad_norm": 0.27070707082748413, + "learning_rate": 7.221141756218629e-09, + "loss": 0.1826, + "step": 16527 + }, + { + "epoch": 4.398084087280468, + "grad_norm": 0.27842944860458374, + "learning_rate": 7.214841885093082e-09, + "loss": 0.1727, + "step": 16528 + }, + { + "epoch": 4.398350186269292, + "grad_norm": 0.4018254578113556, + "learning_rate": 7.208544660388804e-09, + "loss": 0.1843, + "step": 16529 + }, + { + "epoch": 4.398616285258116, + "grad_norm": 0.298650860786438, + "learning_rate": 7.202250082285377e-09, + "loss": 0.1851, + "step": 16530 + }, + { + "epoch": 4.39888238424694, + "grad_norm": 0.26989519596099854, + "learning_rate": 7.195958150962345e-09, + "loss": 0.1603, + "step": 16531 + }, + { + "epoch": 4.399148483235764, + "grad_norm": 0.2773780822753906, + "learning_rate": 7.189668866599185e-09, + "loss": 0.1591, + "step": 16532 + }, + { + "epoch": 4.399414582224588, + "grad_norm": 0.2690522372722626, + "learning_rate": 7.183382229375279e-09, + "loss": 0.1718, + "step": 16533 + }, + { + "epoch": 4.399680681213411, + "grad_norm": 0.3159507215023041, + "learning_rate": 7.177098239469914e-09, + "loss": 0.1928, + "step": 16534 + }, + { + "epoch": 4.399946780202235, + "grad_norm": 0.28403574228286743, + "learning_rate": 7.170816897062337e-09, + "loss": 0.1653, + "step": 16535 + }, + { + "epoch": 4.400212879191059, + "grad_norm": 0.4402656555175781, + "learning_rate": 7.164538202331694e-09, + "loss": 0.1877, + "step": 16536 + }, + { + "epoch": 4.4004789781798825, + "grad_norm": 0.40086182951927185, + "learning_rate": 7.1582621554570845e-09, + "loss": 0.1905, + "step": 16537 + }, + { + "epoch": 4.400745077168707, + "grad_norm": 0.34167736768722534, + "learning_rate": 7.1519887566174886e-09, + "loss": 0.1696, + "step": 16538 + }, + { + "epoch": 4.401011176157531, + "grad_norm": 0.2987283766269684, + "learning_rate": 7.145718005991841e-09, + "loss": 0.1733, + "step": 16539 + }, + { + "epoch": 4.401277275146354, + "grad_norm": 0.2649495303630829, + "learning_rate": 7.1394499037590095e-09, + "loss": 0.1582, + "step": 16540 + }, + { + "epoch": 4.401543374135178, + "grad_norm": 0.24991969764232635, + "learning_rate": 7.133184450097795e-09, + "loss": 0.1586, + "step": 16541 + }, + { + "epoch": 4.401809473124002, + "grad_norm": 0.26986178755760193, + "learning_rate": 7.126921645186845e-09, + "loss": 0.1697, + "step": 16542 + }, + { + "epoch": 4.402075572112826, + "grad_norm": 0.27595892548561096, + "learning_rate": 7.120661489204837e-09, + "loss": 0.1889, + "step": 16543 + }, + { + "epoch": 4.40234167110165, + "grad_norm": 0.30053818225860596, + "learning_rate": 7.114403982330286e-09, + "loss": 0.2014, + "step": 16544 + }, + { + "epoch": 4.402607770090474, + "grad_norm": 0.3033386170864105, + "learning_rate": 7.108149124741691e-09, + "loss": 0.2082, + "step": 16545 + }, + { + "epoch": 4.402873869079298, + "grad_norm": 0.28828904032707214, + "learning_rate": 7.101896916617445e-09, + "loss": 0.1742, + "step": 16546 + }, + { + "epoch": 4.403139968068121, + "grad_norm": 0.34904372692108154, + "learning_rate": 7.09564735813587e-09, + "loss": 0.1892, + "step": 16547 + }, + { + "epoch": 4.403406067056945, + "grad_norm": 0.31275850534439087, + "learning_rate": 7.089400449475247e-09, + "loss": 0.185, + "step": 16548 + }, + { + "epoch": 4.403672166045769, + "grad_norm": 0.31033408641815186, + "learning_rate": 7.083156190813711e-09, + "loss": 0.1664, + "step": 16549 + }, + { + "epoch": 4.403938265034593, + "grad_norm": 0.3788079023361206, + "learning_rate": 7.076914582329385e-09, + "loss": 0.1805, + "step": 16550 + }, + { + "epoch": 4.404204364023417, + "grad_norm": 0.3795619308948517, + "learning_rate": 7.070675624200284e-09, + "loss": 0.1686, + "step": 16551 + }, + { + "epoch": 4.404470463012241, + "grad_norm": 0.2572269141674042, + "learning_rate": 7.064439316604354e-09, + "loss": 0.1785, + "step": 16552 + }, + { + "epoch": 4.404736562001064, + "grad_norm": 0.39333608746528625, + "learning_rate": 7.058205659719463e-09, + "loss": 0.1939, + "step": 16553 + }, + { + "epoch": 4.405002660989888, + "grad_norm": 0.31559935212135315, + "learning_rate": 7.051974653723414e-09, + "loss": 0.1671, + "step": 16554 + }, + { + "epoch": 4.405268759978712, + "grad_norm": 0.30708175897598267, + "learning_rate": 7.0457462987939194e-09, + "loss": 0.1803, + "step": 16555 + }, + { + "epoch": 4.405534858967536, + "grad_norm": 0.281588077545166, + "learning_rate": 7.03952059510865e-09, + "loss": 0.1709, + "step": 16556 + }, + { + "epoch": 4.40580095795636, + "grad_norm": 0.4014512002468109, + "learning_rate": 7.0332975428451406e-09, + "loss": 0.1704, + "step": 16557 + }, + { + "epoch": 4.406067056945184, + "grad_norm": 0.301143079996109, + "learning_rate": 7.027077142180926e-09, + "loss": 0.187, + "step": 16558 + }, + { + "epoch": 4.406333155934007, + "grad_norm": 0.3972281217575073, + "learning_rate": 7.020859393293388e-09, + "loss": 0.188, + "step": 16559 + }, + { + "epoch": 4.406599254922831, + "grad_norm": 0.3170052766799927, + "learning_rate": 7.014644296359873e-09, + "loss": 0.1924, + "step": 16560 + }, + { + "epoch": 4.406865353911655, + "grad_norm": 0.26870661973953247, + "learning_rate": 7.0084318515576615e-09, + "loss": 0.1641, + "step": 16561 + }, + { + "epoch": 4.407131452900479, + "grad_norm": 0.3191737234592438, + "learning_rate": 7.002222059063933e-09, + "loss": 0.1785, + "step": 16562 + }, + { + "epoch": 4.407397551889303, + "grad_norm": 0.2889912724494934, + "learning_rate": 6.996014919055837e-09, + "loss": 0.1662, + "step": 16563 + }, + { + "epoch": 4.407663650878127, + "grad_norm": 0.2802044749259949, + "learning_rate": 6.989810431710375e-09, + "loss": 0.166, + "step": 16564 + }, + { + "epoch": 4.40792974986695, + "grad_norm": 0.4170317053794861, + "learning_rate": 6.983608597204538e-09, + "loss": 0.1674, + "step": 16565 + }, + { + "epoch": 4.408195848855774, + "grad_norm": 0.36989402770996094, + "learning_rate": 6.977409415715185e-09, + "loss": 0.1771, + "step": 16566 + }, + { + "epoch": 4.408461947844598, + "grad_norm": 0.2992914617061615, + "learning_rate": 6.9712128874191644e-09, + "loss": 0.1762, + "step": 16567 + }, + { + "epoch": 4.4087280468334225, + "grad_norm": 0.25377553701400757, + "learning_rate": 6.965019012493167e-09, + "loss": 0.1709, + "step": 16568 + }, + { + "epoch": 4.408994145822246, + "grad_norm": 0.2622515559196472, + "learning_rate": 6.958827791113908e-09, + "loss": 0.1799, + "step": 16569 + }, + { + "epoch": 4.40926024481107, + "grad_norm": 0.3634369671344757, + "learning_rate": 6.952639223457934e-09, + "loss": 0.1828, + "step": 16570 + }, + { + "epoch": 4.409526343799894, + "grad_norm": 0.38339418172836304, + "learning_rate": 6.946453309701794e-09, + "loss": 0.1834, + "step": 16571 + }, + { + "epoch": 4.409792442788717, + "grad_norm": 0.2603078782558441, + "learning_rate": 6.9402700500218794e-09, + "loss": 0.1565, + "step": 16572 + }, + { + "epoch": 4.410058541777541, + "grad_norm": 0.39224937558174133, + "learning_rate": 6.934089444594582e-09, + "loss": 0.1926, + "step": 16573 + }, + { + "epoch": 4.410324640766365, + "grad_norm": 0.34496304392814636, + "learning_rate": 6.927911493596161e-09, + "loss": 0.1766, + "step": 16574 + }, + { + "epoch": 4.410590739755189, + "grad_norm": 0.26424264907836914, + "learning_rate": 6.921736197202832e-09, + "loss": 0.1553, + "step": 16575 + }, + { + "epoch": 4.410856838744013, + "grad_norm": 0.29262998700141907, + "learning_rate": 6.915563555590742e-09, + "loss": 0.1658, + "step": 16576 + }, + { + "epoch": 4.411122937732837, + "grad_norm": 0.42791256308555603, + "learning_rate": 6.909393568935939e-09, + "loss": 0.193, + "step": 16577 + }, + { + "epoch": 4.41138903672166, + "grad_norm": 0.27279895544052124, + "learning_rate": 6.903226237414383e-09, + "loss": 0.1801, + "step": 16578 + }, + { + "epoch": 4.411655135710484, + "grad_norm": 0.27259859442710876, + "learning_rate": 6.8970615612020226e-09, + "loss": 0.1725, + "step": 16579 + }, + { + "epoch": 4.411921234699308, + "grad_norm": 0.2806645929813385, + "learning_rate": 6.890899540474637e-09, + "loss": 0.187, + "step": 16580 + }, + { + "epoch": 4.412187333688132, + "grad_norm": 0.3685671389102936, + "learning_rate": 6.8847401754080106e-09, + "loss": 0.1686, + "step": 16581 + }, + { + "epoch": 4.412453432676956, + "grad_norm": 0.2654785215854645, + "learning_rate": 6.878583466177812e-09, + "loss": 0.1616, + "step": 16582 + }, + { + "epoch": 4.41271953166578, + "grad_norm": 0.3247893452644348, + "learning_rate": 6.872429412959635e-09, + "loss": 0.1714, + "step": 16583 + }, + { + "epoch": 4.412985630654603, + "grad_norm": 0.27260392904281616, + "learning_rate": 6.866278015929039e-09, + "loss": 0.1662, + "step": 16584 + }, + { + "epoch": 4.413251729643427, + "grad_norm": 0.4800134599208832, + "learning_rate": 6.860129275261439e-09, + "loss": 0.1908, + "step": 16585 + }, + { + "epoch": 4.413517828632251, + "grad_norm": 0.39059504866600037, + "learning_rate": 6.8539831911322286e-09, + "loss": 0.2011, + "step": 16586 + }, + { + "epoch": 4.413783927621075, + "grad_norm": 0.3462659418582916, + "learning_rate": 6.84783976371669e-09, + "loss": 0.1712, + "step": 16587 + }, + { + "epoch": 4.414050026609899, + "grad_norm": 0.28168755769729614, + "learning_rate": 6.84169899319007e-09, + "loss": 0.1783, + "step": 16588 + }, + { + "epoch": 4.414316125598723, + "grad_norm": 0.34647902846336365, + "learning_rate": 6.835560879727486e-09, + "loss": 0.168, + "step": 16589 + }, + { + "epoch": 4.414582224587546, + "grad_norm": 0.2847658395767212, + "learning_rate": 6.8294254235040205e-09, + "loss": 0.1759, + "step": 16590 + }, + { + "epoch": 4.41484832357637, + "grad_norm": 0.27446451783180237, + "learning_rate": 6.823292624694676e-09, + "loss": 0.1686, + "step": 16591 + }, + { + "epoch": 4.415114422565194, + "grad_norm": 0.3097914457321167, + "learning_rate": 6.817162483474381e-09, + "loss": 0.182, + "step": 16592 + }, + { + "epoch": 4.4153805215540185, + "grad_norm": 0.2748108506202698, + "learning_rate": 6.8110350000179615e-09, + "loss": 0.1696, + "step": 16593 + }, + { + "epoch": 4.415646620542842, + "grad_norm": 0.29712024331092834, + "learning_rate": 6.804910174500211e-09, + "loss": 0.1708, + "step": 16594 + }, + { + "epoch": 4.415912719531666, + "grad_norm": 0.41117897629737854, + "learning_rate": 6.798788007095779e-09, + "loss": 0.1965, + "step": 16595 + }, + { + "epoch": 4.41617881852049, + "grad_norm": 0.24413500726222992, + "learning_rate": 6.792668497979304e-09, + "loss": 0.1573, + "step": 16596 + }, + { + "epoch": 4.416444917509313, + "grad_norm": 0.5656774640083313, + "learning_rate": 6.7865516473253446e-09, + "loss": 0.1685, + "step": 16597 + }, + { + "epoch": 4.416711016498137, + "grad_norm": 0.2766364812850952, + "learning_rate": 6.780437455308352e-09, + "loss": 0.181, + "step": 16598 + }, + { + "epoch": 4.4169771154869615, + "grad_norm": 0.279771089553833, + "learning_rate": 6.774325922102708e-09, + "loss": 0.1779, + "step": 16599 + }, + { + "epoch": 4.417243214475785, + "grad_norm": 0.262149840593338, + "learning_rate": 6.76821704788273e-09, + "loss": 0.1679, + "step": 16600 + }, + { + "epoch": 4.417509313464609, + "grad_norm": 0.34106171131134033, + "learning_rate": 6.762110832822665e-09, + "loss": 0.1822, + "step": 16601 + }, + { + "epoch": 4.417775412453433, + "grad_norm": 0.2652304172515869, + "learning_rate": 6.756007277096665e-09, + "loss": 0.1794, + "step": 16602 + }, + { + "epoch": 4.418041511442256, + "grad_norm": 0.26377853751182556, + "learning_rate": 6.749906380878823e-09, + "loss": 0.1664, + "step": 16603 + }, + { + "epoch": 4.41830761043108, + "grad_norm": 0.3059224486351013, + "learning_rate": 6.743808144343144e-09, + "loss": 0.1683, + "step": 16604 + }, + { + "epoch": 4.4185737094199045, + "grad_norm": 0.296077698469162, + "learning_rate": 6.737712567663545e-09, + "loss": 0.1713, + "step": 16605 + }, + { + "epoch": 4.418839808408728, + "grad_norm": 0.26403436064720154, + "learning_rate": 6.731619651013909e-09, + "loss": 0.1677, + "step": 16606 + }, + { + "epoch": 4.419105907397552, + "grad_norm": 0.365458220243454, + "learning_rate": 6.7255293945680305e-09, + "loss": 0.1794, + "step": 16607 + }, + { + "epoch": 4.419372006386376, + "grad_norm": 0.2988140881061554, + "learning_rate": 6.71944179849957e-09, + "loss": 0.1884, + "step": 16608 + }, + { + "epoch": 4.419638105375199, + "grad_norm": 0.2747856676578522, + "learning_rate": 6.713356862982212e-09, + "loss": 0.1718, + "step": 16609 + }, + { + "epoch": 4.419904204364023, + "grad_norm": 0.28042590618133545, + "learning_rate": 6.70727458818946e-09, + "loss": 0.1824, + "step": 16610 + }, + { + "epoch": 4.420170303352847, + "grad_norm": 0.28888314962387085, + "learning_rate": 6.7011949742948215e-09, + "loss": 0.1639, + "step": 16611 + }, + { + "epoch": 4.420436402341672, + "grad_norm": 0.30145007371902466, + "learning_rate": 6.695118021471691e-09, + "loss": 0.1739, + "step": 16612 + }, + { + "epoch": 4.420702501330495, + "grad_norm": 0.2739548981189728, + "learning_rate": 6.6890437298934175e-09, + "loss": 0.1702, + "step": 16613 + }, + { + "epoch": 4.420968600319319, + "grad_norm": 0.2861139476299286, + "learning_rate": 6.68297209973322e-09, + "loss": 0.179, + "step": 16614 + }, + { + "epoch": 4.421234699308142, + "grad_norm": 0.30571219325065613, + "learning_rate": 6.676903131164302e-09, + "loss": 0.1755, + "step": 16615 + }, + { + "epoch": 4.421500798296966, + "grad_norm": 0.28956323862075806, + "learning_rate": 6.670836824359727e-09, + "loss": 0.1845, + "step": 16616 + }, + { + "epoch": 4.42176689728579, + "grad_norm": 0.2814384400844574, + "learning_rate": 6.664773179492544e-09, + "loss": 0.1724, + "step": 16617 + }, + { + "epoch": 4.4220329962746145, + "grad_norm": 0.4819956421852112, + "learning_rate": 6.6587121967357165e-09, + "loss": 0.1994, + "step": 16618 + }, + { + "epoch": 4.422299095263438, + "grad_norm": 0.2768710255622864, + "learning_rate": 6.65265387626206e-09, + "loss": 0.17, + "step": 16619 + }, + { + "epoch": 4.422565194252262, + "grad_norm": 0.26464027166366577, + "learning_rate": 6.646598218244437e-09, + "loss": 0.1656, + "step": 16620 + }, + { + "epoch": 4.422831293241086, + "grad_norm": 0.2890174388885498, + "learning_rate": 6.64054522285552e-09, + "loss": 0.1665, + "step": 16621 + }, + { + "epoch": 4.423097392229909, + "grad_norm": 0.34943386912345886, + "learning_rate": 6.634494890267983e-09, + "loss": 0.174, + "step": 16622 + }, + { + "epoch": 4.423363491218733, + "grad_norm": 0.273799329996109, + "learning_rate": 6.628447220654365e-09, + "loss": 0.1758, + "step": 16623 + }, + { + "epoch": 4.4236295902075575, + "grad_norm": 0.2848457396030426, + "learning_rate": 6.622402214187184e-09, + "loss": 0.1794, + "step": 16624 + }, + { + "epoch": 4.423895689196381, + "grad_norm": 0.41481703519821167, + "learning_rate": 6.616359871038834e-09, + "loss": 0.1822, + "step": 16625 + }, + { + "epoch": 4.424161788185205, + "grad_norm": 0.2951644957065582, + "learning_rate": 6.6103201913816685e-09, + "loss": 0.1888, + "step": 16626 + }, + { + "epoch": 4.424427887174029, + "grad_norm": 0.32391995191574097, + "learning_rate": 6.604283175387937e-09, + "loss": 0.1821, + "step": 16627 + }, + { + "epoch": 4.424693986162852, + "grad_norm": 0.26556307077407837, + "learning_rate": 6.598248823229868e-09, + "loss": 0.1694, + "step": 16628 + }, + { + "epoch": 4.424960085151676, + "grad_norm": 0.2636234164237976, + "learning_rate": 6.592217135079514e-09, + "loss": 0.1598, + "step": 16629 + }, + { + "epoch": 4.4252261841405005, + "grad_norm": 0.29169127345085144, + "learning_rate": 6.58618811110897e-09, + "loss": 0.1829, + "step": 16630 + }, + { + "epoch": 4.425492283129324, + "grad_norm": 0.39787808060646057, + "learning_rate": 6.580161751490143e-09, + "loss": 0.1768, + "step": 16631 + }, + { + "epoch": 4.425758382118148, + "grad_norm": 0.4996621012687683, + "learning_rate": 6.57413805639494e-09, + "loss": 0.1542, + "step": 16632 + }, + { + "epoch": 4.426024481106972, + "grad_norm": 0.2595677077770233, + "learning_rate": 6.568117025995179e-09, + "loss": 0.1743, + "step": 16633 + }, + { + "epoch": 4.426290580095795, + "grad_norm": 0.30141767859458923, + "learning_rate": 6.562098660462589e-09, + "loss": 0.1761, + "step": 16634 + }, + { + "epoch": 4.426556679084619, + "grad_norm": 0.2668730616569519, + "learning_rate": 6.556082959968812e-09, + "loss": 0.1773, + "step": 16635 + }, + { + "epoch": 4.4268227780734435, + "grad_norm": 0.2725101411342621, + "learning_rate": 6.550069924685431e-09, + "loss": 0.1704, + "step": 16636 + }, + { + "epoch": 4.427088877062268, + "grad_norm": 0.3319339156150818, + "learning_rate": 6.544059554783965e-09, + "loss": 0.179, + "step": 16637 + }, + { + "epoch": 4.427354976051091, + "grad_norm": 0.28072264790534973, + "learning_rate": 6.538051850435833e-09, + "loss": 0.1775, + "step": 16638 + }, + { + "epoch": 4.427621075039915, + "grad_norm": 0.2742157280445099, + "learning_rate": 6.532046811812386e-09, + "loss": 0.1584, + "step": 16639 + }, + { + "epoch": 4.427887174028739, + "grad_norm": 0.2727229595184326, + "learning_rate": 6.526044439084888e-09, + "loss": 0.1696, + "step": 16640 + }, + { + "epoch": 4.428153273017562, + "grad_norm": 0.39120960235595703, + "learning_rate": 6.520044732424557e-09, + "loss": 0.1751, + "step": 16641 + }, + { + "epoch": 4.4284193720063865, + "grad_norm": 0.3715169131755829, + "learning_rate": 6.514047692002511e-09, + "loss": 0.1762, + "step": 16642 + }, + { + "epoch": 4.428685470995211, + "grad_norm": 0.25972893834114075, + "learning_rate": 6.5080533179898034e-09, + "loss": 0.1631, + "step": 16643 + }, + { + "epoch": 4.428951569984034, + "grad_norm": 0.27536022663116455, + "learning_rate": 6.502061610557397e-09, + "loss": 0.1691, + "step": 16644 + }, + { + "epoch": 4.429217668972858, + "grad_norm": 0.2755509316921234, + "learning_rate": 6.496072569876199e-09, + "loss": 0.1792, + "step": 16645 + }, + { + "epoch": 4.429483767961682, + "grad_norm": 0.3246176540851593, + "learning_rate": 6.490086196116995e-09, + "loss": 0.1994, + "step": 16646 + }, + { + "epoch": 4.429749866950505, + "grad_norm": 0.2798031270503998, + "learning_rate": 6.484102489450571e-09, + "loss": 0.1842, + "step": 16647 + }, + { + "epoch": 4.4300159659393294, + "grad_norm": 0.2890048623085022, + "learning_rate": 6.478121450047569e-09, + "loss": 0.1933, + "step": 16648 + }, + { + "epoch": 4.430282064928154, + "grad_norm": 0.2558535635471344, + "learning_rate": 6.472143078078607e-09, + "loss": 0.1784, + "step": 16649 + }, + { + "epoch": 4.430548163916977, + "grad_norm": 0.2882526218891144, + "learning_rate": 6.46616737371416e-09, + "loss": 0.1724, + "step": 16650 + }, + { + "epoch": 4.430814262905801, + "grad_norm": 0.3079146146774292, + "learning_rate": 6.460194337124713e-09, + "loss": 0.1926, + "step": 16651 + }, + { + "epoch": 4.431080361894625, + "grad_norm": 0.28819572925567627, + "learning_rate": 6.454223968480588e-09, + "loss": 0.18, + "step": 16652 + }, + { + "epoch": 4.431346460883448, + "grad_norm": 0.4820282757282257, + "learning_rate": 6.448256267952101e-09, + "loss": 0.1931, + "step": 16653 + }, + { + "epoch": 4.431612559872272, + "grad_norm": 0.39280253648757935, + "learning_rate": 6.442291235709441e-09, + "loss": 0.1933, + "step": 16654 + }, + { + "epoch": 4.4318786588610966, + "grad_norm": 0.4561369717121124, + "learning_rate": 6.436328871922747e-09, + "loss": 0.1812, + "step": 16655 + }, + { + "epoch": 4.43214475784992, + "grad_norm": 0.28663671016693115, + "learning_rate": 6.430369176762085e-09, + "loss": 0.1878, + "step": 16656 + }, + { + "epoch": 4.432410856838744, + "grad_norm": 0.3244505226612091, + "learning_rate": 6.4244121503974404e-09, + "loss": 0.1776, + "step": 16657 + }, + { + "epoch": 4.432676955827568, + "grad_norm": 0.4158242642879486, + "learning_rate": 6.418457792998733e-09, + "loss": 0.1582, + "step": 16658 + }, + { + "epoch": 4.432943054816391, + "grad_norm": 0.29117703437805176, + "learning_rate": 6.41250610473576e-09, + "loss": 0.1635, + "step": 16659 + }, + { + "epoch": 4.433209153805215, + "grad_norm": 0.3074641525745392, + "learning_rate": 6.406557085778308e-09, + "loss": 0.193, + "step": 16660 + }, + { + "epoch": 4.4334752527940395, + "grad_norm": 0.27314022183418274, + "learning_rate": 6.400610736296019e-09, + "loss": 0.1613, + "step": 16661 + }, + { + "epoch": 4.433741351782864, + "grad_norm": 0.31969717144966125, + "learning_rate": 6.394667056458514e-09, + "loss": 0.1798, + "step": 16662 + }, + { + "epoch": 4.434007450771687, + "grad_norm": 0.2983028292655945, + "learning_rate": 6.388726046435322e-09, + "loss": 0.1753, + "step": 16663 + }, + { + "epoch": 4.434273549760511, + "grad_norm": 0.25409314036369324, + "learning_rate": 6.382787706395909e-09, + "loss": 0.1655, + "step": 16664 + }, + { + "epoch": 4.434539648749335, + "grad_norm": 0.3965536952018738, + "learning_rate": 6.3768520365096055e-09, + "loss": 0.188, + "step": 16665 + }, + { + "epoch": 4.434805747738158, + "grad_norm": 0.27988380193710327, + "learning_rate": 6.370919036945754e-09, + "loss": 0.1673, + "step": 16666 + }, + { + "epoch": 4.4350718467269825, + "grad_norm": 0.4528764486312866, + "learning_rate": 6.364988707873542e-09, + "loss": 0.204, + "step": 16667 + }, + { + "epoch": 4.435337945715807, + "grad_norm": 0.304535448551178, + "learning_rate": 6.3590610494621445e-09, + "loss": 0.1608, + "step": 16668 + }, + { + "epoch": 4.43560404470463, + "grad_norm": 0.2870250344276428, + "learning_rate": 6.353136061880582e-09, + "loss": 0.1851, + "step": 16669 + }, + { + "epoch": 4.435870143693454, + "grad_norm": 0.2820807695388794, + "learning_rate": 6.347213745297908e-09, + "loss": 0.1618, + "step": 16670 + }, + { + "epoch": 4.436136242682278, + "grad_norm": 0.32195577025413513, + "learning_rate": 6.341294099882988e-09, + "loss": 0.1789, + "step": 16671 + }, + { + "epoch": 4.436402341671101, + "grad_norm": 0.33789440989494324, + "learning_rate": 6.335377125804686e-09, + "loss": 0.1617, + "step": 16672 + }, + { + "epoch": 4.4366684406599255, + "grad_norm": 0.30411189794540405, + "learning_rate": 6.3294628232317904e-09, + "loss": 0.1952, + "step": 16673 + }, + { + "epoch": 4.43693453964875, + "grad_norm": 0.2823595404624939, + "learning_rate": 6.3235511923329325e-09, + "loss": 0.1814, + "step": 16674 + }, + { + "epoch": 4.437200638637573, + "grad_norm": 0.25415629148483276, + "learning_rate": 6.317642233276776e-09, + "loss": 0.1641, + "step": 16675 + }, + { + "epoch": 4.437466737626397, + "grad_norm": 0.2727624177932739, + "learning_rate": 6.311735946231822e-09, + "loss": 0.1678, + "step": 16676 + }, + { + "epoch": 4.437732836615221, + "grad_norm": 0.3714253306388855, + "learning_rate": 6.305832331366545e-09, + "loss": 0.1798, + "step": 16677 + }, + { + "epoch": 4.437998935604044, + "grad_norm": 0.28408125042915344, + "learning_rate": 6.299931388849311e-09, + "loss": 0.1621, + "step": 16678 + }, + { + "epoch": 4.4382650345928685, + "grad_norm": 0.34104254841804504, + "learning_rate": 6.294033118848474e-09, + "loss": 0.1845, + "step": 16679 + }, + { + "epoch": 4.438531133581693, + "grad_norm": 0.31155145168304443, + "learning_rate": 6.288137521532211e-09, + "loss": 0.1786, + "step": 16680 + }, + { + "epoch": 4.438797232570516, + "grad_norm": 0.40141624212265015, + "learning_rate": 6.282244597068709e-09, + "loss": 0.1913, + "step": 16681 + }, + { + "epoch": 4.43906333155934, + "grad_norm": 0.2823193669319153, + "learning_rate": 6.276354345626023e-09, + "loss": 0.1648, + "step": 16682 + }, + { + "epoch": 4.439329430548164, + "grad_norm": 0.31117627024650574, + "learning_rate": 6.270466767372163e-09, + "loss": 0.1858, + "step": 16683 + }, + { + "epoch": 4.439595529536987, + "grad_norm": 0.6326230764389038, + "learning_rate": 6.26458186247506e-09, + "loss": 0.1779, + "step": 16684 + }, + { + "epoch": 4.4398616285258115, + "grad_norm": 0.44851309061050415, + "learning_rate": 6.258699631102571e-09, + "loss": 0.1834, + "step": 16685 + }, + { + "epoch": 4.440127727514636, + "grad_norm": 0.3247748613357544, + "learning_rate": 6.25282007342246e-09, + "loss": 0.1708, + "step": 16686 + }, + { + "epoch": 4.44039382650346, + "grad_norm": 0.31531476974487305, + "learning_rate": 6.246943189602427e-09, + "loss": 0.1775, + "step": 16687 + }, + { + "epoch": 4.440659925492283, + "grad_norm": 0.27759647369384766, + "learning_rate": 6.241068979810071e-09, + "loss": 0.1647, + "step": 16688 + }, + { + "epoch": 4.440926024481107, + "grad_norm": 0.2699977159500122, + "learning_rate": 6.235197444212981e-09, + "loss": 0.1593, + "step": 16689 + }, + { + "epoch": 4.441192123469931, + "grad_norm": 0.35384178161621094, + "learning_rate": 6.229328582978577e-09, + "loss": 0.184, + "step": 16690 + }, + { + "epoch": 4.441458222458754, + "grad_norm": 0.27347657084465027, + "learning_rate": 6.223462396274282e-09, + "loss": 0.1703, + "step": 16691 + }, + { + "epoch": 4.441724321447579, + "grad_norm": 0.294286847114563, + "learning_rate": 6.217598884267395e-09, + "loss": 0.1857, + "step": 16692 + }, + { + "epoch": 4.441990420436403, + "grad_norm": 0.3009665608406067, + "learning_rate": 6.211738047125159e-09, + "loss": 0.1796, + "step": 16693 + }, + { + "epoch": 4.442256519425226, + "grad_norm": 0.3337925672531128, + "learning_rate": 6.205879885014765e-09, + "loss": 0.168, + "step": 16694 + }, + { + "epoch": 4.44252261841405, + "grad_norm": 0.28371134400367737, + "learning_rate": 6.200024398103254e-09, + "loss": 0.1703, + "step": 16695 + }, + { + "epoch": 4.442788717402874, + "grad_norm": 0.291938841342926, + "learning_rate": 6.194171586557672e-09, + "loss": 0.1919, + "step": 16696 + }, + { + "epoch": 4.443054816391697, + "grad_norm": 0.27689728140830994, + "learning_rate": 6.188321450544931e-09, + "loss": 0.1748, + "step": 16697 + }, + { + "epoch": 4.4433209153805215, + "grad_norm": 0.354726642370224, + "learning_rate": 6.182473990231895e-09, + "loss": 0.2021, + "step": 16698 + }, + { + "epoch": 4.443587014369346, + "grad_norm": 0.477155864238739, + "learning_rate": 6.176629205785344e-09, + "loss": 0.1836, + "step": 16699 + }, + { + "epoch": 4.443853113358169, + "grad_norm": 0.2973034977912903, + "learning_rate": 6.170787097371999e-09, + "loss": 0.1755, + "step": 16700 + }, + { + "epoch": 4.444119212346993, + "grad_norm": 0.27947479486465454, + "learning_rate": 6.164947665158459e-09, + "loss": 0.1737, + "step": 16701 + }, + { + "epoch": 4.444385311335817, + "grad_norm": 0.2764875888824463, + "learning_rate": 6.1591109093113154e-09, + "loss": 0.1739, + "step": 16702 + }, + { + "epoch": 4.44465141032464, + "grad_norm": 0.28126975893974304, + "learning_rate": 6.153276829997012e-09, + "loss": 0.1798, + "step": 16703 + }, + { + "epoch": 4.4449175093134645, + "grad_norm": 0.2900312840938568, + "learning_rate": 6.147445427381959e-09, + "loss": 0.1696, + "step": 16704 + }, + { + "epoch": 4.445183608302289, + "grad_norm": 0.25857797265052795, + "learning_rate": 6.14161670163248e-09, + "loss": 0.1776, + "step": 16705 + }, + { + "epoch": 4.445449707291112, + "grad_norm": 0.2931055724620819, + "learning_rate": 6.135790652914819e-09, + "loss": 0.1674, + "step": 16706 + }, + { + "epoch": 4.445715806279936, + "grad_norm": 0.27216237783432007, + "learning_rate": 6.1299672813951435e-09, + "loss": 0.1694, + "step": 16707 + }, + { + "epoch": 4.44598190526876, + "grad_norm": 0.2786043584346771, + "learning_rate": 6.124146587239554e-09, + "loss": 0.1706, + "step": 16708 + }, + { + "epoch": 4.446248004257583, + "grad_norm": 0.3694521188735962, + "learning_rate": 6.118328570614084e-09, + "loss": 0.1936, + "step": 16709 + }, + { + "epoch": 4.4465141032464075, + "grad_norm": 0.361937016248703, + "learning_rate": 6.1125132316846465e-09, + "loss": 0.178, + "step": 16710 + }, + { + "epoch": 4.446780202235232, + "grad_norm": 0.30118808150291443, + "learning_rate": 6.10670057061714e-09, + "loss": 0.1747, + "step": 16711 + }, + { + "epoch": 4.447046301224056, + "grad_norm": 0.2962747812271118, + "learning_rate": 6.100890587577312e-09, + "loss": 0.1793, + "step": 16712 + }, + { + "epoch": 4.447312400212879, + "grad_norm": 0.36501094698905945, + "learning_rate": 6.095083282730906e-09, + "loss": 0.192, + "step": 16713 + }, + { + "epoch": 4.447578499201703, + "grad_norm": 0.3031248152256012, + "learning_rate": 6.089278656243535e-09, + "loss": 0.1853, + "step": 16714 + }, + { + "epoch": 4.447844598190527, + "grad_norm": 0.30711811780929565, + "learning_rate": 6.083476708280799e-09, + "loss": 0.1883, + "step": 16715 + }, + { + "epoch": 4.4481106971793505, + "grad_norm": 0.26004624366760254, + "learning_rate": 6.077677439008133e-09, + "loss": 0.1506, + "step": 16716 + }, + { + "epoch": 4.448376796168175, + "grad_norm": 0.34645575284957886, + "learning_rate": 6.071880848590982e-09, + "loss": 0.185, + "step": 16717 + }, + { + "epoch": 4.448642895156999, + "grad_norm": 0.33252429962158203, + "learning_rate": 6.066086937194648e-09, + "loss": 0.1745, + "step": 16718 + }, + { + "epoch": 4.448908994145822, + "grad_norm": 0.3368319272994995, + "learning_rate": 6.0602957049844085e-09, + "loss": 0.1661, + "step": 16719 + }, + { + "epoch": 4.449175093134646, + "grad_norm": 0.4211878180503845, + "learning_rate": 6.054507152125399e-09, + "loss": 0.1931, + "step": 16720 + }, + { + "epoch": 4.44944119212347, + "grad_norm": 0.34867098927497864, + "learning_rate": 6.0487212787827865e-09, + "loss": 0.1766, + "step": 16721 + }, + { + "epoch": 4.4497072911122935, + "grad_norm": 0.26057520508766174, + "learning_rate": 6.042938085121552e-09, + "loss": 0.1776, + "step": 16722 + }, + { + "epoch": 4.449973390101118, + "grad_norm": 0.2585197389125824, + "learning_rate": 6.037157571306662e-09, + "loss": 0.1592, + "step": 16723 + }, + { + "epoch": 4.450239489089942, + "grad_norm": 0.43489304184913635, + "learning_rate": 6.031379737502962e-09, + "loss": 0.1693, + "step": 16724 + }, + { + "epoch": 4.450505588078765, + "grad_norm": 0.31588014960289, + "learning_rate": 6.025604583875276e-09, + "loss": 0.1964, + "step": 16725 + }, + { + "epoch": 4.450771687067589, + "grad_norm": 0.3008822202682495, + "learning_rate": 6.019832110588308e-09, + "loss": 0.175, + "step": 16726 + }, + { + "epoch": 4.451037786056413, + "grad_norm": 0.27475759387016296, + "learning_rate": 6.014062317806701e-09, + "loss": 0.1811, + "step": 16727 + }, + { + "epoch": 4.451303885045236, + "grad_norm": 0.3192194700241089, + "learning_rate": 6.008295205695024e-09, + "loss": 0.1651, + "step": 16728 + }, + { + "epoch": 4.451569984034061, + "grad_norm": 0.4677888751029968, + "learning_rate": 6.002530774417769e-09, + "loss": 0.1642, + "step": 16729 + }, + { + "epoch": 4.451836083022885, + "grad_norm": 0.2821654677391052, + "learning_rate": 5.9967690241393585e-09, + "loss": 0.1694, + "step": 16730 + }, + { + "epoch": 4.452102182011709, + "grad_norm": 0.2648022472858429, + "learning_rate": 5.991009955024107e-09, + "loss": 0.1676, + "step": 16731 + }, + { + "epoch": 4.452368281000532, + "grad_norm": 0.2929806709289551, + "learning_rate": 5.985253567236303e-09, + "loss": 0.1825, + "step": 16732 + }, + { + "epoch": 4.452634379989356, + "grad_norm": 0.33593758940696716, + "learning_rate": 5.979499860940107e-09, + "loss": 0.1701, + "step": 16733 + }, + { + "epoch": 4.452900478978179, + "grad_norm": 0.3642506003379822, + "learning_rate": 5.9737488362996305e-09, + "loss": 0.1697, + "step": 16734 + }, + { + "epoch": 4.4531665779670035, + "grad_norm": 0.2887832820415497, + "learning_rate": 5.968000493478897e-09, + "loss": 0.1767, + "step": 16735 + }, + { + "epoch": 4.453432676955828, + "grad_norm": 0.26270589232444763, + "learning_rate": 5.9622548326418995e-09, + "loss": 0.1654, + "step": 16736 + }, + { + "epoch": 4.453698775944652, + "grad_norm": 1.002366542816162, + "learning_rate": 5.956511853952462e-09, + "loss": 0.1918, + "step": 16737 + }, + { + "epoch": 4.453964874933475, + "grad_norm": 0.34525999426841736, + "learning_rate": 5.95077155757443e-09, + "loss": 0.166, + "step": 16738 + }, + { + "epoch": 4.454230973922299, + "grad_norm": 0.32529300451278687, + "learning_rate": 5.945033943671496e-09, + "loss": 0.1698, + "step": 16739 + }, + { + "epoch": 4.454497072911123, + "grad_norm": 0.26434656977653503, + "learning_rate": 5.939299012407339e-09, + "loss": 0.1639, + "step": 16740 + }, + { + "epoch": 4.4547631718999465, + "grad_norm": 2.353527784347534, + "learning_rate": 5.933566763945508e-09, + "loss": 0.1664, + "step": 16741 + }, + { + "epoch": 4.455029270888771, + "grad_norm": 0.45391562581062317, + "learning_rate": 5.9278371984494926e-09, + "loss": 0.1798, + "step": 16742 + }, + { + "epoch": 4.455295369877595, + "grad_norm": 0.27807652950286865, + "learning_rate": 5.922110316082729e-09, + "loss": 0.1873, + "step": 16743 + }, + { + "epoch": 4.455561468866418, + "grad_norm": 0.2748173475265503, + "learning_rate": 5.916386117008565e-09, + "loss": 0.1786, + "step": 16744 + }, + { + "epoch": 4.455827567855242, + "grad_norm": 0.2575368881225586, + "learning_rate": 5.910664601390247e-09, + "loss": 0.1583, + "step": 16745 + }, + { + "epoch": 4.456093666844066, + "grad_norm": 0.3682350218296051, + "learning_rate": 5.904945769390968e-09, + "loss": 0.169, + "step": 16746 + }, + { + "epoch": 4.4563597658328895, + "grad_norm": 0.3726021945476532, + "learning_rate": 5.899229621173873e-09, + "loss": 0.172, + "step": 16747 + }, + { + "epoch": 4.456625864821714, + "grad_norm": 0.33969607949256897, + "learning_rate": 5.8935161569019456e-09, + "loss": 0.1812, + "step": 16748 + }, + { + "epoch": 4.456891963810538, + "grad_norm": 0.29367777705192566, + "learning_rate": 5.8878053767381645e-09, + "loss": 0.1704, + "step": 16749 + }, + { + "epoch": 4.457158062799361, + "grad_norm": 0.332949697971344, + "learning_rate": 5.8820972808454324e-09, + "loss": 0.1768, + "step": 16750 + }, + { + "epoch": 4.457424161788185, + "grad_norm": 0.3587685227394104, + "learning_rate": 5.876391869386554e-09, + "loss": 0.1713, + "step": 16751 + }, + { + "epoch": 4.457690260777009, + "grad_norm": 0.27360039949417114, + "learning_rate": 5.870689142524232e-09, + "loss": 0.1639, + "step": 16752 + }, + { + "epoch": 4.4579563597658325, + "grad_norm": 0.31078431010246277, + "learning_rate": 5.864989100421147e-09, + "loss": 0.1713, + "step": 16753 + }, + { + "epoch": 4.458222458754657, + "grad_norm": 0.26730474829673767, + "learning_rate": 5.859291743239847e-09, + "loss": 0.1784, + "step": 16754 + }, + { + "epoch": 4.458488557743481, + "grad_norm": 0.2642102539539337, + "learning_rate": 5.853597071142879e-09, + "loss": 0.1816, + "step": 16755 + }, + { + "epoch": 4.458754656732305, + "grad_norm": 0.27975180745124817, + "learning_rate": 5.847905084292604e-09, + "loss": 0.1713, + "step": 16756 + }, + { + "epoch": 4.459020755721128, + "grad_norm": 0.3237477242946625, + "learning_rate": 5.8422157828514114e-09, + "loss": 0.171, + "step": 16757 + }, + { + "epoch": 4.459286854709952, + "grad_norm": 0.26542606949806213, + "learning_rate": 5.836529166981563e-09, + "loss": 0.1683, + "step": 16758 + }, + { + "epoch": 4.459552953698776, + "grad_norm": 0.27650922536849976, + "learning_rate": 5.830845236845272e-09, + "loss": 0.1765, + "step": 16759 + }, + { + "epoch": 4.4598190526876, + "grad_norm": 0.27291011810302734, + "learning_rate": 5.825163992604609e-09, + "loss": 0.1674, + "step": 16760 + }, + { + "epoch": 4.460085151676424, + "grad_norm": 0.34606340527534485, + "learning_rate": 5.819485434421667e-09, + "loss": 0.1795, + "step": 16761 + }, + { + "epoch": 4.460351250665248, + "grad_norm": 0.3581279516220093, + "learning_rate": 5.8138095624583604e-09, + "loss": 0.1735, + "step": 16762 + }, + { + "epoch": 4.460617349654071, + "grad_norm": 0.28756171464920044, + "learning_rate": 5.808136376876605e-09, + "loss": 0.1865, + "step": 16763 + }, + { + "epoch": 4.460883448642895, + "grad_norm": 0.253498375415802, + "learning_rate": 5.802465877838214e-09, + "loss": 0.1816, + "step": 16764 + }, + { + "epoch": 4.461149547631719, + "grad_norm": 0.2876102328300476, + "learning_rate": 5.796798065504904e-09, + "loss": 0.1747, + "step": 16765 + }, + { + "epoch": 4.461415646620543, + "grad_norm": 0.3103514015674591, + "learning_rate": 5.791132940038357e-09, + "loss": 0.1755, + "step": 16766 + }, + { + "epoch": 4.461681745609367, + "grad_norm": 0.2735275328159332, + "learning_rate": 5.785470501600132e-09, + "loss": 0.1729, + "step": 16767 + }, + { + "epoch": 4.461947844598191, + "grad_norm": 0.3602042496204376, + "learning_rate": 5.779810750351755e-09, + "loss": 0.1837, + "step": 16768 + }, + { + "epoch": 4.462213943587014, + "grad_norm": 0.292431116104126, + "learning_rate": 5.77415368645463e-09, + "loss": 0.1806, + "step": 16769 + }, + { + "epoch": 4.462480042575838, + "grad_norm": 0.26115792989730835, + "learning_rate": 5.7684993100701186e-09, + "loss": 0.1676, + "step": 16770 + }, + { + "epoch": 4.462746141564662, + "grad_norm": 0.26622194051742554, + "learning_rate": 5.762847621359479e-09, + "loss": 0.1609, + "step": 16771 + }, + { + "epoch": 4.4630122405534856, + "grad_norm": 0.40122076869010925, + "learning_rate": 5.7571986204839494e-09, + "loss": 0.1654, + "step": 16772 + }, + { + "epoch": 4.46327833954231, + "grad_norm": 0.3499275743961334, + "learning_rate": 5.7515523076046125e-09, + "loss": 0.1625, + "step": 16773 + }, + { + "epoch": 4.463544438531134, + "grad_norm": 0.3658735156059265, + "learning_rate": 5.7459086828825275e-09, + "loss": 0.1905, + "step": 16774 + }, + { + "epoch": 4.463810537519957, + "grad_norm": 0.31644999980926514, + "learning_rate": 5.740267746478666e-09, + "loss": 0.1837, + "step": 16775 + }, + { + "epoch": 4.464076636508781, + "grad_norm": 0.3264918923377991, + "learning_rate": 5.734629498553911e-09, + "loss": 0.1745, + "step": 16776 + }, + { + "epoch": 4.464342735497605, + "grad_norm": 0.27581456303596497, + "learning_rate": 5.728993939269078e-09, + "loss": 0.1739, + "step": 16777 + }, + { + "epoch": 4.4646088344864285, + "grad_norm": 0.30940788984298706, + "learning_rate": 5.723361068784893e-09, + "loss": 0.2016, + "step": 16778 + }, + { + "epoch": 4.464874933475253, + "grad_norm": 0.3380996882915497, + "learning_rate": 5.71773088726204e-09, + "loss": 0.1861, + "step": 16779 + }, + { + "epoch": 4.465141032464077, + "grad_norm": 0.32729393243789673, + "learning_rate": 5.712103394861101e-09, + "loss": 0.1907, + "step": 16780 + }, + { + "epoch": 4.465407131452901, + "grad_norm": 0.2967623174190521, + "learning_rate": 5.706478591742558e-09, + "loss": 0.1782, + "step": 16781 + }, + { + "epoch": 4.465673230441724, + "grad_norm": 0.27026355266571045, + "learning_rate": 5.700856478066873e-09, + "loss": 0.1813, + "step": 16782 + }, + { + "epoch": 4.465939329430548, + "grad_norm": 0.366441011428833, + "learning_rate": 5.695237053994384e-09, + "loss": 0.1867, + "step": 16783 + }, + { + "epoch": 4.466205428419372, + "grad_norm": 0.35851380228996277, + "learning_rate": 5.689620319685373e-09, + "loss": 0.1792, + "step": 16784 + }, + { + "epoch": 4.466471527408196, + "grad_norm": 0.28025349974632263, + "learning_rate": 5.684006275300024e-09, + "loss": 0.1523, + "step": 16785 + }, + { + "epoch": 4.46673762639702, + "grad_norm": 0.2889268398284912, + "learning_rate": 5.678394920998486e-09, + "loss": 0.1679, + "step": 16786 + }, + { + "epoch": 4.467003725385844, + "grad_norm": 0.39696255326271057, + "learning_rate": 5.672786256940798e-09, + "loss": 0.1787, + "step": 16787 + }, + { + "epoch": 4.467269824374667, + "grad_norm": 0.27939194440841675, + "learning_rate": 5.667180283286921e-09, + "loss": 0.1706, + "step": 16788 + }, + { + "epoch": 4.467535923363491, + "grad_norm": 0.2610236406326294, + "learning_rate": 5.661577000196771e-09, + "loss": 0.1695, + "step": 16789 + }, + { + "epoch": 4.467802022352315, + "grad_norm": 0.2797725200653076, + "learning_rate": 5.655976407830132e-09, + "loss": 0.1853, + "step": 16790 + }, + { + "epoch": 4.468068121341139, + "grad_norm": 0.5586885809898376, + "learning_rate": 5.650378506346776e-09, + "loss": 0.1764, + "step": 16791 + }, + { + "epoch": 4.468334220329963, + "grad_norm": 0.3054581582546234, + "learning_rate": 5.644783295906341e-09, + "loss": 0.1976, + "step": 16792 + }, + { + "epoch": 4.468600319318787, + "grad_norm": 0.3057156503200531, + "learning_rate": 5.639190776668423e-09, + "loss": 0.1773, + "step": 16793 + }, + { + "epoch": 4.46886641830761, + "grad_norm": 0.26881086826324463, + "learning_rate": 5.633600948792538e-09, + "loss": 0.1735, + "step": 16794 + }, + { + "epoch": 4.469132517296434, + "grad_norm": 0.30225279927253723, + "learning_rate": 5.628013812438137e-09, + "loss": 0.1839, + "step": 16795 + }, + { + "epoch": 4.469398616285258, + "grad_norm": 0.2888995409011841, + "learning_rate": 5.622429367764525e-09, + "loss": 0.1728, + "step": 16796 + }, + { + "epoch": 4.469664715274082, + "grad_norm": 0.2732817828655243, + "learning_rate": 5.6168476149310304e-09, + "loss": 0.1606, + "step": 16797 + }, + { + "epoch": 4.469930814262906, + "grad_norm": 0.27226313948631287, + "learning_rate": 5.611268554096826e-09, + "loss": 0.1744, + "step": 16798 + }, + { + "epoch": 4.47019691325173, + "grad_norm": 0.3247871994972229, + "learning_rate": 5.605692185421051e-09, + "loss": 0.187, + "step": 16799 + }, + { + "epoch": 4.470463012240553, + "grad_norm": 0.3890274465084076, + "learning_rate": 5.6001185090627565e-09, + "loss": 0.193, + "step": 16800 + }, + { + "epoch": 4.470729111229377, + "grad_norm": 0.287759393453598, + "learning_rate": 5.5945475251809035e-09, + "loss": 0.1754, + "step": 16801 + }, + { + "epoch": 4.470995210218201, + "grad_norm": 0.2874178886413574, + "learning_rate": 5.588979233934421e-09, + "loss": 0.1653, + "step": 16802 + }, + { + "epoch": 4.471261309207025, + "grad_norm": 0.37088242173194885, + "learning_rate": 5.583413635482082e-09, + "loss": 0.1794, + "step": 16803 + }, + { + "epoch": 4.471527408195849, + "grad_norm": 0.35096874833106995, + "learning_rate": 5.5778507299826584e-09, + "loss": 0.1838, + "step": 16804 + }, + { + "epoch": 4.471793507184673, + "grad_norm": 0.26419997215270996, + "learning_rate": 5.572290517594802e-09, + "loss": 0.1665, + "step": 16805 + }, + { + "epoch": 4.472059606173497, + "grad_norm": 0.3438515365123749, + "learning_rate": 5.5667329984771195e-09, + "loss": 0.1794, + "step": 16806 + }, + { + "epoch": 4.47232570516232, + "grad_norm": 0.3458789885044098, + "learning_rate": 5.561178172788084e-09, + "loss": 0.1744, + "step": 16807 + }, + { + "epoch": 4.472591804151144, + "grad_norm": 0.26046067476272583, + "learning_rate": 5.555626040686167e-09, + "loss": 0.1599, + "step": 16808 + }, + { + "epoch": 4.4728579031399684, + "grad_norm": 0.2883478105068207, + "learning_rate": 5.550076602329712e-09, + "loss": 0.176, + "step": 16809 + }, + { + "epoch": 4.473124002128792, + "grad_norm": 0.27058178186416626, + "learning_rate": 5.544529857877023e-09, + "loss": 0.1631, + "step": 16810 + }, + { + "epoch": 4.473390101117616, + "grad_norm": 0.4229472577571869, + "learning_rate": 5.538985807486263e-09, + "loss": 0.1948, + "step": 16811 + }, + { + "epoch": 4.47365620010644, + "grad_norm": 0.29564616084098816, + "learning_rate": 5.533444451315605e-09, + "loss": 0.1704, + "step": 16812 + }, + { + "epoch": 4.473922299095263, + "grad_norm": 0.2558997571468353, + "learning_rate": 5.5279057895230574e-09, + "loss": 0.1474, + "step": 16813 + }, + { + "epoch": 4.474188398084087, + "grad_norm": 0.29024985432624817, + "learning_rate": 5.5223698222666145e-09, + "loss": 0.1816, + "step": 16814 + }, + { + "epoch": 4.474454497072911, + "grad_norm": 0.32730618119239807, + "learning_rate": 5.516836549704163e-09, + "loss": 0.172, + "step": 16815 + }, + { + "epoch": 4.474720596061735, + "grad_norm": 0.2629063129425049, + "learning_rate": 5.511305971993552e-09, + "loss": 0.1759, + "step": 16816 + }, + { + "epoch": 4.474986695050559, + "grad_norm": 0.32878994941711426, + "learning_rate": 5.505778089292501e-09, + "loss": 0.1833, + "step": 16817 + }, + { + "epoch": 4.475252794039383, + "grad_norm": 0.2773420810699463, + "learning_rate": 5.500252901758673e-09, + "loss": 0.1762, + "step": 16818 + }, + { + "epoch": 4.475518893028206, + "grad_norm": 0.2776477634906769, + "learning_rate": 5.494730409549675e-09, + "loss": 0.1726, + "step": 16819 + }, + { + "epoch": 4.47578499201703, + "grad_norm": 0.27349337935447693, + "learning_rate": 5.4892106128229924e-09, + "loss": 0.1688, + "step": 16820 + }, + { + "epoch": 4.476051091005854, + "grad_norm": 0.30898311734199524, + "learning_rate": 5.4836935117360985e-09, + "loss": 0.1639, + "step": 16821 + }, + { + "epoch": 4.476317189994678, + "grad_norm": 0.2944006323814392, + "learning_rate": 5.4781791064463016e-09, + "loss": 0.1728, + "step": 16822 + }, + { + "epoch": 4.476583288983502, + "grad_norm": 0.3516082465648651, + "learning_rate": 5.472667397110953e-09, + "loss": 0.1712, + "step": 16823 + }, + { + "epoch": 4.476849387972326, + "grad_norm": 0.37025657296180725, + "learning_rate": 5.467158383887194e-09, + "loss": 0.1772, + "step": 16824 + }, + { + "epoch": 4.477115486961149, + "grad_norm": 0.2629339396953583, + "learning_rate": 5.461652066932187e-09, + "loss": 0.162, + "step": 16825 + }, + { + "epoch": 4.477381585949973, + "grad_norm": 0.2737305760383606, + "learning_rate": 5.456148446402975e-09, + "loss": 0.1697, + "step": 16826 + }, + { + "epoch": 4.477647684938797, + "grad_norm": 0.3527158200740814, + "learning_rate": 5.4506475224565305e-09, + "loss": 0.1797, + "step": 16827 + }, + { + "epoch": 4.477913783927621, + "grad_norm": 0.2860719561576843, + "learning_rate": 5.445149295249751e-09, + "loss": 0.1571, + "step": 16828 + }, + { + "epoch": 4.478179882916445, + "grad_norm": 0.3527641296386719, + "learning_rate": 5.439653764939456e-09, + "loss": 0.1744, + "step": 16829 + }, + { + "epoch": 4.478445981905269, + "grad_norm": 0.35104337334632874, + "learning_rate": 5.434160931682386e-09, + "loss": 0.1738, + "step": 16830 + }, + { + "epoch": 4.478712080894093, + "grad_norm": 0.3736984431743622, + "learning_rate": 5.428670795635226e-09, + "loss": 0.1822, + "step": 16831 + }, + { + "epoch": 4.478978179882916, + "grad_norm": 0.2824196219444275, + "learning_rate": 5.423183356954541e-09, + "loss": 0.168, + "step": 16832 + }, + { + "epoch": 4.47924427887174, + "grad_norm": 0.2794841527938843, + "learning_rate": 5.417698615796873e-09, + "loss": 0.1786, + "step": 16833 + }, + { + "epoch": 4.4795103778605645, + "grad_norm": 0.3701518476009369, + "learning_rate": 5.412216572318629e-09, + "loss": 0.1862, + "step": 16834 + }, + { + "epoch": 4.479776476849388, + "grad_norm": 0.3462349474430084, + "learning_rate": 5.406737226676183e-09, + "loss": 0.1856, + "step": 16835 + }, + { + "epoch": 4.480042575838212, + "grad_norm": 0.2684818506240845, + "learning_rate": 5.40126057902579e-09, + "loss": 0.168, + "step": 16836 + }, + { + "epoch": 4.480308674827036, + "grad_norm": 0.2764264941215515, + "learning_rate": 5.395786629523702e-09, + "loss": 0.1734, + "step": 16837 + }, + { + "epoch": 4.480574773815859, + "grad_norm": 0.3743070065975189, + "learning_rate": 5.390315378326038e-09, + "loss": 0.1703, + "step": 16838 + }, + { + "epoch": 4.480840872804683, + "grad_norm": 0.3723808526992798, + "learning_rate": 5.3848468255888065e-09, + "loss": 0.1726, + "step": 16839 + }, + { + "epoch": 4.4811069717935075, + "grad_norm": 0.2593678832054138, + "learning_rate": 5.37938097146804e-09, + "loss": 0.164, + "step": 16840 + }, + { + "epoch": 4.481373070782331, + "grad_norm": 0.35133248567581177, + "learning_rate": 5.373917816119589e-09, + "loss": 0.1763, + "step": 16841 + }, + { + "epoch": 4.481639169771155, + "grad_norm": 0.2939228415489197, + "learning_rate": 5.368457359699296e-09, + "loss": 0.1801, + "step": 16842 + }, + { + "epoch": 4.481905268759979, + "grad_norm": 0.29604971408843994, + "learning_rate": 5.362999602362894e-09, + "loss": 0.1802, + "step": 16843 + }, + { + "epoch": 4.482171367748802, + "grad_norm": 0.28458255529403687, + "learning_rate": 5.357544544266057e-09, + "loss": 0.1834, + "step": 16844 + }, + { + "epoch": 4.482437466737626, + "grad_norm": 0.28533318638801575, + "learning_rate": 5.352092185564361e-09, + "loss": 0.1834, + "step": 16845 + }, + { + "epoch": 4.4827035657264505, + "grad_norm": 0.2863151431083679, + "learning_rate": 5.3466425264133585e-09, + "loss": 0.1683, + "step": 16846 + }, + { + "epoch": 4.482969664715274, + "grad_norm": 0.30131635069847107, + "learning_rate": 5.341195566968437e-09, + "loss": 0.1723, + "step": 16847 + }, + { + "epoch": 4.483235763704098, + "grad_norm": 0.2799026370048523, + "learning_rate": 5.335751307384983e-09, + "loss": 0.1793, + "step": 16848 + }, + { + "epoch": 4.483501862692922, + "grad_norm": 0.2706337869167328, + "learning_rate": 5.3303097478182515e-09, + "loss": 0.1602, + "step": 16849 + }, + { + "epoch": 4.483767961681746, + "grad_norm": 0.2867948114871979, + "learning_rate": 5.3248708884234716e-09, + "loss": 0.1807, + "step": 16850 + }, + { + "epoch": 4.484034060670569, + "grad_norm": 0.5022926926612854, + "learning_rate": 5.319434729355765e-09, + "loss": 0.1925, + "step": 16851 + }, + { + "epoch": 4.484300159659393, + "grad_norm": 0.36917755007743835, + "learning_rate": 5.314001270770185e-09, + "loss": 0.167, + "step": 16852 + }, + { + "epoch": 4.484566258648217, + "grad_norm": 0.3689896762371063, + "learning_rate": 5.3085705128216975e-09, + "loss": 0.2046, + "step": 16853 + }, + { + "epoch": 4.484832357637041, + "grad_norm": 0.3017697036266327, + "learning_rate": 5.303142455665211e-09, + "loss": 0.1777, + "step": 16854 + }, + { + "epoch": 4.485098456625865, + "grad_norm": 0.4064064919948578, + "learning_rate": 5.297717099455512e-09, + "loss": 0.1905, + "step": 16855 + }, + { + "epoch": 4.485364555614689, + "grad_norm": 0.31497684121131897, + "learning_rate": 5.2922944443473785e-09, + "loss": 0.1663, + "step": 16856 + }, + { + "epoch": 4.485630654603512, + "grad_norm": 0.25605309009552, + "learning_rate": 5.286874490495463e-09, + "loss": 0.1575, + "step": 16857 + }, + { + "epoch": 4.485896753592336, + "grad_norm": 0.2637310028076172, + "learning_rate": 5.281457238054354e-09, + "loss": 0.1692, + "step": 16858 + }, + { + "epoch": 4.4861628525811605, + "grad_norm": 0.30550044775009155, + "learning_rate": 5.27604268717855e-09, + "loss": 0.1933, + "step": 16859 + }, + { + "epoch": 4.486428951569984, + "grad_norm": 0.25201231241226196, + "learning_rate": 5.270630838022505e-09, + "loss": 0.1674, + "step": 16860 + }, + { + "epoch": 4.486695050558808, + "grad_norm": 0.3455379605293274, + "learning_rate": 5.265221690740573e-09, + "loss": 0.172, + "step": 16861 + }, + { + "epoch": 4.486961149547632, + "grad_norm": 0.370309054851532, + "learning_rate": 5.25981524548702e-09, + "loss": 0.1848, + "step": 16862 + }, + { + "epoch": 4.487227248536455, + "grad_norm": 0.3316618502140045, + "learning_rate": 5.254411502416079e-09, + "loss": 0.1785, + "step": 16863 + }, + { + "epoch": 4.487493347525279, + "grad_norm": 0.3769398033618927, + "learning_rate": 5.249010461681824e-09, + "loss": 0.1959, + "step": 16864 + }, + { + "epoch": 4.4877594465141035, + "grad_norm": 0.29410648345947266, + "learning_rate": 5.2436121234383456e-09, + "loss": 0.1795, + "step": 16865 + }, + { + "epoch": 4.488025545502927, + "grad_norm": 0.3161224126815796, + "learning_rate": 5.238216487839597e-09, + "loss": 0.1768, + "step": 16866 + }, + { + "epoch": 4.488291644491751, + "grad_norm": 0.39939600229263306, + "learning_rate": 5.232823555039501e-09, + "loss": 0.1862, + "step": 16867 + }, + { + "epoch": 4.488557743480575, + "grad_norm": 0.2725835144519806, + "learning_rate": 5.227433325191832e-09, + "loss": 0.1647, + "step": 16868 + }, + { + "epoch": 4.488823842469398, + "grad_norm": 0.28792881965637207, + "learning_rate": 5.22204579845037e-09, + "loss": 0.1814, + "step": 16869 + }, + { + "epoch": 4.489089941458222, + "grad_norm": 0.2677007019519806, + "learning_rate": 5.216660974968745e-09, + "loss": 0.1595, + "step": 16870 + }, + { + "epoch": 4.4893560404470465, + "grad_norm": 0.36846327781677246, + "learning_rate": 5.211278854900569e-09, + "loss": 0.1812, + "step": 16871 + }, + { + "epoch": 4.48962213943587, + "grad_norm": 0.32333341240882874, + "learning_rate": 5.20589943839933e-09, + "loss": 0.1723, + "step": 16872 + }, + { + "epoch": 4.489888238424694, + "grad_norm": 0.2782641053199768, + "learning_rate": 5.200522725618462e-09, + "loss": 0.1752, + "step": 16873 + }, + { + "epoch": 4.490154337413518, + "grad_norm": 0.3285810649394989, + "learning_rate": 5.1951487167113395e-09, + "loss": 0.197, + "step": 16874 + }, + { + "epoch": 4.490420436402342, + "grad_norm": 0.2967728078365326, + "learning_rate": 5.189777411831231e-09, + "loss": 0.1685, + "step": 16875 + }, + { + "epoch": 4.490686535391165, + "grad_norm": 0.4230883717536926, + "learning_rate": 5.1844088111313355e-09, + "loss": 0.1766, + "step": 16876 + }, + { + "epoch": 4.4909526343799895, + "grad_norm": 0.27927979826927185, + "learning_rate": 5.179042914764764e-09, + "loss": 0.1812, + "step": 16877 + }, + { + "epoch": 4.491218733368814, + "grad_norm": 0.4288439154624939, + "learning_rate": 5.173679722884594e-09, + "loss": 0.1869, + "step": 16878 + }, + { + "epoch": 4.491484832357637, + "grad_norm": 0.27074217796325684, + "learning_rate": 5.168319235643759e-09, + "loss": 0.1729, + "step": 16879 + }, + { + "epoch": 4.491750931346461, + "grad_norm": 0.29466512799263, + "learning_rate": 5.162961453195159e-09, + "loss": 0.1663, + "step": 16880 + }, + { + "epoch": 4.492017030335285, + "grad_norm": 0.4107985496520996, + "learning_rate": 5.157606375691625e-09, + "loss": 0.2081, + "step": 16881 + }, + { + "epoch": 4.492283129324108, + "grad_norm": 0.298016756772995, + "learning_rate": 5.152254003285894e-09, + "loss": 0.1621, + "step": 16882 + }, + { + "epoch": 4.4925492283129325, + "grad_norm": 0.3795374929904938, + "learning_rate": 5.146904336130609e-09, + "loss": 0.2013, + "step": 16883 + }, + { + "epoch": 4.492815327301757, + "grad_norm": 0.32415181398391724, + "learning_rate": 5.141557374378369e-09, + "loss": 0.1688, + "step": 16884 + }, + { + "epoch": 4.49308142629058, + "grad_norm": 0.2591184377670288, + "learning_rate": 5.136213118181676e-09, + "loss": 0.1798, + "step": 16885 + }, + { + "epoch": 4.493347525279404, + "grad_norm": 0.4313336908817291, + "learning_rate": 5.13087156769294e-09, + "loss": 0.18, + "step": 16886 + }, + { + "epoch": 4.493613624268228, + "grad_norm": 0.26975882053375244, + "learning_rate": 5.125532723064541e-09, + "loss": 0.1822, + "step": 16887 + }, + { + "epoch": 4.493879723257051, + "grad_norm": 0.35403192043304443, + "learning_rate": 5.120196584448755e-09, + "loss": 0.171, + "step": 16888 + }, + { + "epoch": 4.494145822245875, + "grad_norm": 0.279270201921463, + "learning_rate": 5.11486315199775e-09, + "loss": 0.1883, + "step": 16889 + }, + { + "epoch": 4.4944119212347, + "grad_norm": 0.2657024562358856, + "learning_rate": 5.109532425863683e-09, + "loss": 0.174, + "step": 16890 + }, + { + "epoch": 4.494678020223523, + "grad_norm": 0.41266000270843506, + "learning_rate": 5.1042044061985536e-09, + "loss": 0.2117, + "step": 16891 + }, + { + "epoch": 4.494944119212347, + "grad_norm": 0.2528335154056549, + "learning_rate": 5.098879093154363e-09, + "loss": 0.1557, + "step": 16892 + }, + { + "epoch": 4.495210218201171, + "grad_norm": 0.26392990350723267, + "learning_rate": 5.093556486882999e-09, + "loss": 0.1527, + "step": 16893 + }, + { + "epoch": 4.495476317189994, + "grad_norm": 0.347289502620697, + "learning_rate": 5.088236587536254e-09, + "loss": 0.1799, + "step": 16894 + }, + { + "epoch": 4.495742416178818, + "grad_norm": 0.26780733466148376, + "learning_rate": 5.082919395265861e-09, + "loss": 0.1744, + "step": 16895 + }, + { + "epoch": 4.4960085151676425, + "grad_norm": 0.34476161003112793, + "learning_rate": 5.077604910223499e-09, + "loss": 0.1676, + "step": 16896 + }, + { + "epoch": 4.496274614156466, + "grad_norm": 0.3673900067806244, + "learning_rate": 5.072293132560745e-09, + "loss": 0.1822, + "step": 16897 + }, + { + "epoch": 4.49654071314529, + "grad_norm": 0.2824658155441284, + "learning_rate": 5.066984062429081e-09, + "loss": 0.1761, + "step": 16898 + }, + { + "epoch": 4.496806812134114, + "grad_norm": 0.2702940106391907, + "learning_rate": 5.061677699979949e-09, + "loss": 0.1785, + "step": 16899 + }, + { + "epoch": 4.497072911122938, + "grad_norm": 0.27372825145721436, + "learning_rate": 5.056374045364687e-09, + "loss": 0.1747, + "step": 16900 + }, + { + "epoch": 4.497339010111761, + "grad_norm": 0.2861344516277313, + "learning_rate": 5.051073098734571e-09, + "loss": 0.1796, + "step": 16901 + }, + { + "epoch": 4.4976051091005855, + "grad_norm": 0.2677094638347626, + "learning_rate": 5.045774860240803e-09, + "loss": 0.168, + "step": 16902 + }, + { + "epoch": 4.49787120808941, + "grad_norm": 0.2834632694721222, + "learning_rate": 5.040479330034497e-09, + "loss": 0.1712, + "step": 16903 + }, + { + "epoch": 4.498137307078233, + "grad_norm": 0.7838910222053528, + "learning_rate": 5.035186508266676e-09, + "loss": 0.1923, + "step": 16904 + }, + { + "epoch": 4.498403406067057, + "grad_norm": 0.25788480043411255, + "learning_rate": 5.029896395088329e-09, + "loss": 0.1589, + "step": 16905 + }, + { + "epoch": 4.498669505055881, + "grad_norm": 0.270211398601532, + "learning_rate": 5.024608990650303e-09, + "loss": 0.16, + "step": 16906 + }, + { + "epoch": 4.498935604044704, + "grad_norm": 0.3000725209712982, + "learning_rate": 5.019324295103456e-09, + "loss": 0.1787, + "step": 16907 + }, + { + "epoch": 4.4992017030335285, + "grad_norm": 0.46266576647758484, + "learning_rate": 5.014042308598465e-09, + "loss": 0.185, + "step": 16908 + }, + { + "epoch": 4.499467802022353, + "grad_norm": 0.33373120427131653, + "learning_rate": 5.0087630312860115e-09, + "loss": 0.1565, + "step": 16909 + }, + { + "epoch": 4.499733901011176, + "grad_norm": 0.6813269853591919, + "learning_rate": 5.003486463316664e-09, + "loss": 0.1516, + "step": 16910 + }, + { + "epoch": 4.5, + "grad_norm": 0.4972303807735443, + "learning_rate": 4.998212604840934e-09, + "loss": 0.1938, + "step": 16911 + }, + { + "epoch": 4.500266098988824, + "grad_norm": 0.40014734864234924, + "learning_rate": 4.992941456009236e-09, + "loss": 0.1953, + "step": 16912 + }, + { + "epoch": 4.500532197977647, + "grad_norm": 0.2568492889404297, + "learning_rate": 4.987673016971905e-09, + "loss": 0.1608, + "step": 16913 + }, + { + "epoch": 4.5007982969664715, + "grad_norm": 0.3287116289138794, + "learning_rate": 4.982407287879231e-09, + "loss": 0.1649, + "step": 16914 + }, + { + "epoch": 4.501064395955296, + "grad_norm": 0.26559415459632874, + "learning_rate": 4.977144268881361e-09, + "loss": 0.1546, + "step": 16915 + }, + { + "epoch": 4.501330494944119, + "grad_norm": 0.2747972011566162, + "learning_rate": 4.971883960128442e-09, + "loss": 0.182, + "step": 16916 + }, + { + "epoch": 4.501596593932943, + "grad_norm": 0.3728230595588684, + "learning_rate": 4.966626361770499e-09, + "loss": 0.1947, + "step": 16917 + }, + { + "epoch": 4.501862692921767, + "grad_norm": 0.3621189296245575, + "learning_rate": 4.961371473957499e-09, + "loss": 0.1712, + "step": 16918 + }, + { + "epoch": 4.502128791910591, + "grad_norm": 0.25207749009132385, + "learning_rate": 4.9561192968393005e-09, + "loss": 0.1656, + "step": 16919 + }, + { + "epoch": 4.5023948908994145, + "grad_norm": 0.27321669459342957, + "learning_rate": 4.95086983056573e-09, + "loss": 0.1637, + "step": 16920 + }, + { + "epoch": 4.502660989888239, + "grad_norm": 0.28515511751174927, + "learning_rate": 4.945623075286487e-09, + "loss": 0.1754, + "step": 16921 + }, + { + "epoch": 4.502927088877062, + "grad_norm": 0.37893593311309814, + "learning_rate": 4.940379031151243e-09, + "loss": 0.1725, + "step": 16922 + }, + { + "epoch": 4.503193187865886, + "grad_norm": 0.3371862769126892, + "learning_rate": 4.935137698309544e-09, + "loss": 0.1682, + "step": 16923 + }, + { + "epoch": 4.50345928685471, + "grad_norm": 0.333122581243515, + "learning_rate": 4.929899076910915e-09, + "loss": 0.1607, + "step": 16924 + }, + { + "epoch": 4.503725385843534, + "grad_norm": 0.27770256996154785, + "learning_rate": 4.924663167104748e-09, + "loss": 0.1583, + "step": 16925 + }, + { + "epoch": 4.5039914848323575, + "grad_norm": 0.3674510717391968, + "learning_rate": 4.9194299690404005e-09, + "loss": 0.179, + "step": 16926 + }, + { + "epoch": 4.504257583821182, + "grad_norm": 0.33014118671417236, + "learning_rate": 4.914199482867109e-09, + "loss": 0.1785, + "step": 16927 + }, + { + "epoch": 4.504523682810006, + "grad_norm": 0.3220450282096863, + "learning_rate": 4.908971708734077e-09, + "loss": 0.1696, + "step": 16928 + }, + { + "epoch": 4.504789781798829, + "grad_norm": 0.34192872047424316, + "learning_rate": 4.903746646790418e-09, + "loss": 0.2026, + "step": 16929 + }, + { + "epoch": 4.505055880787653, + "grad_norm": 0.3552684485912323, + "learning_rate": 4.898524297185136e-09, + "loss": 0.1742, + "step": 16930 + }, + { + "epoch": 4.505321979776477, + "grad_norm": 0.29073891043663025, + "learning_rate": 4.893304660067188e-09, + "loss": 0.1752, + "step": 16931 + }, + { + "epoch": 4.5055880787653, + "grad_norm": 0.30117928981781006, + "learning_rate": 4.888087735585467e-09, + "loss": 0.1791, + "step": 16932 + }, + { + "epoch": 4.505854177754125, + "grad_norm": 0.2868354916572571, + "learning_rate": 4.882873523888764e-09, + "loss": 0.1848, + "step": 16933 + }, + { + "epoch": 4.506120276742949, + "grad_norm": 0.3597204089164734, + "learning_rate": 4.877662025125795e-09, + "loss": 0.1922, + "step": 16934 + }, + { + "epoch": 4.506386375731772, + "grad_norm": 0.27312126755714417, + "learning_rate": 4.872453239445207e-09, + "loss": 0.1782, + "step": 16935 + }, + { + "epoch": 4.506652474720596, + "grad_norm": 0.2745169997215271, + "learning_rate": 4.867247166995558e-09, + "loss": 0.1667, + "step": 16936 + }, + { + "epoch": 4.50691857370942, + "grad_norm": 0.3658733665943146, + "learning_rate": 4.8620438079253315e-09, + "loss": 0.1914, + "step": 16937 + }, + { + "epoch": 4.507184672698243, + "grad_norm": 0.275818407535553, + "learning_rate": 4.85684316238294e-09, + "loss": 0.1887, + "step": 16938 + }, + { + "epoch": 4.5074507716870675, + "grad_norm": 0.3182969391345978, + "learning_rate": 4.851645230516743e-09, + "loss": 0.1669, + "step": 16939 + }, + { + "epoch": 4.507716870675892, + "grad_norm": 0.3995599150657654, + "learning_rate": 4.846450012474956e-09, + "loss": 0.201, + "step": 16940 + }, + { + "epoch": 4.507982969664715, + "grad_norm": 0.28493407368659973, + "learning_rate": 4.841257508405794e-09, + "loss": 0.1801, + "step": 16941 + }, + { + "epoch": 4.508249068653539, + "grad_norm": 0.29472050070762634, + "learning_rate": 4.836067718457326e-09, + "loss": 0.1673, + "step": 16942 + }, + { + "epoch": 4.508515167642363, + "grad_norm": 0.2707662284374237, + "learning_rate": 4.830880642777602e-09, + "loss": 0.1734, + "step": 16943 + }, + { + "epoch": 4.508781266631187, + "grad_norm": 0.360860139131546, + "learning_rate": 4.825696281514535e-09, + "loss": 0.1762, + "step": 16944 + }, + { + "epoch": 4.5090473656200105, + "grad_norm": 0.2641350328922272, + "learning_rate": 4.820514634816019e-09, + "loss": 0.1709, + "step": 16945 + }, + { + "epoch": 4.509313464608835, + "grad_norm": 0.3380310535430908, + "learning_rate": 4.815335702829837e-09, + "loss": 0.1751, + "step": 16946 + }, + { + "epoch": 4.509579563597658, + "grad_norm": 0.2700670659542084, + "learning_rate": 4.8101594857037135e-09, + "loss": 0.1584, + "step": 16947 + }, + { + "epoch": 4.509845662586482, + "grad_norm": 0.35546186566352844, + "learning_rate": 4.804985983585286e-09, + "loss": 0.1758, + "step": 16948 + }, + { + "epoch": 4.510111761575306, + "grad_norm": 0.2863697111606598, + "learning_rate": 4.799815196622081e-09, + "loss": 0.157, + "step": 16949 + }, + { + "epoch": 4.51037786056413, + "grad_norm": 0.3670385479927063, + "learning_rate": 4.794647124961626e-09, + "loss": 0.1776, + "step": 16950 + }, + { + "epoch": 4.5106439595529535, + "grad_norm": 0.3186129629611969, + "learning_rate": 4.78948176875128e-09, + "loss": 0.1782, + "step": 16951 + }, + { + "epoch": 4.510910058541778, + "grad_norm": 0.32440873980522156, + "learning_rate": 4.784319128138403e-09, + "loss": 0.1661, + "step": 16952 + }, + { + "epoch": 4.511176157530602, + "grad_norm": 0.35003310441970825, + "learning_rate": 4.779159203270222e-09, + "loss": 0.173, + "step": 16953 + }, + { + "epoch": 4.511442256519425, + "grad_norm": 0.35968244075775146, + "learning_rate": 4.774001994293941e-09, + "loss": 0.1773, + "step": 16954 + }, + { + "epoch": 4.511708355508249, + "grad_norm": 0.344539076089859, + "learning_rate": 4.76884750135661e-09, + "loss": 0.186, + "step": 16955 + }, + { + "epoch": 4.511974454497073, + "grad_norm": 0.28397637605667114, + "learning_rate": 4.763695724605277e-09, + "loss": 0.1754, + "step": 16956 + }, + { + "epoch": 4.5122405534858965, + "grad_norm": 1.5421737432479858, + "learning_rate": 4.758546664186869e-09, + "loss": 0.1667, + "step": 16957 + }, + { + "epoch": 4.512506652474721, + "grad_norm": 0.36913594603538513, + "learning_rate": 4.753400320248258e-09, + "loss": 0.1823, + "step": 16958 + }, + { + "epoch": 4.512772751463545, + "grad_norm": 0.2793814241886139, + "learning_rate": 4.748256692936203e-09, + "loss": 0.1738, + "step": 16959 + }, + { + "epoch": 4.513038850452368, + "grad_norm": 0.26675909757614136, + "learning_rate": 4.743115782397433e-09, + "loss": 0.1614, + "step": 16960 + }, + { + "epoch": 4.513304949441192, + "grad_norm": 0.32733026146888733, + "learning_rate": 4.737977588778575e-09, + "loss": 0.1825, + "step": 16961 + }, + { + "epoch": 4.513571048430016, + "grad_norm": 0.26274847984313965, + "learning_rate": 4.732842112226187e-09, + "loss": 0.1625, + "step": 16962 + }, + { + "epoch": 4.5138371474188395, + "grad_norm": 0.35082611441612244, + "learning_rate": 4.7277093528867216e-09, + "loss": 0.1734, + "step": 16963 + }, + { + "epoch": 4.514103246407664, + "grad_norm": 0.5957666635513306, + "learning_rate": 4.722579310906593e-09, + "loss": 0.1831, + "step": 16964 + }, + { + "epoch": 4.514369345396488, + "grad_norm": 0.30257633328437805, + "learning_rate": 4.717451986432119e-09, + "loss": 0.196, + "step": 16965 + }, + { + "epoch": 4.514635444385311, + "grad_norm": 0.3178081214427948, + "learning_rate": 4.712327379609527e-09, + "loss": 0.1832, + "step": 16966 + }, + { + "epoch": 4.514901543374135, + "grad_norm": 0.312629371881485, + "learning_rate": 4.707205490584998e-09, + "loss": 0.1718, + "step": 16967 + }, + { + "epoch": 4.515167642362959, + "grad_norm": 0.2673994302749634, + "learning_rate": 4.702086319504606e-09, + "loss": 0.1639, + "step": 16968 + }, + { + "epoch": 4.515433741351783, + "grad_norm": 0.30980950593948364, + "learning_rate": 4.696969866514389e-09, + "loss": 0.1676, + "step": 16969 + }, + { + "epoch": 4.515699840340607, + "grad_norm": 0.3103446364402771, + "learning_rate": 4.69185613176023e-09, + "loss": 0.1743, + "step": 16970 + }, + { + "epoch": 4.515965939329431, + "grad_norm": 0.2926679253578186, + "learning_rate": 4.686745115388036e-09, + "loss": 0.1631, + "step": 16971 + }, + { + "epoch": 4.516232038318254, + "grad_norm": 0.29228299856185913, + "learning_rate": 4.681636817543533e-09, + "loss": 0.1945, + "step": 16972 + }, + { + "epoch": 4.516498137307078, + "grad_norm": 0.3829462230205536, + "learning_rate": 4.676531238372461e-09, + "loss": 0.1876, + "step": 16973 + }, + { + "epoch": 4.516764236295902, + "grad_norm": 0.27361926436424255, + "learning_rate": 4.6714283780204034e-09, + "loss": 0.1682, + "step": 16974 + }, + { + "epoch": 4.517030335284726, + "grad_norm": 0.2827662229537964, + "learning_rate": 4.666328236632944e-09, + "loss": 0.1855, + "step": 16975 + }, + { + "epoch": 4.5172964342735495, + "grad_norm": 0.24827535450458527, + "learning_rate": 4.661230814355521e-09, + "loss": 0.1537, + "step": 16976 + }, + { + "epoch": 4.517562533262374, + "grad_norm": 0.3828171193599701, + "learning_rate": 4.6561361113335416e-09, + "loss": 0.174, + "step": 16977 + }, + { + "epoch": 4.517828632251198, + "grad_norm": 0.3138120174407959, + "learning_rate": 4.651044127712301e-09, + "loss": 0.1793, + "step": 16978 + }, + { + "epoch": 4.518094731240021, + "grad_norm": 0.3540616035461426, + "learning_rate": 4.645954863637058e-09, + "loss": 0.1682, + "step": 16979 + }, + { + "epoch": 4.518360830228845, + "grad_norm": 0.2856830954551697, + "learning_rate": 4.640868319252922e-09, + "loss": 0.1752, + "step": 16980 + }, + { + "epoch": 4.518626929217669, + "grad_norm": 0.28330573439598083, + "learning_rate": 4.635784494705019e-09, + "loss": 0.1688, + "step": 16981 + }, + { + "epoch": 4.5188930282064925, + "grad_norm": 0.417545348405838, + "learning_rate": 4.630703390138324e-09, + "loss": 0.181, + "step": 16982 + }, + { + "epoch": 4.519159127195317, + "grad_norm": 0.3962477743625641, + "learning_rate": 4.625625005697764e-09, + "loss": 0.1709, + "step": 16983 + }, + { + "epoch": 4.519425226184141, + "grad_norm": 0.2749653160572052, + "learning_rate": 4.620549341528201e-09, + "loss": 0.1621, + "step": 16984 + }, + { + "epoch": 4.519691325172964, + "grad_norm": 0.517146646976471, + "learning_rate": 4.615476397774387e-09, + "loss": 0.1732, + "step": 16985 + }, + { + "epoch": 4.519957424161788, + "grad_norm": 0.29827460646629333, + "learning_rate": 4.610406174581016e-09, + "loss": 0.1947, + "step": 16986 + }, + { + "epoch": 4.520223523150612, + "grad_norm": 0.3716919422149658, + "learning_rate": 4.605338672092706e-09, + "loss": 0.194, + "step": 16987 + }, + { + "epoch": 4.5204896221394355, + "grad_norm": 0.3524969220161438, + "learning_rate": 4.600273890453976e-09, + "loss": 0.1807, + "step": 16988 + }, + { + "epoch": 4.52075572112826, + "grad_norm": 0.28764608502388, + "learning_rate": 4.595211829809298e-09, + "loss": 0.1821, + "step": 16989 + }, + { + "epoch": 4.521021820117084, + "grad_norm": 0.33392176032066345, + "learning_rate": 4.5901524903030674e-09, + "loss": 0.1708, + "step": 16990 + }, + { + "epoch": 4.521287919105907, + "grad_norm": 0.39046767354011536, + "learning_rate": 4.585095872079558e-09, + "loss": 0.2047, + "step": 16991 + }, + { + "epoch": 4.521554018094731, + "grad_norm": 0.34352898597717285, + "learning_rate": 4.580041975283011e-09, + "loss": 0.1904, + "step": 16992 + }, + { + "epoch": 4.521820117083555, + "grad_norm": 0.3422164022922516, + "learning_rate": 4.574990800057565e-09, + "loss": 0.1949, + "step": 16993 + }, + { + "epoch": 4.522086216072379, + "grad_norm": 0.2880539298057556, + "learning_rate": 4.569942346547306e-09, + "loss": 0.1592, + "step": 16994 + }, + { + "epoch": 4.522352315061203, + "grad_norm": 0.26940855383872986, + "learning_rate": 4.564896614896197e-09, + "loss": 0.1912, + "step": 16995 + }, + { + "epoch": 4.522618414050027, + "grad_norm": 0.352567583322525, + "learning_rate": 4.559853605248165e-09, + "loss": 0.1724, + "step": 16996 + }, + { + "epoch": 4.52288451303885, + "grad_norm": 0.28791049122810364, + "learning_rate": 4.554813317747063e-09, + "loss": 0.1755, + "step": 16997 + }, + { + "epoch": 4.523150612027674, + "grad_norm": 0.3101642429828644, + "learning_rate": 4.549775752536655e-09, + "loss": 0.1685, + "step": 16998 + }, + { + "epoch": 4.523416711016498, + "grad_norm": 0.28880375623703003, + "learning_rate": 4.54474090976058e-09, + "loss": 0.1766, + "step": 16999 + }, + { + "epoch": 4.523682810005322, + "grad_norm": 0.2695980370044708, + "learning_rate": 4.5397087895624905e-09, + "loss": 0.1554, + "step": 17000 + }, + { + "epoch": 4.523948908994146, + "grad_norm": 0.2755892872810364, + "learning_rate": 4.534679392085872e-09, + "loss": 0.1756, + "step": 17001 + }, + { + "epoch": 4.52421500798297, + "grad_norm": 0.2730548083782196, + "learning_rate": 4.529652717474197e-09, + "loss": 0.1734, + "step": 17002 + }, + { + "epoch": 4.524481106971794, + "grad_norm": 0.4172699451446533, + "learning_rate": 4.524628765870831e-09, + "loss": 0.1742, + "step": 17003 + }, + { + "epoch": 4.524747205960617, + "grad_norm": 0.32486996054649353, + "learning_rate": 4.5196075374190675e-09, + "loss": 0.1672, + "step": 17004 + }, + { + "epoch": 4.525013304949441, + "grad_norm": 0.33142587542533875, + "learning_rate": 4.514589032262139e-09, + "loss": 0.1755, + "step": 17005 + }, + { + "epoch": 4.525279403938265, + "grad_norm": 0.3821611702442169, + "learning_rate": 4.509573250543152e-09, + "loss": 0.1929, + "step": 17006 + }, + { + "epoch": 4.525545502927089, + "grad_norm": 0.3865682780742645, + "learning_rate": 4.504560192405193e-09, + "loss": 0.1619, + "step": 17007 + }, + { + "epoch": 4.525811601915913, + "grad_norm": 0.269186794757843, + "learning_rate": 4.499549857991214e-09, + "loss": 0.1717, + "step": 17008 + }, + { + "epoch": 4.526077700904737, + "grad_norm": 0.4036407172679901, + "learning_rate": 4.494542247444166e-09, + "loss": 0.1847, + "step": 17009 + }, + { + "epoch": 4.52634379989356, + "grad_norm": 0.3136768937110901, + "learning_rate": 4.489537360906825e-09, + "loss": 0.1835, + "step": 17010 + }, + { + "epoch": 4.526609898882384, + "grad_norm": 0.29944828152656555, + "learning_rate": 4.4845351985219656e-09, + "loss": 0.1806, + "step": 17011 + }, + { + "epoch": 4.526875997871208, + "grad_norm": 0.27903276681900024, + "learning_rate": 4.479535760432274e-09, + "loss": 0.1521, + "step": 17012 + }, + { + "epoch": 4.5271420968600316, + "grad_norm": 0.2672329545021057, + "learning_rate": 4.474539046780323e-09, + "loss": 0.1747, + "step": 17013 + }, + { + "epoch": 4.527408195848856, + "grad_norm": 0.3866409361362457, + "learning_rate": 4.4695450577086345e-09, + "loss": 0.1859, + "step": 17014 + }, + { + "epoch": 4.52767429483768, + "grad_norm": 0.24325640499591827, + "learning_rate": 4.46455379335966e-09, + "loss": 0.1482, + "step": 17015 + }, + { + "epoch": 4.527940393826503, + "grad_norm": 0.36398738622665405, + "learning_rate": 4.45956525387573e-09, + "loss": 0.2, + "step": 17016 + }, + { + "epoch": 4.528206492815327, + "grad_norm": 0.3352597653865814, + "learning_rate": 4.454579439399142e-09, + "loss": 0.179, + "step": 17017 + }, + { + "epoch": 4.528472591804151, + "grad_norm": 0.29744675755500793, + "learning_rate": 4.449596350072116e-09, + "loss": 0.1694, + "step": 17018 + }, + { + "epoch": 4.528738690792975, + "grad_norm": 0.35221534967422485, + "learning_rate": 4.4446159860367705e-09, + "loss": 0.1935, + "step": 17019 + }, + { + "epoch": 4.529004789781799, + "grad_norm": 0.2665887475013733, + "learning_rate": 4.4396383474351595e-09, + "loss": 0.1626, + "step": 17020 + }, + { + "epoch": 4.529270888770623, + "grad_norm": 0.31513670086860657, + "learning_rate": 4.434663434409247e-09, + "loss": 0.1658, + "step": 17021 + }, + { + "epoch": 4.529536987759446, + "grad_norm": 0.3882831037044525, + "learning_rate": 4.429691247100942e-09, + "loss": 0.1897, + "step": 17022 + }, + { + "epoch": 4.52980308674827, + "grad_norm": 0.3495238721370697, + "learning_rate": 4.42472178565203e-09, + "loss": 0.1928, + "step": 17023 + }, + { + "epoch": 4.530069185737094, + "grad_norm": 0.28202614188194275, + "learning_rate": 4.419755050204288e-09, + "loss": 0.1809, + "step": 17024 + }, + { + "epoch": 4.530335284725918, + "grad_norm": 0.3523894250392914, + "learning_rate": 4.414791040899335e-09, + "loss": 0.1733, + "step": 17025 + }, + { + "epoch": 4.530601383714742, + "grad_norm": 0.2625278830528259, + "learning_rate": 4.409829757878813e-09, + "loss": 0.1613, + "step": 17026 + }, + { + "epoch": 4.530867482703566, + "grad_norm": 0.3350466191768646, + "learning_rate": 4.404871201284177e-09, + "loss": 0.1721, + "step": 17027 + }, + { + "epoch": 4.53113358169239, + "grad_norm": 0.4740290343761444, + "learning_rate": 4.399915371256879e-09, + "loss": 0.1999, + "step": 17028 + }, + { + "epoch": 4.531399680681213, + "grad_norm": 0.30102306604385376, + "learning_rate": 4.394962267938262e-09, + "loss": 0.176, + "step": 17029 + }, + { + "epoch": 4.531665779670037, + "grad_norm": 0.29908880591392517, + "learning_rate": 4.390011891469603e-09, + "loss": 0.1786, + "step": 17030 + }, + { + "epoch": 4.531931878658861, + "grad_norm": 0.28681182861328125, + "learning_rate": 4.385064241992087e-09, + "loss": 0.1832, + "step": 17031 + }, + { + "epoch": 4.532197977647685, + "grad_norm": 0.27909672260284424, + "learning_rate": 4.380119319646836e-09, + "loss": 0.1593, + "step": 17032 + }, + { + "epoch": 4.532464076636509, + "grad_norm": 0.3072565793991089, + "learning_rate": 4.375177124574891e-09, + "loss": 0.1867, + "step": 17033 + }, + { + "epoch": 4.532730175625333, + "grad_norm": 0.2819925546646118, + "learning_rate": 4.370237656917219e-09, + "loss": 0.163, + "step": 17034 + }, + { + "epoch": 4.532996274614156, + "grad_norm": 0.3455409109592438, + "learning_rate": 4.3653009168146956e-09, + "loss": 0.1767, + "step": 17035 + }, + { + "epoch": 4.53326237360298, + "grad_norm": 0.2696104347705841, + "learning_rate": 4.360366904408142e-09, + "loss": 0.1783, + "step": 17036 + }, + { + "epoch": 4.533528472591804, + "grad_norm": 0.33290010690689087, + "learning_rate": 4.355435619838255e-09, + "loss": 0.171, + "step": 17037 + }, + { + "epoch": 4.5337945715806285, + "grad_norm": 0.3119158446788788, + "learning_rate": 4.350507063245701e-09, + "loss": 0.1803, + "step": 17038 + }, + { + "epoch": 4.534060670569452, + "grad_norm": 0.323578417301178, + "learning_rate": 4.345581234771057e-09, + "loss": 0.1606, + "step": 17039 + }, + { + "epoch": 4.534326769558276, + "grad_norm": 0.27514100074768066, + "learning_rate": 4.34065813455482e-09, + "loss": 0.1765, + "step": 17040 + }, + { + "epoch": 4.534592868547099, + "grad_norm": 0.3130313456058502, + "learning_rate": 4.3357377627374125e-09, + "loss": 0.1848, + "step": 17041 + }, + { + "epoch": 4.534858967535923, + "grad_norm": 0.4709198474884033, + "learning_rate": 4.330820119459144e-09, + "loss": 0.2135, + "step": 17042 + }, + { + "epoch": 4.535125066524747, + "grad_norm": 0.3055090606212616, + "learning_rate": 4.325905204860314e-09, + "loss": 0.1817, + "step": 17043 + }, + { + "epoch": 4.5353911655135715, + "grad_norm": 0.31180140376091003, + "learning_rate": 4.320993019081076e-09, + "loss": 0.189, + "step": 17044 + }, + { + "epoch": 4.535657264502395, + "grad_norm": 0.2706958055496216, + "learning_rate": 4.316083562261552e-09, + "loss": 0.1656, + "step": 17045 + }, + { + "epoch": 4.535923363491219, + "grad_norm": 0.29077231884002686, + "learning_rate": 4.311176834541752e-09, + "loss": 0.1696, + "step": 17046 + }, + { + "epoch": 4.536189462480043, + "grad_norm": 0.4305492639541626, + "learning_rate": 4.306272836061642e-09, + "loss": 0.1869, + "step": 17047 + }, + { + "epoch": 4.536455561468866, + "grad_norm": 0.2770722806453705, + "learning_rate": 4.301371566961087e-09, + "loss": 0.1752, + "step": 17048 + }, + { + "epoch": 4.53672166045769, + "grad_norm": 0.44912537932395935, + "learning_rate": 4.296473027379899e-09, + "loss": 0.1964, + "step": 17049 + }, + { + "epoch": 4.5369877594465144, + "grad_norm": 0.2946256101131439, + "learning_rate": 4.291577217457765e-09, + "loss": 0.1708, + "step": 17050 + }, + { + "epoch": 4.537253858435338, + "grad_norm": 0.28272441029548645, + "learning_rate": 4.286684137334351e-09, + "loss": 0.1957, + "step": 17051 + }, + { + "epoch": 4.537519957424162, + "grad_norm": 0.30176350474357605, + "learning_rate": 4.28179378714919e-09, + "loss": 0.166, + "step": 17052 + }, + { + "epoch": 4.537786056412986, + "grad_norm": 0.26779869198799133, + "learning_rate": 4.2769061670417825e-09, + "loss": 0.1642, + "step": 17053 + }, + { + "epoch": 4.538052155401809, + "grad_norm": 0.2620191276073456, + "learning_rate": 4.2720212771515165e-09, + "loss": 0.1583, + "step": 17054 + }, + { + "epoch": 4.538318254390633, + "grad_norm": 0.29532590508461, + "learning_rate": 4.2671391176177575e-09, + "loss": 0.185, + "step": 17055 + }, + { + "epoch": 4.538584353379457, + "grad_norm": 0.4884564280509949, + "learning_rate": 4.262259688579717e-09, + "loss": 0.1834, + "step": 17056 + }, + { + "epoch": 4.538850452368281, + "grad_norm": 0.2694171965122223, + "learning_rate": 4.257382990176572e-09, + "loss": 0.1803, + "step": 17057 + }, + { + "epoch": 4.539116551357105, + "grad_norm": 0.27138370275497437, + "learning_rate": 4.252509022547446e-09, + "loss": 0.1426, + "step": 17058 + }, + { + "epoch": 4.539382650345929, + "grad_norm": 0.3215865194797516, + "learning_rate": 4.247637785831304e-09, + "loss": 0.1704, + "step": 17059 + }, + { + "epoch": 4.539648749334752, + "grad_norm": 0.2922990918159485, + "learning_rate": 4.242769280167136e-09, + "loss": 0.1826, + "step": 17060 + }, + { + "epoch": 4.539914848323576, + "grad_norm": 0.35070931911468506, + "learning_rate": 4.237903505693752e-09, + "loss": 0.1636, + "step": 17061 + }, + { + "epoch": 4.5401809473124, + "grad_norm": 0.3279281258583069, + "learning_rate": 4.233040462549964e-09, + "loss": 0.1832, + "step": 17062 + }, + { + "epoch": 4.5404470463012245, + "grad_norm": 0.27518969774246216, + "learning_rate": 4.2281801508744716e-09, + "loss": 0.1637, + "step": 17063 + }, + { + "epoch": 4.540713145290048, + "grad_norm": 0.2742588222026825, + "learning_rate": 4.2233225708059096e-09, + "loss": 0.1708, + "step": 17064 + }, + { + "epoch": 4.540979244278872, + "grad_norm": 0.2805440127849579, + "learning_rate": 4.218467722482799e-09, + "loss": 0.1759, + "step": 17065 + }, + { + "epoch": 4.541245343267695, + "grad_norm": 0.2619366943836212, + "learning_rate": 4.213615606043641e-09, + "loss": 0.1592, + "step": 17066 + }, + { + "epoch": 4.541511442256519, + "grad_norm": 0.2774248421192169, + "learning_rate": 4.208766221626803e-09, + "loss": 0.1782, + "step": 17067 + }, + { + "epoch": 4.541777541245343, + "grad_norm": 0.34296298027038574, + "learning_rate": 4.203919569370618e-09, + "loss": 0.1836, + "step": 17068 + }, + { + "epoch": 4.5420436402341675, + "grad_norm": 0.3749605119228363, + "learning_rate": 4.1990756494132975e-09, + "loss": 0.1815, + "step": 17069 + }, + { + "epoch": 4.542309739222991, + "grad_norm": 0.2749269902706146, + "learning_rate": 4.194234461893043e-09, + "loss": 0.1683, + "step": 17070 + }, + { + "epoch": 4.542575838211815, + "grad_norm": 0.3465230166912079, + "learning_rate": 4.189396006947887e-09, + "loss": 0.1567, + "step": 17071 + }, + { + "epoch": 4.542841937200639, + "grad_norm": 0.3086376488208771, + "learning_rate": 4.184560284715866e-09, + "loss": 0.1773, + "step": 17072 + }, + { + "epoch": 4.543108036189462, + "grad_norm": 0.27276939153671265, + "learning_rate": 4.179727295334878e-09, + "loss": 0.1655, + "step": 17073 + }, + { + "epoch": 4.543374135178286, + "grad_norm": 0.2924775183200836, + "learning_rate": 4.174897038942782e-09, + "loss": 0.1831, + "step": 17074 + }, + { + "epoch": 4.5436402341671105, + "grad_norm": 0.3882361948490143, + "learning_rate": 4.170069515677366e-09, + "loss": 0.1757, + "step": 17075 + }, + { + "epoch": 4.543906333155934, + "grad_norm": 0.30141711235046387, + "learning_rate": 4.165244725676276e-09, + "loss": 0.164, + "step": 17076 + }, + { + "epoch": 4.544172432144758, + "grad_norm": 0.28921231627464294, + "learning_rate": 4.160422669077179e-09, + "loss": 0.1796, + "step": 17077 + }, + { + "epoch": 4.544438531133582, + "grad_norm": 0.28113383054733276, + "learning_rate": 4.155603346017578e-09, + "loss": 0.1813, + "step": 17078 + }, + { + "epoch": 4.544704630122405, + "grad_norm": 0.3956051468849182, + "learning_rate": 4.150786756634938e-09, + "loss": 0.1865, + "step": 17079 + }, + { + "epoch": 4.544970729111229, + "grad_norm": 0.35096418857574463, + "learning_rate": 4.145972901066619e-09, + "loss": 0.171, + "step": 17080 + }, + { + "epoch": 4.5452368281000535, + "grad_norm": 0.25683897733688354, + "learning_rate": 4.141161779449953e-09, + "loss": 0.1617, + "step": 17081 + }, + { + "epoch": 4.545502927088877, + "grad_norm": 0.26020124554634094, + "learning_rate": 4.136353391922143e-09, + "loss": 0.1732, + "step": 17082 + }, + { + "epoch": 4.545769026077701, + "grad_norm": 0.731862485408783, + "learning_rate": 4.1315477386203224e-09, + "loss": 0.1719, + "step": 17083 + }, + { + "epoch": 4.546035125066525, + "grad_norm": 0.2787435054779053, + "learning_rate": 4.126744819681582e-09, + "loss": 0.1871, + "step": 17084 + }, + { + "epoch": 4.546301224055348, + "grad_norm": 0.3147546648979187, + "learning_rate": 4.121944635242924e-09, + "loss": 0.1766, + "step": 17085 + }, + { + "epoch": 4.546567323044172, + "grad_norm": 0.2917976379394531, + "learning_rate": 4.117147185441216e-09, + "loss": 0.167, + "step": 17086 + }, + { + "epoch": 4.5468334220329965, + "grad_norm": 0.4411047697067261, + "learning_rate": 4.1123524704133275e-09, + "loss": 0.1808, + "step": 17087 + }, + { + "epoch": 4.547099521021821, + "grad_norm": 0.2746748626232147, + "learning_rate": 4.107560490295992e-09, + "loss": 0.1757, + "step": 17088 + }, + { + "epoch": 4.547365620010644, + "grad_norm": 0.2873624265193939, + "learning_rate": 4.1027712452259e-09, + "loss": 0.1906, + "step": 17089 + }, + { + "epoch": 4.547631718999468, + "grad_norm": 0.2926301062107086, + "learning_rate": 4.097984735339621e-09, + "loss": 0.1728, + "step": 17090 + }, + { + "epoch": 4.547897817988291, + "grad_norm": 0.3015429377555847, + "learning_rate": 4.093200960773735e-09, + "loss": 0.1735, + "step": 17091 + }, + { + "epoch": 4.548163916977115, + "grad_norm": 0.2631898820400238, + "learning_rate": 4.088419921664621e-09, + "loss": 0.1658, + "step": 17092 + }, + { + "epoch": 4.548430015965939, + "grad_norm": 0.2837263345718384, + "learning_rate": 4.083641618148681e-09, + "loss": 0.1719, + "step": 17093 + }, + { + "epoch": 4.548696114954764, + "grad_norm": 0.2623350918292999, + "learning_rate": 4.078866050362207e-09, + "loss": 0.1726, + "step": 17094 + }, + { + "epoch": 4.548962213943587, + "grad_norm": 0.30197712779045105, + "learning_rate": 4.074093218441377e-09, + "loss": 0.1634, + "step": 17095 + }, + { + "epoch": 4.549228312932411, + "grad_norm": 0.27469757199287415, + "learning_rate": 4.0693231225223616e-09, + "loss": 0.1681, + "step": 17096 + }, + { + "epoch": 4.549494411921235, + "grad_norm": 0.28256452083587646, + "learning_rate": 4.064555762741173e-09, + "loss": 0.1694, + "step": 17097 + }, + { + "epoch": 4.549760510910058, + "grad_norm": 0.2746782600879669, + "learning_rate": 4.059791139233793e-09, + "loss": 0.1687, + "step": 17098 + }, + { + "epoch": 4.550026609898882, + "grad_norm": 0.33054444193840027, + "learning_rate": 4.055029252136144e-09, + "loss": 0.1727, + "step": 17099 + }, + { + "epoch": 4.5502927088877065, + "grad_norm": 0.33574795722961426, + "learning_rate": 4.050270101584041e-09, + "loss": 0.1794, + "step": 17100 + }, + { + "epoch": 4.55055880787653, + "grad_norm": 0.2775890827178955, + "learning_rate": 4.045513687713198e-09, + "loss": 0.1677, + "step": 17101 + }, + { + "epoch": 4.550824906865354, + "grad_norm": 0.27876076102256775, + "learning_rate": 4.040760010659305e-09, + "loss": 0.1916, + "step": 17102 + }, + { + "epoch": 4.551091005854178, + "grad_norm": 0.2863912880420685, + "learning_rate": 4.036009070557922e-09, + "loss": 0.1826, + "step": 17103 + }, + { + "epoch": 4.551357104843001, + "grad_norm": 0.3584041893482208, + "learning_rate": 4.031260867544572e-09, + "loss": 0.1994, + "step": 17104 + }, + { + "epoch": 4.551623203831825, + "grad_norm": 0.26324713230133057, + "learning_rate": 4.026515401754682e-09, + "loss": 0.1718, + "step": 17105 + }, + { + "epoch": 4.5518893028206495, + "grad_norm": 0.2746170163154602, + "learning_rate": 4.021772673323609e-09, + "loss": 0.1748, + "step": 17106 + }, + { + "epoch": 4.552155401809473, + "grad_norm": 0.2747848331928253, + "learning_rate": 4.017032682386601e-09, + "loss": 0.1679, + "step": 17107 + }, + { + "epoch": 4.552421500798297, + "grad_norm": 0.3293350338935852, + "learning_rate": 4.012295429078882e-09, + "loss": 0.1736, + "step": 17108 + }, + { + "epoch": 4.552687599787121, + "grad_norm": 0.2729966640472412, + "learning_rate": 4.007560913535546e-09, + "loss": 0.1655, + "step": 17109 + }, + { + "epoch": 4.552953698775944, + "grad_norm": 0.25013670325279236, + "learning_rate": 4.002829135891628e-09, + "loss": 0.1722, + "step": 17110 + }, + { + "epoch": 4.553219797764768, + "grad_norm": 0.2675703167915344, + "learning_rate": 3.99810009628212e-09, + "loss": 0.1653, + "step": 17111 + }, + { + "epoch": 4.5534858967535925, + "grad_norm": 0.366802841424942, + "learning_rate": 3.99337379484187e-09, + "loss": 0.1894, + "step": 17112 + }, + { + "epoch": 4.553751995742417, + "grad_norm": 0.3299425542354584, + "learning_rate": 3.988650231705693e-09, + "loss": 0.1787, + "step": 17113 + }, + { + "epoch": 4.55401809473124, + "grad_norm": 0.4451216161251068, + "learning_rate": 3.983929407008313e-09, + "loss": 0.1641, + "step": 17114 + }, + { + "epoch": 4.554284193720064, + "grad_norm": 0.4074093997478485, + "learning_rate": 3.9792113208843905e-09, + "loss": 0.176, + "step": 17115 + }, + { + "epoch": 4.554550292708887, + "grad_norm": 0.26508229970932007, + "learning_rate": 3.974495973468484e-09, + "loss": 0.1706, + "step": 17116 + }, + { + "epoch": 4.554816391697711, + "grad_norm": 0.2802693843841553, + "learning_rate": 3.969783364895085e-09, + "loss": 0.1707, + "step": 17117 + }, + { + "epoch": 4.5550824906865355, + "grad_norm": 0.4081586003303528, + "learning_rate": 3.965073495298599e-09, + "loss": 0.1936, + "step": 17118 + }, + { + "epoch": 4.55534858967536, + "grad_norm": 0.354564368724823, + "learning_rate": 3.960366364813373e-09, + "loss": 0.1859, + "step": 17119 + }, + { + "epoch": 4.555614688664183, + "grad_norm": 0.28679969906806946, + "learning_rate": 3.955661973573654e-09, + "loss": 0.1899, + "step": 17120 + }, + { + "epoch": 4.555880787653007, + "grad_norm": 0.3160500228404999, + "learning_rate": 3.950960321713648e-09, + "loss": 0.1763, + "step": 17121 + }, + { + "epoch": 4.556146886641831, + "grad_norm": 0.2900695204734802, + "learning_rate": 3.946261409367413e-09, + "loss": 0.184, + "step": 17122 + }, + { + "epoch": 4.556412985630654, + "grad_norm": 0.25957179069519043, + "learning_rate": 3.9415652366690086e-09, + "loss": 0.1695, + "step": 17123 + }, + { + "epoch": 4.5566790846194785, + "grad_norm": 0.44183629751205444, + "learning_rate": 3.936871803752351e-09, + "loss": 0.1907, + "step": 17124 + }, + { + "epoch": 4.556945183608303, + "grad_norm": 0.273320734500885, + "learning_rate": 3.932181110751331e-09, + "loss": 0.1643, + "step": 17125 + }, + { + "epoch": 4.557211282597126, + "grad_norm": 0.25361600518226624, + "learning_rate": 3.927493157799722e-09, + "loss": 0.1571, + "step": 17126 + }, + { + "epoch": 4.55747738158595, + "grad_norm": 0.44805222749710083, + "learning_rate": 3.922807945031226e-09, + "loss": 0.1877, + "step": 17127 + }, + { + "epoch": 4.557743480574774, + "grad_norm": 0.2744453549385071, + "learning_rate": 3.918125472579492e-09, + "loss": 0.1764, + "step": 17128 + }, + { + "epoch": 4.558009579563597, + "grad_norm": 0.31441715359687805, + "learning_rate": 3.913445740578059e-09, + "loss": 0.1901, + "step": 17129 + }, + { + "epoch": 4.558275678552421, + "grad_norm": 0.41978123784065247, + "learning_rate": 3.908768749160429e-09, + "loss": 0.2217, + "step": 17130 + }, + { + "epoch": 4.558541777541246, + "grad_norm": 0.3043372333049774, + "learning_rate": 3.904094498459964e-09, + "loss": 0.1738, + "step": 17131 + }, + { + "epoch": 4.558807876530069, + "grad_norm": 0.29070261120796204, + "learning_rate": 3.899422988610024e-09, + "loss": 0.1719, + "step": 17132 + }, + { + "epoch": 4.559073975518893, + "grad_norm": 0.3203783333301544, + "learning_rate": 3.894754219743801e-09, + "loss": 0.1579, + "step": 17133 + }, + { + "epoch": 4.559340074507717, + "grad_norm": 0.2640939950942993, + "learning_rate": 3.890088191994489e-09, + "loss": 0.161, + "step": 17134 + }, + { + "epoch": 4.55960617349654, + "grad_norm": 0.3586614727973938, + "learning_rate": 3.885424905495171e-09, + "loss": 0.2058, + "step": 17135 + }, + { + "epoch": 4.559872272485364, + "grad_norm": 0.5158207416534424, + "learning_rate": 3.880764360378852e-09, + "loss": 0.2005, + "step": 17136 + }, + { + "epoch": 4.5601383714741885, + "grad_norm": 0.3126353323459625, + "learning_rate": 3.876106556778447e-09, + "loss": 0.1653, + "step": 17137 + }, + { + "epoch": 4.560404470463013, + "grad_norm": 0.2620187997817993, + "learning_rate": 3.871451494826828e-09, + "loss": 0.1616, + "step": 17138 + }, + { + "epoch": 4.560670569451836, + "grad_norm": 0.3005579710006714, + "learning_rate": 3.866799174656743e-09, + "loss": 0.1754, + "step": 17139 + }, + { + "epoch": 4.56093666844066, + "grad_norm": 0.2895961403846741, + "learning_rate": 3.862149596400899e-09, + "loss": 0.1855, + "step": 17140 + }, + { + "epoch": 4.561202767429483, + "grad_norm": 0.32468512654304504, + "learning_rate": 3.857502760191911e-09, + "loss": 0.1914, + "step": 17141 + }, + { + "epoch": 4.561468866418307, + "grad_norm": 0.28161659836769104, + "learning_rate": 3.8528586661623286e-09, + "loss": 0.1852, + "step": 17142 + }, + { + "epoch": 4.5617349654071315, + "grad_norm": 0.37595611810684204, + "learning_rate": 3.848217314444579e-09, + "loss": 0.217, + "step": 17143 + }, + { + "epoch": 4.562001064395956, + "grad_norm": 0.2706446051597595, + "learning_rate": 3.84357870517108e-09, + "loss": 0.1608, + "step": 17144 + }, + { + "epoch": 4.562267163384779, + "grad_norm": 0.3490734398365021, + "learning_rate": 3.838942838474102e-09, + "loss": 0.1929, + "step": 17145 + }, + { + "epoch": 4.562533262373603, + "grad_norm": 0.27128830552101135, + "learning_rate": 3.834309714485895e-09, + "loss": 0.176, + "step": 17146 + }, + { + "epoch": 4.562799361362427, + "grad_norm": 0.28429535031318665, + "learning_rate": 3.8296793333385755e-09, + "loss": 0.19, + "step": 17147 + }, + { + "epoch": 4.56306546035125, + "grad_norm": 0.5361369848251343, + "learning_rate": 3.825051695164239e-09, + "loss": 0.1864, + "step": 17148 + }, + { + "epoch": 4.5633315593400745, + "grad_norm": 0.30578920245170593, + "learning_rate": 3.820426800094856e-09, + "loss": 0.1753, + "step": 17149 + }, + { + "epoch": 4.563597658328899, + "grad_norm": 0.29372233152389526, + "learning_rate": 3.815804648262344e-09, + "loss": 0.1711, + "step": 17150 + }, + { + "epoch": 4.563863757317722, + "grad_norm": 0.669001579284668, + "learning_rate": 3.811185239798565e-09, + "loss": 0.1775, + "step": 17151 + }, + { + "epoch": 4.564129856306546, + "grad_norm": 0.24920035898685455, + "learning_rate": 3.806568574835223e-09, + "loss": 0.1497, + "step": 17152 + }, + { + "epoch": 4.56439595529537, + "grad_norm": 0.26096072793006897, + "learning_rate": 3.801954653504036e-09, + "loss": 0.1681, + "step": 17153 + }, + { + "epoch": 4.564662054284193, + "grad_norm": 0.3083858788013458, + "learning_rate": 3.797343475936576e-09, + "loss": 0.1687, + "step": 17154 + }, + { + "epoch": 4.5649281532730175, + "grad_norm": 0.30739498138427734, + "learning_rate": 3.792735042264372e-09, + "loss": 0.1873, + "step": 17155 + }, + { + "epoch": 4.565194252261842, + "grad_norm": 0.4345802664756775, + "learning_rate": 3.7881293526188625e-09, + "loss": 0.2021, + "step": 17156 + }, + { + "epoch": 4.565460351250666, + "grad_norm": 0.2777957320213318, + "learning_rate": 3.783526407131432e-09, + "loss": 0.1718, + "step": 17157 + }, + { + "epoch": 4.565726450239489, + "grad_norm": 0.29905858635902405, + "learning_rate": 3.778926205933342e-09, + "loss": 0.1606, + "step": 17158 + }, + { + "epoch": 4.565992549228313, + "grad_norm": 0.27418801188468933, + "learning_rate": 3.7743287491558086e-09, + "loss": 0.1708, + "step": 17159 + }, + { + "epoch": 4.566258648217136, + "grad_norm": 0.2839604616165161, + "learning_rate": 3.769734036929961e-09, + "loss": 0.1792, + "step": 17160 + }, + { + "epoch": 4.5665247472059605, + "grad_norm": 0.296817809343338, + "learning_rate": 3.765142069386851e-09, + "loss": 0.1822, + "step": 17161 + }, + { + "epoch": 4.566790846194785, + "grad_norm": 0.26161813735961914, + "learning_rate": 3.760552846657439e-09, + "loss": 0.1766, + "step": 17162 + }, + { + "epoch": 4.567056945183609, + "grad_norm": 0.2808796465396881, + "learning_rate": 3.755966368872621e-09, + "loss": 0.17, + "step": 17163 + }, + { + "epoch": 4.567323044172432, + "grad_norm": 0.32435622811317444, + "learning_rate": 3.751382636163225e-09, + "loss": 0.1801, + "step": 17164 + }, + { + "epoch": 4.567589143161256, + "grad_norm": 0.26589953899383545, + "learning_rate": 3.746801648659981e-09, + "loss": 0.1635, + "step": 17165 + }, + { + "epoch": 4.56785524215008, + "grad_norm": 0.27512887120246887, + "learning_rate": 3.742223406493561e-09, + "loss": 0.1634, + "step": 17166 + }, + { + "epoch": 4.5681213411389034, + "grad_norm": 0.2820441722869873, + "learning_rate": 3.737647909794528e-09, + "loss": 0.1913, + "step": 17167 + }, + { + "epoch": 4.568387440127728, + "grad_norm": 0.3129112124443054, + "learning_rate": 3.7330751586934e-09, + "loss": 0.1731, + "step": 17168 + }, + { + "epoch": 4.568653539116552, + "grad_norm": 0.2735184133052826, + "learning_rate": 3.728505153320583e-09, + "loss": 0.1725, + "step": 17169 + }, + { + "epoch": 4.568919638105375, + "grad_norm": 0.3000975251197815, + "learning_rate": 3.723937893806428e-09, + "loss": 0.1707, + "step": 17170 + }, + { + "epoch": 4.569185737094199, + "grad_norm": 0.381985068321228, + "learning_rate": 3.71937338028121e-09, + "loss": 0.1836, + "step": 17171 + }, + { + "epoch": 4.569451836083023, + "grad_norm": 0.3414268493652344, + "learning_rate": 3.7148116128751237e-09, + "loss": 0.176, + "step": 17172 + }, + { + "epoch": 4.569717935071846, + "grad_norm": 0.41584086418151855, + "learning_rate": 3.710252591718255e-09, + "loss": 0.1705, + "step": 17173 + }, + { + "epoch": 4.5699840340606706, + "grad_norm": 0.28138965368270874, + "learning_rate": 3.7056963169406653e-09, + "loss": 0.1937, + "step": 17174 + }, + { + "epoch": 4.570250133049495, + "grad_norm": 0.2438502162694931, + "learning_rate": 3.701142788672296e-09, + "loss": 0.1605, + "step": 17175 + }, + { + "epoch": 4.570516232038318, + "grad_norm": 0.36550408601760864, + "learning_rate": 3.69659200704302e-09, + "loss": 0.192, + "step": 17176 + }, + { + "epoch": 4.570782331027142, + "grad_norm": 0.3918505311012268, + "learning_rate": 3.6920439721826344e-09, + "loss": 0.1805, + "step": 17177 + }, + { + "epoch": 4.571048430015966, + "grad_norm": 0.3848244249820709, + "learning_rate": 3.6874986842208577e-09, + "loss": 0.1867, + "step": 17178 + }, + { + "epoch": 4.571314529004789, + "grad_norm": 0.33605480194091797, + "learning_rate": 3.682956143287341e-09, + "loss": 0.1878, + "step": 17179 + }, + { + "epoch": 4.5715806279936135, + "grad_norm": 0.37452933192253113, + "learning_rate": 3.678416349511648e-09, + "loss": 0.1753, + "step": 17180 + }, + { + "epoch": 4.571846726982438, + "grad_norm": 0.279263436794281, + "learning_rate": 3.6738793030232417e-09, + "loss": 0.1776, + "step": 17181 + }, + { + "epoch": 4.572112825971262, + "grad_norm": 0.33886316418647766, + "learning_rate": 3.6693450039515626e-09, + "loss": 0.1818, + "step": 17182 + }, + { + "epoch": 4.572378924960085, + "grad_norm": 0.2646388113498688, + "learning_rate": 3.6648134524258967e-09, + "loss": 0.1633, + "step": 17183 + }, + { + "epoch": 4.572645023948909, + "grad_norm": 0.363692045211792, + "learning_rate": 3.660284648575518e-09, + "loss": 0.1782, + "step": 17184 + }, + { + "epoch": 4.572911122937732, + "grad_norm": 0.3287922143936157, + "learning_rate": 3.6557585925295898e-09, + "loss": 0.1831, + "step": 17185 + }, + { + "epoch": 4.5731772219265565, + "grad_norm": 0.2923298180103302, + "learning_rate": 3.651235284417209e-09, + "loss": 0.1979, + "step": 17186 + }, + { + "epoch": 4.573443320915381, + "grad_norm": 0.3287260830402374, + "learning_rate": 3.6467147243673946e-09, + "loss": 0.1864, + "step": 17187 + }, + { + "epoch": 4.573709419904205, + "grad_norm": 0.3635883927345276, + "learning_rate": 3.6421969125090656e-09, + "loss": 0.1746, + "step": 17188 + }, + { + "epoch": 4.573975518893028, + "grad_norm": 0.3638024926185608, + "learning_rate": 3.637681848971108e-09, + "loss": 0.1818, + "step": 17189 + }, + { + "epoch": 4.574241617881852, + "grad_norm": 0.2915589213371277, + "learning_rate": 3.6331695338822634e-09, + "loss": 0.1736, + "step": 17190 + }, + { + "epoch": 4.574507716870676, + "grad_norm": 0.32462966442108154, + "learning_rate": 3.6286599673712616e-09, + "loss": 0.1641, + "step": 17191 + }, + { + "epoch": 4.5747738158594995, + "grad_norm": 0.2734721601009369, + "learning_rate": 3.6241531495667e-09, + "loss": 0.1705, + "step": 17192 + }, + { + "epoch": 4.575039914848324, + "grad_norm": 0.4201320707798004, + "learning_rate": 3.6196490805971537e-09, + "loss": 0.1679, + "step": 17193 + }, + { + "epoch": 4.575306013837148, + "grad_norm": 0.3044228255748749, + "learning_rate": 3.615147760591053e-09, + "loss": 0.168, + "step": 17194 + }, + { + "epoch": 4.575572112825971, + "grad_norm": 0.33629900217056274, + "learning_rate": 3.6106491896768177e-09, + "loss": 0.1643, + "step": 17195 + }, + { + "epoch": 4.575838211814795, + "grad_norm": 0.28277477622032166, + "learning_rate": 3.6061533679827227e-09, + "loss": 0.1659, + "step": 17196 + }, + { + "epoch": 4.576104310803619, + "grad_norm": 0.30563369393348694, + "learning_rate": 3.6016602956370324e-09, + "loss": 0.1939, + "step": 17197 + }, + { + "epoch": 4.5763704097924425, + "grad_norm": 0.4158085286617279, + "learning_rate": 3.5971699727678774e-09, + "loss": 0.1812, + "step": 17198 + }, + { + "epoch": 4.576636508781267, + "grad_norm": 0.2986198663711548, + "learning_rate": 3.592682399503322e-09, + "loss": 0.1715, + "step": 17199 + }, + { + "epoch": 4.576902607770091, + "grad_norm": 0.27219879627227783, + "learning_rate": 3.5881975759713745e-09, + "loss": 0.1789, + "step": 17200 + }, + { + "epoch": 4.577168706758914, + "grad_norm": 0.2973845899105072, + "learning_rate": 3.5837155022999665e-09, + "loss": 0.1902, + "step": 17201 + }, + { + "epoch": 4.577434805747738, + "grad_norm": 0.32961615920066833, + "learning_rate": 3.5792361786169065e-09, + "loss": 0.1683, + "step": 17202 + }, + { + "epoch": 4.577700904736562, + "grad_norm": 0.29464027285575867, + "learning_rate": 3.5747596050499708e-09, + "loss": 0.1725, + "step": 17203 + }, + { + "epoch": 4.5779670037253855, + "grad_norm": 0.3277115821838379, + "learning_rate": 3.570285781726845e-09, + "loss": 0.1633, + "step": 17204 + }, + { + "epoch": 4.57823310271421, + "grad_norm": 0.24348661303520203, + "learning_rate": 3.5658147087751058e-09, + "loss": 0.1498, + "step": 17205 + }, + { + "epoch": 4.578499201703034, + "grad_norm": 0.28100672364234924, + "learning_rate": 3.561346386322306e-09, + "loss": 0.1647, + "step": 17206 + }, + { + "epoch": 4.578765300691858, + "grad_norm": 0.2963910698890686, + "learning_rate": 3.5568808144958663e-09, + "loss": 0.1846, + "step": 17207 + }, + { + "epoch": 4.579031399680681, + "grad_norm": 0.34784334897994995, + "learning_rate": 3.5524179934231955e-09, + "loss": 0.1713, + "step": 17208 + }, + { + "epoch": 4.579297498669505, + "grad_norm": 0.27428779006004333, + "learning_rate": 3.547957923231526e-09, + "loss": 0.163, + "step": 17209 + }, + { + "epoch": 4.579563597658328, + "grad_norm": 0.35554808378219604, + "learning_rate": 3.543500604048122e-09, + "loss": 0.1691, + "step": 17210 + }, + { + "epoch": 4.579829696647153, + "grad_norm": 0.45124107599258423, + "learning_rate": 3.5390460360000708e-09, + "loss": 0.1881, + "step": 17211 + }, + { + "epoch": 4.580095795635977, + "grad_norm": 0.4250081777572632, + "learning_rate": 3.5345942192144597e-09, + "loss": 0.1693, + "step": 17212 + }, + { + "epoch": 4.580361894624801, + "grad_norm": 0.3425264358520508, + "learning_rate": 3.5301451538182313e-09, + "loss": 0.1742, + "step": 17213 + }, + { + "epoch": 4.580627993613624, + "grad_norm": 0.26878821849823, + "learning_rate": 3.525698839938307e-09, + "loss": 0.1706, + "step": 17214 + }, + { + "epoch": 4.580894092602448, + "grad_norm": 0.3009617328643799, + "learning_rate": 3.521255277701496e-09, + "loss": 0.1687, + "step": 17215 + }, + { + "epoch": 4.581160191591272, + "grad_norm": 0.41669151186943054, + "learning_rate": 3.516814467234541e-09, + "loss": 0.1898, + "step": 17216 + }, + { + "epoch": 4.5814262905800955, + "grad_norm": 0.25428175926208496, + "learning_rate": 3.5123764086640973e-09, + "loss": 0.1555, + "step": 17217 + }, + { + "epoch": 4.58169238956892, + "grad_norm": 0.47188621759414673, + "learning_rate": 3.5079411021167628e-09, + "loss": 0.1785, + "step": 17218 + }, + { + "epoch": 4.581958488557744, + "grad_norm": 0.46556952595710754, + "learning_rate": 3.503508547719014e-09, + "loss": 0.1847, + "step": 17219 + }, + { + "epoch": 4.582224587546567, + "grad_norm": 0.3612276613712311, + "learning_rate": 3.4990787455972947e-09, + "loss": 0.1821, + "step": 17220 + }, + { + "epoch": 4.582490686535391, + "grad_norm": 0.2824997901916504, + "learning_rate": 3.4946516958779483e-09, + "loss": 0.1739, + "step": 17221 + }, + { + "epoch": 4.582756785524215, + "grad_norm": 0.33576318621635437, + "learning_rate": 3.49022739868724e-09, + "loss": 0.1701, + "step": 17222 + }, + { + "epoch": 4.5830228845130385, + "grad_norm": 0.27386218309402466, + "learning_rate": 3.4858058541513803e-09, + "loss": 0.1727, + "step": 17223 + }, + { + "epoch": 4.583288983501863, + "grad_norm": 0.4846923053264618, + "learning_rate": 3.481387062396446e-09, + "loss": 0.1703, + "step": 17224 + }, + { + "epoch": 4.583555082490687, + "grad_norm": 0.38170841336250305, + "learning_rate": 3.476971023548503e-09, + "loss": 0.1692, + "step": 17225 + }, + { + "epoch": 4.58382118147951, + "grad_norm": 0.2902805805206299, + "learning_rate": 3.4725577377334835e-09, + "loss": 0.1564, + "step": 17226 + }, + { + "epoch": 4.584087280468334, + "grad_norm": 0.28674641251564026, + "learning_rate": 3.4681472050772766e-09, + "loss": 0.1755, + "step": 17227 + }, + { + "epoch": 4.584353379457158, + "grad_norm": 0.30090758204460144, + "learning_rate": 3.463739425705647e-09, + "loss": 0.1825, + "step": 17228 + }, + { + "epoch": 4.5846194784459815, + "grad_norm": 0.2793358266353607, + "learning_rate": 3.4593343997443735e-09, + "loss": 0.1673, + "step": 17229 + }, + { + "epoch": 4.584885577434806, + "grad_norm": 0.25849321484565735, + "learning_rate": 3.4549321273190435e-09, + "loss": 0.1583, + "step": 17230 + }, + { + "epoch": 4.58515167642363, + "grad_norm": 0.3953934609889984, + "learning_rate": 3.4505326085552454e-09, + "loss": 0.1726, + "step": 17231 + }, + { + "epoch": 4.585417775412454, + "grad_norm": 0.31246650218963623, + "learning_rate": 3.4461358435784573e-09, + "loss": 0.1744, + "step": 17232 + }, + { + "epoch": 4.585683874401277, + "grad_norm": 0.34168195724487305, + "learning_rate": 3.441741832514078e-09, + "loss": 0.1827, + "step": 17233 + }, + { + "epoch": 4.585949973390101, + "grad_norm": 0.30507898330688477, + "learning_rate": 3.437350575487441e-09, + "loss": 0.1755, + "step": 17234 + }, + { + "epoch": 4.5862160723789245, + "grad_norm": 0.37239307165145874, + "learning_rate": 3.4329620726237797e-09, + "loss": 0.1858, + "step": 17235 + }, + { + "epoch": 4.586482171367749, + "grad_norm": 0.2886366546154022, + "learning_rate": 3.4285763240482822e-09, + "loss": 0.1731, + "step": 17236 + }, + { + "epoch": 4.586748270356573, + "grad_norm": 0.28622978925704956, + "learning_rate": 3.424193329886027e-09, + "loss": 0.1866, + "step": 17237 + }, + { + "epoch": 4.587014369345397, + "grad_norm": 0.2688523828983307, + "learning_rate": 3.419813090262036e-09, + "loss": 0.1675, + "step": 17238 + }, + { + "epoch": 4.58728046833422, + "grad_norm": 1.4039353132247925, + "learning_rate": 3.41543560530122e-09, + "loss": 0.1742, + "step": 17239 + }, + { + "epoch": 4.587546567323044, + "grad_norm": 0.901323676109314, + "learning_rate": 3.4110608751284796e-09, + "loss": 0.1969, + "step": 17240 + }, + { + "epoch": 4.587812666311868, + "grad_norm": 0.29017460346221924, + "learning_rate": 3.4066888998685374e-09, + "loss": 0.1751, + "step": 17241 + }, + { + "epoch": 4.588078765300692, + "grad_norm": 0.27208641171455383, + "learning_rate": 3.4023196796461153e-09, + "loss": 0.1831, + "step": 17242 + }, + { + "epoch": 4.588344864289516, + "grad_norm": 0.29499608278274536, + "learning_rate": 3.397953214585836e-09, + "loss": 0.1854, + "step": 17243 + }, + { + "epoch": 4.58861096327834, + "grad_norm": 0.2581513524055481, + "learning_rate": 3.3935895048122443e-09, + "loss": 0.1634, + "step": 17244 + }, + { + "epoch": 4.588877062267163, + "grad_norm": 0.25168168544769287, + "learning_rate": 3.3892285504497743e-09, + "loss": 0.1618, + "step": 17245 + }, + { + "epoch": 4.589143161255987, + "grad_norm": 0.27811989188194275, + "learning_rate": 3.3848703516228484e-09, + "loss": 0.1813, + "step": 17246 + }, + { + "epoch": 4.589409260244811, + "grad_norm": 0.4680911600589752, + "learning_rate": 3.380514908455745e-09, + "loss": 0.1723, + "step": 17247 + }, + { + "epoch": 4.589675359233635, + "grad_norm": 0.33528006076812744, + "learning_rate": 3.376162221072698e-09, + "loss": 0.1989, + "step": 17248 + }, + { + "epoch": 4.589941458222459, + "grad_norm": 0.29717588424682617, + "learning_rate": 3.3718122895978527e-09, + "loss": 0.1732, + "step": 17249 + }, + { + "epoch": 4.590207557211283, + "grad_norm": 0.2817237079143524, + "learning_rate": 3.367465114155266e-09, + "loss": 0.1729, + "step": 17250 + }, + { + "epoch": 4.590473656200106, + "grad_norm": 0.3986896276473999, + "learning_rate": 3.3631206948689483e-09, + "loss": 0.1792, + "step": 17251 + }, + { + "epoch": 4.59073975518893, + "grad_norm": 0.410441517829895, + "learning_rate": 3.3587790318628127e-09, + "loss": 0.1712, + "step": 17252 + }, + { + "epoch": 4.591005854177754, + "grad_norm": 0.31687867641448975, + "learning_rate": 3.3544401252606714e-09, + "loss": 0.176, + "step": 17253 + }, + { + "epoch": 4.5912719531665775, + "grad_norm": 0.33127573132514954, + "learning_rate": 3.3501039751863136e-09, + "loss": 0.171, + "step": 17254 + }, + { + "epoch": 4.591538052155402, + "grad_norm": 0.2813379764556885, + "learning_rate": 3.345770581763374e-09, + "loss": 0.1742, + "step": 17255 + }, + { + "epoch": 4.591804151144226, + "grad_norm": 0.2589178681373596, + "learning_rate": 3.341439945115465e-09, + "loss": 0.1658, + "step": 17256 + }, + { + "epoch": 4.59207025013305, + "grad_norm": 0.27357217669487, + "learning_rate": 3.33711206536611e-09, + "loss": 0.1777, + "step": 17257 + }, + { + "epoch": 4.592336349121873, + "grad_norm": 0.2788732945919037, + "learning_rate": 3.3327869426387434e-09, + "loss": 0.1601, + "step": 17258 + }, + { + "epoch": 4.592602448110697, + "grad_norm": 0.27918192744255066, + "learning_rate": 3.328464577056744e-09, + "loss": 0.1748, + "step": 17259 + }, + { + "epoch": 4.5928685470995205, + "grad_norm": 0.28348878026008606, + "learning_rate": 3.3241449687433696e-09, + "loss": 0.186, + "step": 17260 + }, + { + "epoch": 4.593134646088345, + "grad_norm": 0.3637886941432953, + "learning_rate": 3.3198281178218544e-09, + "loss": 0.1639, + "step": 17261 + }, + { + "epoch": 4.593400745077169, + "grad_norm": 0.29357877373695374, + "learning_rate": 3.315514024415278e-09, + "loss": 0.1911, + "step": 17262 + }, + { + "epoch": 4.593666844065993, + "grad_norm": 0.3030543327331543, + "learning_rate": 3.3112026886467413e-09, + "loss": 0.1705, + "step": 17263 + }, + { + "epoch": 4.593932943054816, + "grad_norm": 0.2846691310405731, + "learning_rate": 3.306894110639169e-09, + "loss": 0.1703, + "step": 17264 + }, + { + "epoch": 4.59419904204364, + "grad_norm": 0.2860594391822815, + "learning_rate": 3.3025882905154625e-09, + "loss": 0.1732, + "step": 17265 + }, + { + "epoch": 4.594465141032464, + "grad_norm": 0.38175371289253235, + "learning_rate": 3.2982852283984342e-09, + "loss": 0.2009, + "step": 17266 + }, + { + "epoch": 4.594731240021288, + "grad_norm": 0.2972368896007538, + "learning_rate": 3.293984924410831e-09, + "loss": 0.1707, + "step": 17267 + }, + { + "epoch": 4.594997339010112, + "grad_norm": 0.2700854241847992, + "learning_rate": 3.2896873786752876e-09, + "loss": 0.1604, + "step": 17268 + }, + { + "epoch": 4.595263437998936, + "grad_norm": 0.32784515619277954, + "learning_rate": 3.2853925913143955e-09, + "loss": 0.1617, + "step": 17269 + }, + { + "epoch": 4.595529536987759, + "grad_norm": 0.4341753423213959, + "learning_rate": 3.281100562450623e-09, + "loss": 0.1681, + "step": 17270 + }, + { + "epoch": 4.595795635976583, + "grad_norm": 0.24912340939044952, + "learning_rate": 3.2768112922064163e-09, + "loss": 0.1582, + "step": 17271 + }, + { + "epoch": 4.596061734965407, + "grad_norm": 0.2666376829147339, + "learning_rate": 3.272524780704089e-09, + "loss": 0.1693, + "step": 17272 + }, + { + "epoch": 4.596327833954231, + "grad_norm": 0.2524784207344055, + "learning_rate": 3.2682410280659323e-09, + "loss": 0.1573, + "step": 17273 + }, + { + "epoch": 4.596593932943055, + "grad_norm": 0.25580573081970215, + "learning_rate": 3.2639600344141036e-09, + "loss": 0.1641, + "step": 17274 + }, + { + "epoch": 4.596860031931879, + "grad_norm": 0.2854941487312317, + "learning_rate": 3.259681799870706e-09, + "loss": 0.1879, + "step": 17275 + }, + { + "epoch": 4.597126130920703, + "grad_norm": 0.29179903864860535, + "learning_rate": 3.2554063245577745e-09, + "loss": 0.1805, + "step": 17276 + }, + { + "epoch": 4.597392229909526, + "grad_norm": 0.2702235281467438, + "learning_rate": 3.251133608597245e-09, + "loss": 0.1547, + "step": 17277 + }, + { + "epoch": 4.59765832889835, + "grad_norm": 0.3155938684940338, + "learning_rate": 3.246863652110998e-09, + "loss": 0.1646, + "step": 17278 + }, + { + "epoch": 4.597924427887174, + "grad_norm": 0.2918543517589569, + "learning_rate": 3.242596455220792e-09, + "loss": 0.1786, + "step": 17279 + }, + { + "epoch": 4.598190526875998, + "grad_norm": 0.379926472902298, + "learning_rate": 3.238332018048373e-09, + "loss": 0.207, + "step": 17280 + }, + { + "epoch": 4.598456625864822, + "grad_norm": 0.2620057761669159, + "learning_rate": 3.2340703407153446e-09, + "loss": 0.1715, + "step": 17281 + }, + { + "epoch": 4.598722724853646, + "grad_norm": 0.27239102125167847, + "learning_rate": 3.229811423343287e-09, + "loss": 0.1684, + "step": 17282 + }, + { + "epoch": 4.598988823842469, + "grad_norm": 0.26751938462257385, + "learning_rate": 3.2255552660536255e-09, + "loss": 0.1696, + "step": 17283 + }, + { + "epoch": 4.599254922831293, + "grad_norm": 0.27361297607421875, + "learning_rate": 3.221301868967807e-09, + "loss": 0.1616, + "step": 17284 + }, + { + "epoch": 4.5995210218201175, + "grad_norm": 0.27695518732070923, + "learning_rate": 3.217051232207113e-09, + "loss": 0.1709, + "step": 17285 + }, + { + "epoch": 4.599787120808941, + "grad_norm": 0.34917011857032776, + "learning_rate": 3.212803355892779e-09, + "loss": 0.198, + "step": 17286 + }, + { + "epoch": 4.600053219797765, + "grad_norm": 0.31177467107772827, + "learning_rate": 3.2085582401459866e-09, + "loss": 0.1848, + "step": 17287 + }, + { + "epoch": 4.600319318786589, + "grad_norm": 0.3138304054737091, + "learning_rate": 3.204315885087805e-09, + "loss": 0.183, + "step": 17288 + }, + { + "epoch": 4.600585417775412, + "grad_norm": 0.29587438702583313, + "learning_rate": 3.2000762908392266e-09, + "loss": 0.1871, + "step": 17289 + }, + { + "epoch": 4.600851516764236, + "grad_norm": 0.6566874384880066, + "learning_rate": 3.195839457521188e-09, + "loss": 0.1902, + "step": 17290 + }, + { + "epoch": 4.60111761575306, + "grad_norm": 0.3745863437652588, + "learning_rate": 3.1916053852545143e-09, + "loss": 0.1755, + "step": 17291 + }, + { + "epoch": 4.601383714741884, + "grad_norm": 0.268485426902771, + "learning_rate": 3.1873740741599873e-09, + "loss": 0.1642, + "step": 17292 + }, + { + "epoch": 4.601649813730708, + "grad_norm": 0.27279219031333923, + "learning_rate": 3.183145524358266e-09, + "loss": 0.1763, + "step": 17293 + }, + { + "epoch": 4.601915912719532, + "grad_norm": 0.2960175573825836, + "learning_rate": 3.178919735969987e-09, + "loss": 0.1838, + "step": 17294 + }, + { + "epoch": 4.602182011708355, + "grad_norm": 0.2742052972316742, + "learning_rate": 3.1746967091156763e-09, + "loss": 0.1654, + "step": 17295 + }, + { + "epoch": 4.602448110697179, + "grad_norm": 0.2950516939163208, + "learning_rate": 3.1704764439157595e-09, + "loss": 0.1844, + "step": 17296 + }, + { + "epoch": 4.602714209686003, + "grad_norm": 0.3022657036781311, + "learning_rate": 3.1662589404906405e-09, + "loss": 0.1925, + "step": 17297 + }, + { + "epoch": 4.602980308674827, + "grad_norm": 0.2693539559841156, + "learning_rate": 3.1620441989605674e-09, + "loss": 0.1732, + "step": 17298 + }, + { + "epoch": 4.603246407663651, + "grad_norm": 0.3723892569541931, + "learning_rate": 3.1578322194457998e-09, + "loss": 0.1688, + "step": 17299 + }, + { + "epoch": 4.603512506652475, + "grad_norm": 0.2724071741104126, + "learning_rate": 3.1536230020664413e-09, + "loss": 0.1693, + "step": 17300 + }, + { + "epoch": 4.603778605641299, + "grad_norm": 0.24894340336322784, + "learning_rate": 3.1494165469425517e-09, + "loss": 0.1497, + "step": 17301 + }, + { + "epoch": 4.604044704630122, + "grad_norm": 0.38714373111724854, + "learning_rate": 3.1452128541941012e-09, + "loss": 0.1706, + "step": 17302 + }, + { + "epoch": 4.604310803618946, + "grad_norm": 0.39696159958839417, + "learning_rate": 3.1410119239410284e-09, + "loss": 0.1882, + "step": 17303 + }, + { + "epoch": 4.60457690260777, + "grad_norm": 0.38187238574028015, + "learning_rate": 3.1368137563031026e-09, + "loss": 0.1842, + "step": 17304 + }, + { + "epoch": 4.604843001596594, + "grad_norm": 0.2748364210128784, + "learning_rate": 3.132618351400096e-09, + "loss": 0.1828, + "step": 17305 + }, + { + "epoch": 4.605109100585418, + "grad_norm": 0.2708585858345032, + "learning_rate": 3.1284257093516453e-09, + "loss": 0.1733, + "step": 17306 + }, + { + "epoch": 4.605375199574242, + "grad_norm": 0.3748894929885864, + "learning_rate": 3.1242358302773443e-09, + "loss": 0.1806, + "step": 17307 + }, + { + "epoch": 4.605641298563065, + "grad_norm": 0.37652787566185, + "learning_rate": 3.120048714296697e-09, + "loss": 0.1747, + "step": 17308 + }, + { + "epoch": 4.605907397551889, + "grad_norm": 0.3337678909301758, + "learning_rate": 3.1158643615291526e-09, + "loss": 0.1676, + "step": 17309 + }, + { + "epoch": 4.6061734965407135, + "grad_norm": 0.27832451462745667, + "learning_rate": 3.1116827720940155e-09, + "loss": 0.1753, + "step": 17310 + }, + { + "epoch": 4.606439595529537, + "grad_norm": 0.2578809857368469, + "learning_rate": 3.1075039461105677e-09, + "loss": 0.161, + "step": 17311 + }, + { + "epoch": 4.606705694518361, + "grad_norm": 0.2937382757663727, + "learning_rate": 3.1033278836980147e-09, + "loss": 0.1809, + "step": 17312 + }, + { + "epoch": 4.606971793507185, + "grad_norm": 0.24747700989246368, + "learning_rate": 3.0991545849754497e-09, + "loss": 0.1497, + "step": 17313 + }, + { + "epoch": 4.607237892496008, + "grad_norm": 0.27859246730804443, + "learning_rate": 3.094984050061911e-09, + "loss": 0.1836, + "step": 17314 + }, + { + "epoch": 4.607503991484832, + "grad_norm": 0.2770361602306366, + "learning_rate": 3.090816279076347e-09, + "loss": 0.1612, + "step": 17315 + }, + { + "epoch": 4.6077700904736565, + "grad_norm": 0.43424779176712036, + "learning_rate": 3.086651272137619e-09, + "loss": 0.2061, + "step": 17316 + }, + { + "epoch": 4.60803618946248, + "grad_norm": 0.35271206498146057, + "learning_rate": 3.0824890293645435e-09, + "loss": 0.1886, + "step": 17317 + }, + { + "epoch": 4.608302288451304, + "grad_norm": 0.27170833945274353, + "learning_rate": 3.0783295508758357e-09, + "loss": 0.1658, + "step": 17318 + }, + { + "epoch": 4.608568387440128, + "grad_norm": 0.2780643105506897, + "learning_rate": 3.074172836790112e-09, + "loss": 0.1588, + "step": 17319 + }, + { + "epoch": 4.608834486428951, + "grad_norm": 0.3021070957183838, + "learning_rate": 3.070018887225956e-09, + "loss": 0.1736, + "step": 17320 + }, + { + "epoch": 4.609100585417775, + "grad_norm": 0.40628063678741455, + "learning_rate": 3.0658677023018166e-09, + "loss": 0.189, + "step": 17321 + }, + { + "epoch": 4.6093666844065995, + "grad_norm": 0.2733278274536133, + "learning_rate": 3.0617192821361105e-09, + "loss": 0.1779, + "step": 17322 + }, + { + "epoch": 4.609632783395423, + "grad_norm": 0.2759440243244171, + "learning_rate": 3.057573626847154e-09, + "loss": 0.1735, + "step": 17323 + }, + { + "epoch": 4.609898882384247, + "grad_norm": 0.30488190054893494, + "learning_rate": 3.0534307365532086e-09, + "loss": 0.1525, + "step": 17324 + }, + { + "epoch": 4.610164981373071, + "grad_norm": 0.2651190757751465, + "learning_rate": 3.0492906113724125e-09, + "loss": 0.1832, + "step": 17325 + }, + { + "epoch": 4.610431080361895, + "grad_norm": 0.26420465111732483, + "learning_rate": 3.0451532514228828e-09, + "loss": 0.1633, + "step": 17326 + }, + { + "epoch": 4.610697179350718, + "grad_norm": 0.29997479915618896, + "learning_rate": 3.04101865682258e-09, + "loss": 0.1697, + "step": 17327 + }, + { + "epoch": 4.6109632783395424, + "grad_norm": 0.249830424785614, + "learning_rate": 3.0368868276894666e-09, + "loss": 0.1507, + "step": 17328 + }, + { + "epoch": 4.611229377328366, + "grad_norm": 0.3408641517162323, + "learning_rate": 3.032757764141369e-09, + "loss": 0.1872, + "step": 17329 + }, + { + "epoch": 4.61149547631719, + "grad_norm": 0.27346956729888916, + "learning_rate": 3.028631466296061e-09, + "loss": 0.1705, + "step": 17330 + }, + { + "epoch": 4.611761575306014, + "grad_norm": 0.36929264664649963, + "learning_rate": 3.0245079342712588e-09, + "loss": 0.177, + "step": 17331 + }, + { + "epoch": 4.612027674294838, + "grad_norm": 0.2699776291847229, + "learning_rate": 3.020387168184546e-09, + "loss": 0.1784, + "step": 17332 + }, + { + "epoch": 4.612293773283661, + "grad_norm": 0.332690566778183, + "learning_rate": 3.0162691681534733e-09, + "loss": 0.1627, + "step": 17333 + }, + { + "epoch": 4.612559872272485, + "grad_norm": 0.3428177237510681, + "learning_rate": 3.0121539342954692e-09, + "loss": 0.1943, + "step": 17334 + }, + { + "epoch": 4.6128259712613096, + "grad_norm": 0.2952798306941986, + "learning_rate": 3.00804146672794e-09, + "loss": 0.1847, + "step": 17335 + }, + { + "epoch": 4.613092070250133, + "grad_norm": 0.27099722623825073, + "learning_rate": 3.0039317655681464e-09, + "loss": 0.1741, + "step": 17336 + }, + { + "epoch": 4.613358169238957, + "grad_norm": 0.31355834007263184, + "learning_rate": 2.9998248309333285e-09, + "loss": 0.1771, + "step": 17337 + }, + { + "epoch": 4.613624268227781, + "grad_norm": 0.3431967794895172, + "learning_rate": 2.995720662940615e-09, + "loss": 0.172, + "step": 17338 + }, + { + "epoch": 4.613890367216604, + "grad_norm": 0.38333845138549805, + "learning_rate": 2.99161926170709e-09, + "loss": 0.1668, + "step": 17339 + }, + { + "epoch": 4.614156466205428, + "grad_norm": 0.3470374345779419, + "learning_rate": 2.9875206273497044e-09, + "loss": 0.1746, + "step": 17340 + }, + { + "epoch": 4.6144225651942525, + "grad_norm": 0.2628503739833832, + "learning_rate": 2.9834247599853756e-09, + "loss": 0.1624, + "step": 17341 + }, + { + "epoch": 4.614688664183076, + "grad_norm": 0.35901832580566406, + "learning_rate": 2.9793316597308994e-09, + "loss": 0.1727, + "step": 17342 + }, + { + "epoch": 4.6149547631719, + "grad_norm": 0.27471891045570374, + "learning_rate": 2.9752413267030596e-09, + "loss": 0.1657, + "step": 17343 + }, + { + "epoch": 4.615220862160724, + "grad_norm": 0.25881829857826233, + "learning_rate": 2.9711537610184854e-09, + "loss": 0.1604, + "step": 17344 + }, + { + "epoch": 4.615486961149547, + "grad_norm": 0.3562336266040802, + "learning_rate": 2.967068962793795e-09, + "loss": 0.1925, + "step": 17345 + }, + { + "epoch": 4.615753060138371, + "grad_norm": 0.341911643743515, + "learning_rate": 2.9629869321454613e-09, + "loss": 0.1869, + "step": 17346 + }, + { + "epoch": 4.6160191591271955, + "grad_norm": 0.2844598591327667, + "learning_rate": 2.9589076691899472e-09, + "loss": 0.1899, + "step": 17347 + }, + { + "epoch": 4.616285258116019, + "grad_norm": 0.33983153104782104, + "learning_rate": 2.9548311740435705e-09, + "loss": 0.1716, + "step": 17348 + }, + { + "epoch": 4.616551357104843, + "grad_norm": 0.3033377528190613, + "learning_rate": 2.950757446822616e-09, + "loss": 0.1645, + "step": 17349 + }, + { + "epoch": 4.616817456093667, + "grad_norm": 0.46977344155311584, + "learning_rate": 2.9466864876432795e-09, + "loss": 0.1819, + "step": 17350 + }, + { + "epoch": 4.617083555082491, + "grad_norm": 0.25919586420059204, + "learning_rate": 2.9426182966216573e-09, + "loss": 0.1681, + "step": 17351 + }, + { + "epoch": 4.617349654071314, + "grad_norm": 0.3377779722213745, + "learning_rate": 2.9385528738737895e-09, + "loss": 0.1956, + "step": 17352 + }, + { + "epoch": 4.6176157530601385, + "grad_norm": 0.2704828977584839, + "learning_rate": 2.9344902195156283e-09, + "loss": 0.1785, + "step": 17353 + }, + { + "epoch": 4.617881852048962, + "grad_norm": 0.38144057989120483, + "learning_rate": 2.93043033366307e-09, + "loss": 0.1869, + "step": 17354 + }, + { + "epoch": 4.618147951037786, + "grad_norm": 0.28384608030319214, + "learning_rate": 2.9263732164318766e-09, + "loss": 0.1707, + "step": 17355 + }, + { + "epoch": 4.61841405002661, + "grad_norm": 0.26926612854003906, + "learning_rate": 2.9223188679378007e-09, + "loss": 0.1634, + "step": 17356 + }, + { + "epoch": 4.618680149015434, + "grad_norm": 0.28828316926956177, + "learning_rate": 2.91826728829645e-09, + "loss": 0.1685, + "step": 17357 + }, + { + "epoch": 4.618946248004257, + "grad_norm": 0.3390130400657654, + "learning_rate": 2.914218477623387e-09, + "loss": 0.164, + "step": 17358 + }, + { + "epoch": 4.6192123469930815, + "grad_norm": 0.3947736918926239, + "learning_rate": 2.9101724360341084e-09, + "loss": 0.1749, + "step": 17359 + }, + { + "epoch": 4.619478445981906, + "grad_norm": 0.4037071466445923, + "learning_rate": 2.9061291636440223e-09, + "loss": 0.1988, + "step": 17360 + }, + { + "epoch": 4.619744544970729, + "grad_norm": 0.33253443241119385, + "learning_rate": 2.9020886605684138e-09, + "loss": 0.189, + "step": 17361 + }, + { + "epoch": 4.620010643959553, + "grad_norm": 0.27962934970855713, + "learning_rate": 2.8980509269225686e-09, + "loss": 0.1609, + "step": 17362 + }, + { + "epoch": 4.620276742948377, + "grad_norm": 0.462944895029068, + "learning_rate": 2.8940159628216277e-09, + "loss": 0.1899, + "step": 17363 + }, + { + "epoch": 4.6205428419372, + "grad_norm": 0.2615622878074646, + "learning_rate": 2.8899837683806773e-09, + "loss": 0.156, + "step": 17364 + }, + { + "epoch": 4.6208089409260245, + "grad_norm": 0.2735423147678375, + "learning_rate": 2.885954343714725e-09, + "loss": 0.1763, + "step": 17365 + }, + { + "epoch": 4.621075039914849, + "grad_norm": 0.35290926694869995, + "learning_rate": 2.88192768893869e-09, + "loss": 0.1932, + "step": 17366 + }, + { + "epoch": 4.621341138903672, + "grad_norm": 0.2663459777832031, + "learning_rate": 2.8779038041674366e-09, + "loss": 0.1853, + "step": 17367 + }, + { + "epoch": 4.621607237892496, + "grad_norm": 0.27299371361732483, + "learning_rate": 2.8738826895157277e-09, + "loss": 0.175, + "step": 17368 + }, + { + "epoch": 4.62187333688132, + "grad_norm": 0.32575398683547974, + "learning_rate": 2.8698643450982716e-09, + "loss": 0.1745, + "step": 17369 + }, + { + "epoch": 4.622139435870143, + "grad_norm": 0.3689294755458832, + "learning_rate": 2.8658487710296443e-09, + "loss": 0.1784, + "step": 17370 + }, + { + "epoch": 4.622405534858967, + "grad_norm": 0.2697029113769531, + "learning_rate": 2.8618359674244087e-09, + "loss": 0.1706, + "step": 17371 + }, + { + "epoch": 4.622671633847792, + "grad_norm": 0.29418033361434937, + "learning_rate": 2.8578259343969957e-09, + "loss": 0.1698, + "step": 17372 + }, + { + "epoch": 4.622937732836615, + "grad_norm": 0.3858593702316284, + "learning_rate": 2.853818672061781e-09, + "loss": 0.2028, + "step": 17373 + }, + { + "epoch": 4.623203831825439, + "grad_norm": 0.373800665140152, + "learning_rate": 2.8498141805330833e-09, + "loss": 0.176, + "step": 17374 + }, + { + "epoch": 4.623469930814263, + "grad_norm": 0.2801045775413513, + "learning_rate": 2.845812459925101e-09, + "loss": 0.1696, + "step": 17375 + }, + { + "epoch": 4.623736029803087, + "grad_norm": 0.2576066851615906, + "learning_rate": 2.841813510351976e-09, + "loss": 0.1551, + "step": 17376 + }, + { + "epoch": 4.62400212879191, + "grad_norm": 0.33379417657852173, + "learning_rate": 2.837817331927761e-09, + "loss": 0.1714, + "step": 17377 + }, + { + "epoch": 4.6242682277807345, + "grad_norm": 0.2931860089302063, + "learning_rate": 2.833823924766432e-09, + "loss": 0.1791, + "step": 17378 + }, + { + "epoch": 4.624534326769558, + "grad_norm": 0.25818684697151184, + "learning_rate": 2.8298332889819197e-09, + "loss": 0.1545, + "step": 17379 + }, + { + "epoch": 4.624800425758382, + "grad_norm": 0.3826529085636139, + "learning_rate": 2.825845424688e-09, + "loss": 0.1764, + "step": 17380 + }, + { + "epoch": 4.625066524747206, + "grad_norm": 0.2898176312446594, + "learning_rate": 2.8218603319984378e-09, + "loss": 0.1605, + "step": 17381 + }, + { + "epoch": 4.62533262373603, + "grad_norm": 0.2620013654232025, + "learning_rate": 2.8178780110268862e-09, + "loss": 0.1652, + "step": 17382 + }, + { + "epoch": 4.625598722724853, + "grad_norm": 0.41631585359573364, + "learning_rate": 2.8138984618869542e-09, + "loss": 0.1694, + "step": 17383 + }, + { + "epoch": 4.6258648217136775, + "grad_norm": 0.27195483446121216, + "learning_rate": 2.809921684692118e-09, + "loss": 0.1697, + "step": 17384 + }, + { + "epoch": 4.626130920702502, + "grad_norm": 0.2913331985473633, + "learning_rate": 2.80594767955582e-09, + "loss": 0.1783, + "step": 17385 + }, + { + "epoch": 4.626397019691325, + "grad_norm": 0.34158968925476074, + "learning_rate": 2.801976446591414e-09, + "loss": 0.1863, + "step": 17386 + }, + { + "epoch": 4.626663118680149, + "grad_norm": 0.4350281059741974, + "learning_rate": 2.798007985912132e-09, + "loss": 0.2057, + "step": 17387 + }, + { + "epoch": 4.626929217668973, + "grad_norm": 0.27964967489242554, + "learning_rate": 2.794042297631194e-09, + "loss": 0.1643, + "step": 17388 + }, + { + "epoch": 4.627195316657796, + "grad_norm": 0.4338390529155731, + "learning_rate": 2.79007938186171e-09, + "loss": 0.1893, + "step": 17389 + }, + { + "epoch": 4.6274614156466205, + "grad_norm": 0.35148707032203674, + "learning_rate": 2.7861192387167e-09, + "loss": 0.183, + "step": 17390 + }, + { + "epoch": 4.627727514635445, + "grad_norm": 0.2734745144844055, + "learning_rate": 2.7821618683091077e-09, + "loss": 0.1723, + "step": 17391 + }, + { + "epoch": 4.627993613624268, + "grad_norm": 0.2548164129257202, + "learning_rate": 2.778207270751831e-09, + "loss": 0.1734, + "step": 17392 + }, + { + "epoch": 4.628259712613092, + "grad_norm": 0.33138027787208557, + "learning_rate": 2.774255446157636e-09, + "loss": 0.1757, + "step": 17393 + }, + { + "epoch": 4.628525811601916, + "grad_norm": 0.27118954062461853, + "learning_rate": 2.7703063946392437e-09, + "loss": 0.1733, + "step": 17394 + }, + { + "epoch": 4.62879191059074, + "grad_norm": 0.3896614909172058, + "learning_rate": 2.7663601163092964e-09, + "loss": 0.1699, + "step": 17395 + }, + { + "epoch": 4.6290580095795635, + "grad_norm": 0.2961224317550659, + "learning_rate": 2.7624166112803603e-09, + "loss": 0.1782, + "step": 17396 + }, + { + "epoch": 4.629324108568388, + "grad_norm": 0.3184262216091156, + "learning_rate": 2.75847587966489e-09, + "loss": 0.1724, + "step": 17397 + }, + { + "epoch": 4.629590207557211, + "grad_norm": 0.34315070509910583, + "learning_rate": 2.754537921575295e-09, + "loss": 0.1812, + "step": 17398 + }, + { + "epoch": 4.629856306546035, + "grad_norm": 0.2960553765296936, + "learning_rate": 2.750602737123886e-09, + "loss": 0.1738, + "step": 17399 + }, + { + "epoch": 4.630122405534859, + "grad_norm": 0.2855161428451538, + "learning_rate": 2.7466703264229063e-09, + "loss": 0.1886, + "step": 17400 + }, + { + "epoch": 4.630388504523683, + "grad_norm": 0.28315678238868713, + "learning_rate": 2.7427406895845216e-09, + "loss": 0.159, + "step": 17401 + }, + { + "epoch": 4.6306546035125065, + "grad_norm": 0.3714338541030884, + "learning_rate": 2.7388138267207983e-09, + "loss": 0.1685, + "step": 17402 + }, + { + "epoch": 4.630920702501331, + "grad_norm": 0.30885186791419983, + "learning_rate": 2.7348897379437464e-09, + "loss": 0.2076, + "step": 17403 + }, + { + "epoch": 4.631186801490155, + "grad_norm": 0.2788945734500885, + "learning_rate": 2.7309684233652984e-09, + "loss": 0.1955, + "step": 17404 + }, + { + "epoch": 4.631452900478978, + "grad_norm": 0.43105489015579224, + "learning_rate": 2.7270498830972987e-09, + "loss": 0.1604, + "step": 17405 + }, + { + "epoch": 4.631718999467802, + "grad_norm": 0.29602643847465515, + "learning_rate": 2.723134117251502e-09, + "loss": 0.1913, + "step": 17406 + }, + { + "epoch": 4.631985098456626, + "grad_norm": 0.26792243123054504, + "learning_rate": 2.719221125939597e-09, + "loss": 0.176, + "step": 17407 + }, + { + "epoch": 4.632251197445449, + "grad_norm": 0.5282093286514282, + "learning_rate": 2.7153109092731828e-09, + "loss": 0.1866, + "step": 17408 + }, + { + "epoch": 4.632517296434274, + "grad_norm": 0.4588828384876251, + "learning_rate": 2.711403467363793e-09, + "loss": 0.1972, + "step": 17409 + }, + { + "epoch": 4.632783395423098, + "grad_norm": 0.27917802333831787, + "learning_rate": 2.7074988003228825e-09, + "loss": 0.189, + "step": 17410 + }, + { + "epoch": 4.633049494411921, + "grad_norm": 0.2717069983482361, + "learning_rate": 2.7035969082618182e-09, + "loss": 0.1632, + "step": 17411 + }, + { + "epoch": 4.633315593400745, + "grad_norm": 0.2827845513820648, + "learning_rate": 2.6996977912918663e-09, + "loss": 0.1862, + "step": 17412 + }, + { + "epoch": 4.633581692389569, + "grad_norm": 0.31914299726486206, + "learning_rate": 2.695801449524282e-09, + "loss": 0.1735, + "step": 17413 + }, + { + "epoch": 4.633847791378392, + "grad_norm": 0.5009400844573975, + "learning_rate": 2.6919078830701546e-09, + "loss": 0.1817, + "step": 17414 + }, + { + "epoch": 4.6341138903672165, + "grad_norm": 0.29157280921936035, + "learning_rate": 2.688017092040562e-09, + "loss": 0.1855, + "step": 17415 + }, + { + "epoch": 4.634379989356041, + "grad_norm": 0.2791864275932312, + "learning_rate": 2.684129076546471e-09, + "loss": 0.1721, + "step": 17416 + }, + { + "epoch": 4.634646088344864, + "grad_norm": 0.2462053745985031, + "learning_rate": 2.680243836698759e-09, + "loss": 0.1551, + "step": 17417 + }, + { + "epoch": 4.634912187333688, + "grad_norm": 0.31724318861961365, + "learning_rate": 2.6763613726082603e-09, + "loss": 0.1793, + "step": 17418 + }, + { + "epoch": 4.635178286322512, + "grad_norm": 0.2821742594242096, + "learning_rate": 2.672481684385719e-09, + "loss": 0.1848, + "step": 17419 + }, + { + "epoch": 4.635444385311336, + "grad_norm": 0.28312721848487854, + "learning_rate": 2.668604772141769e-09, + "loss": 0.1981, + "step": 17420 + }, + { + "epoch": 4.6357104843001595, + "grad_norm": 0.312465637922287, + "learning_rate": 2.6647306359870004e-09, + "loss": 0.1849, + "step": 17421 + }, + { + "epoch": 4.635976583288984, + "grad_norm": 0.4320337176322937, + "learning_rate": 2.660859276031913e-09, + "loss": 0.1748, + "step": 17422 + }, + { + "epoch": 4.636242682277807, + "grad_norm": 0.35913410782814026, + "learning_rate": 2.6569906923869177e-09, + "loss": 0.1915, + "step": 17423 + }, + { + "epoch": 4.636508781266631, + "grad_norm": 0.2797103226184845, + "learning_rate": 2.6531248851623613e-09, + "loss": 0.1536, + "step": 17424 + }, + { + "epoch": 4.636774880255455, + "grad_norm": 0.2955096364021301, + "learning_rate": 2.6492618544684873e-09, + "loss": 0.1932, + "step": 17425 + }, + { + "epoch": 4.637040979244279, + "grad_norm": 0.28163373470306396, + "learning_rate": 2.645401600415509e-09, + "loss": 0.1751, + "step": 17426 + }, + { + "epoch": 4.6373070782331025, + "grad_norm": 0.2595124840736389, + "learning_rate": 2.641544123113504e-09, + "loss": 0.168, + "step": 17427 + }, + { + "epoch": 4.637573177221927, + "grad_norm": 0.5858185291290283, + "learning_rate": 2.6376894226725065e-09, + "loss": 0.1903, + "step": 17428 + }, + { + "epoch": 4.637839276210751, + "grad_norm": 0.28757357597351074, + "learning_rate": 2.6338374992024516e-09, + "loss": 0.1752, + "step": 17429 + }, + { + "epoch": 4.638105375199574, + "grad_norm": 0.38442516326904297, + "learning_rate": 2.629988352813217e-09, + "loss": 0.19, + "step": 17430 + }, + { + "epoch": 4.638371474188398, + "grad_norm": 0.3225254416465759, + "learning_rate": 2.6261419836145716e-09, + "loss": 0.1727, + "step": 17431 + }, + { + "epoch": 4.638637573177222, + "grad_norm": 0.39911088347435, + "learning_rate": 2.622298391716227e-09, + "loss": 0.1878, + "step": 17432 + }, + { + "epoch": 4.6389036721660455, + "grad_norm": 0.27933380007743835, + "learning_rate": 2.618457577227817e-09, + "loss": 0.1769, + "step": 17433 + }, + { + "epoch": 4.63916977115487, + "grad_norm": 0.3706730604171753, + "learning_rate": 2.6146195402588887e-09, + "loss": 0.1738, + "step": 17434 + }, + { + "epoch": 4.639435870143694, + "grad_norm": 0.267556369304657, + "learning_rate": 2.6107842809189096e-09, + "loss": 0.1796, + "step": 17435 + }, + { + "epoch": 4.639701969132517, + "grad_norm": 0.36676064133644104, + "learning_rate": 2.6069517993172697e-09, + "loss": 0.1775, + "step": 17436 + }, + { + "epoch": 4.639968068121341, + "grad_norm": 0.283889502286911, + "learning_rate": 2.6031220955632595e-09, + "loss": 0.1773, + "step": 17437 + }, + { + "epoch": 4.640234167110165, + "grad_norm": 0.3380991816520691, + "learning_rate": 2.599295169766136e-09, + "loss": 0.1789, + "step": 17438 + }, + { + "epoch": 4.6405002660989885, + "grad_norm": 0.29647356271743774, + "learning_rate": 2.595471022035045e-09, + "loss": 0.1917, + "step": 17439 + }, + { + "epoch": 4.640766365087813, + "grad_norm": 0.3962707817554474, + "learning_rate": 2.5916496524790443e-09, + "loss": 0.1821, + "step": 17440 + }, + { + "epoch": 4.641032464076637, + "grad_norm": 0.310226708650589, + "learning_rate": 2.5878310612071575e-09, + "loss": 0.1946, + "step": 17441 + }, + { + "epoch": 4.64129856306546, + "grad_norm": 0.4055209457874298, + "learning_rate": 2.584015248328275e-09, + "loss": 0.1669, + "step": 17442 + }, + { + "epoch": 4.641564662054284, + "grad_norm": 0.29902559518814087, + "learning_rate": 2.5802022139512325e-09, + "loss": 0.1747, + "step": 17443 + }, + { + "epoch": 4.641830761043108, + "grad_norm": 0.2933635711669922, + "learning_rate": 2.576391958184787e-09, + "loss": 0.1723, + "step": 17444 + }, + { + "epoch": 4.642096860031932, + "grad_norm": 0.529884934425354, + "learning_rate": 2.5725844811376185e-09, + "loss": 0.1817, + "step": 17445 + }, + { + "epoch": 4.642362959020756, + "grad_norm": 0.4984031915664673, + "learning_rate": 2.5687797829183178e-09, + "loss": 0.1976, + "step": 17446 + }, + { + "epoch": 4.64262905800958, + "grad_norm": 0.2811489403247833, + "learning_rate": 2.5649778636354203e-09, + "loss": 0.1882, + "step": 17447 + }, + { + "epoch": 4.642895156998403, + "grad_norm": 0.27574196457862854, + "learning_rate": 2.5611787233973393e-09, + "loss": 0.181, + "step": 17448 + }, + { + "epoch": 4.643161255987227, + "grad_norm": 0.31359773874282837, + "learning_rate": 2.5573823623124656e-09, + "loss": 0.1781, + "step": 17449 + }, + { + "epoch": 4.643427354976051, + "grad_norm": 0.28584206104278564, + "learning_rate": 2.5535887804890354e-09, + "loss": 0.177, + "step": 17450 + }, + { + "epoch": 4.643693453964875, + "grad_norm": 0.28267690539360046, + "learning_rate": 2.5497979780352952e-09, + "loss": 0.1764, + "step": 17451 + }, + { + "epoch": 4.6439595529536986, + "grad_norm": 0.3181234896183014, + "learning_rate": 2.546009955059325e-09, + "loss": 0.1693, + "step": 17452 + }, + { + "epoch": 4.644225651942523, + "grad_norm": 0.34406596422195435, + "learning_rate": 2.5422247116691943e-09, + "loss": 0.1824, + "step": 17453 + }, + { + "epoch": 4.644491750931347, + "grad_norm": 0.34558284282684326, + "learning_rate": 2.538442247972861e-09, + "loss": 0.1888, + "step": 17454 + }, + { + "epoch": 4.64475784992017, + "grad_norm": 0.29540860652923584, + "learning_rate": 2.534662564078205e-09, + "loss": 0.1683, + "step": 17455 + }, + { + "epoch": 4.645023948908994, + "grad_norm": 0.26047784090042114, + "learning_rate": 2.5308856600930295e-09, + "loss": 0.1694, + "step": 17456 + }, + { + "epoch": 4.645290047897818, + "grad_norm": 0.4799012541770935, + "learning_rate": 2.5271115361250706e-09, + "loss": 0.1613, + "step": 17457 + }, + { + "epoch": 4.6455561468866415, + "grad_norm": 0.2692324221134186, + "learning_rate": 2.523340192281953e-09, + "loss": 0.1482, + "step": 17458 + }, + { + "epoch": 4.645822245875466, + "grad_norm": 0.3269442021846771, + "learning_rate": 2.5195716286712574e-09, + "loss": 0.1835, + "step": 17459 + }, + { + "epoch": 4.64608834486429, + "grad_norm": 0.28581032156944275, + "learning_rate": 2.515805845400476e-09, + "loss": 0.1853, + "step": 17460 + }, + { + "epoch": 4.646354443853113, + "grad_norm": 0.32371985912323, + "learning_rate": 2.5120428425770003e-09, + "loss": 0.1846, + "step": 17461 + }, + { + "epoch": 4.646620542841937, + "grad_norm": 0.30991095304489136, + "learning_rate": 2.5082826203081773e-09, + "loss": 0.1825, + "step": 17462 + }, + { + "epoch": 4.646886641830761, + "grad_norm": 0.3377618193626404, + "learning_rate": 2.5045251787012332e-09, + "loss": 0.1772, + "step": 17463 + }, + { + "epoch": 4.6471527408195845, + "grad_norm": 0.2823123335838318, + "learning_rate": 2.500770517863371e-09, + "loss": 0.1687, + "step": 17464 + }, + { + "epoch": 4.647418839808409, + "grad_norm": 0.34905943274497986, + "learning_rate": 2.497018637901649e-09, + "loss": 0.1714, + "step": 17465 + }, + { + "epoch": 4.647684938797233, + "grad_norm": 0.3314775228500366, + "learning_rate": 2.4932695389230928e-09, + "loss": 0.1815, + "step": 17466 + }, + { + "epoch": 4.647951037786056, + "grad_norm": 0.3070540726184845, + "learning_rate": 2.4895232210346393e-09, + "loss": 0.1649, + "step": 17467 + }, + { + "epoch": 4.64821713677488, + "grad_norm": 0.33420422673225403, + "learning_rate": 2.4857796843431146e-09, + "loss": 0.1761, + "step": 17468 + }, + { + "epoch": 4.648483235763704, + "grad_norm": 0.2779368460178375, + "learning_rate": 2.482038928955321e-09, + "loss": 0.1711, + "step": 17469 + }, + { + "epoch": 4.648749334752528, + "grad_norm": 0.3106711506843567, + "learning_rate": 2.478300954977952e-09, + "loss": 0.185, + "step": 17470 + }, + { + "epoch": 4.649015433741352, + "grad_norm": 0.36953479051589966, + "learning_rate": 2.474565762517611e-09, + "loss": 0.1707, + "step": 17471 + }, + { + "epoch": 4.649281532730176, + "grad_norm": 0.3024110794067383, + "learning_rate": 2.4708333516808344e-09, + "loss": 0.1883, + "step": 17472 + }, + { + "epoch": 4.649547631718999, + "grad_norm": 0.34549200534820557, + "learning_rate": 2.467103722574082e-09, + "loss": 0.1805, + "step": 17473 + }, + { + "epoch": 4.649813730707823, + "grad_norm": 0.40836620330810547, + "learning_rate": 2.463376875303713e-09, + "loss": 0.1909, + "step": 17474 + }, + { + "epoch": 4.650079829696647, + "grad_norm": 0.2667628824710846, + "learning_rate": 2.4596528099760426e-09, + "loss": 0.1562, + "step": 17475 + }, + { + "epoch": 4.650345928685471, + "grad_norm": 0.40583834052085876, + "learning_rate": 2.4559315266972967e-09, + "loss": 0.1938, + "step": 17476 + }, + { + "epoch": 4.650612027674295, + "grad_norm": 0.2642618417739868, + "learning_rate": 2.4522130255736017e-09, + "loss": 0.1591, + "step": 17477 + }, + { + "epoch": 4.650878126663119, + "grad_norm": 0.30098533630371094, + "learning_rate": 2.448497306711017e-09, + "loss": 0.1684, + "step": 17478 + }, + { + "epoch": 4.651144225651943, + "grad_norm": 0.3076719045639038, + "learning_rate": 2.4447843702155245e-09, + "loss": 0.1971, + "step": 17479 + }, + { + "epoch": 4.651410324640766, + "grad_norm": 0.2629909813404083, + "learning_rate": 2.4410742161930178e-09, + "loss": 0.1583, + "step": 17480 + }, + { + "epoch": 4.65167642362959, + "grad_norm": 0.38751664757728577, + "learning_rate": 2.4373668447493222e-09, + "loss": 0.1777, + "step": 17481 + }, + { + "epoch": 4.651942522618414, + "grad_norm": 0.26796096563339233, + "learning_rate": 2.4336622559901764e-09, + "loss": 0.1675, + "step": 17482 + }, + { + "epoch": 4.652208621607238, + "grad_norm": 0.3240567147731781, + "learning_rate": 2.429960450021262e-09, + "loss": 0.1878, + "step": 17483 + }, + { + "epoch": 4.652474720596062, + "grad_norm": 0.2825808525085449, + "learning_rate": 2.4262614269481397e-09, + "loss": 0.1883, + "step": 17484 + }, + { + "epoch": 4.652740819584886, + "grad_norm": 0.25872519612312317, + "learning_rate": 2.422565186876324e-09, + "loss": 0.1574, + "step": 17485 + }, + { + "epoch": 4.653006918573709, + "grad_norm": 0.42140135169029236, + "learning_rate": 2.4188717299112315e-09, + "loss": 0.206, + "step": 17486 + }, + { + "epoch": 4.653273017562533, + "grad_norm": 0.31902164220809937, + "learning_rate": 2.4151810561582218e-09, + "loss": 0.1944, + "step": 17487 + }, + { + "epoch": 4.653539116551357, + "grad_norm": 0.33469581604003906, + "learning_rate": 2.4114931657225334e-09, + "loss": 0.1783, + "step": 17488 + }, + { + "epoch": 4.653805215540181, + "grad_norm": 0.41600608825683594, + "learning_rate": 2.4078080587093818e-09, + "loss": 0.1864, + "step": 17489 + }, + { + "epoch": 4.654071314529005, + "grad_norm": 0.35134363174438477, + "learning_rate": 2.4041257352238497e-09, + "loss": 0.1786, + "step": 17490 + }, + { + "epoch": 4.654337413517829, + "grad_norm": 0.3191401958465576, + "learning_rate": 2.4004461953709976e-09, + "loss": 0.1787, + "step": 17491 + }, + { + "epoch": 4.654603512506652, + "grad_norm": 0.26710832118988037, + "learning_rate": 2.3967694392557304e-09, + "loss": 0.1469, + "step": 17492 + }, + { + "epoch": 4.654869611495476, + "grad_norm": 0.2889067232608795, + "learning_rate": 2.393095466982953e-09, + "loss": 0.1685, + "step": 17493 + }, + { + "epoch": 4.6551357104843, + "grad_norm": 0.3250776529312134, + "learning_rate": 2.3894242786574257e-09, + "loss": 0.1758, + "step": 17494 + }, + { + "epoch": 4.655401809473124, + "grad_norm": 0.32655858993530273, + "learning_rate": 2.3857558743838768e-09, + "loss": 0.1794, + "step": 17495 + }, + { + "epoch": 4.655667908461948, + "grad_norm": 0.36373570561408997, + "learning_rate": 2.382090254266933e-09, + "loss": 0.1709, + "step": 17496 + }, + { + "epoch": 4.655934007450772, + "grad_norm": 0.3539358675479889, + "learning_rate": 2.378427418411144e-09, + "loss": 0.1909, + "step": 17497 + }, + { + "epoch": 4.656200106439595, + "grad_norm": 0.26490136981010437, + "learning_rate": 2.3747673669209824e-09, + "loss": 0.1685, + "step": 17498 + }, + { + "epoch": 4.656466205428419, + "grad_norm": 0.31627804040908813, + "learning_rate": 2.3711100999008414e-09, + "loss": 0.1977, + "step": 17499 + }, + { + "epoch": 4.656732304417243, + "grad_norm": 0.3563539683818817, + "learning_rate": 2.3674556174550386e-09, + "loss": 0.1718, + "step": 17500 + }, + { + "epoch": 4.656998403406067, + "grad_norm": 0.31694096326828003, + "learning_rate": 2.363803919687779e-09, + "loss": 0.1745, + "step": 17501 + }, + { + "epoch": 4.657264502394891, + "grad_norm": 0.32174187898635864, + "learning_rate": 2.3601550067032573e-09, + "loss": 0.194, + "step": 17502 + }, + { + "epoch": 4.657530601383715, + "grad_norm": 0.27108797430992126, + "learning_rate": 2.3565088786055233e-09, + "loss": 0.1933, + "step": 17503 + }, + { + "epoch": 4.657796700372539, + "grad_norm": 0.2695676386356354, + "learning_rate": 2.3528655354985716e-09, + "loss": 0.1775, + "step": 17504 + }, + { + "epoch": 4.658062799361362, + "grad_norm": 0.2667158246040344, + "learning_rate": 2.3492249774863194e-09, + "loss": 0.1778, + "step": 17505 + }, + { + "epoch": 4.658328898350186, + "grad_norm": 0.3605421185493469, + "learning_rate": 2.3455872046726278e-09, + "loss": 0.188, + "step": 17506 + }, + { + "epoch": 4.65859499733901, + "grad_norm": 0.26007014513015747, + "learning_rate": 2.341952217161214e-09, + "loss": 0.1747, + "step": 17507 + }, + { + "epoch": 4.658861096327834, + "grad_norm": 0.367636501789093, + "learning_rate": 2.3383200150557834e-09, + "loss": 0.1901, + "step": 17508 + }, + { + "epoch": 4.659127195316658, + "grad_norm": 0.2863537669181824, + "learning_rate": 2.334690598459921e-09, + "loss": 0.1726, + "step": 17509 + }, + { + "epoch": 4.659393294305482, + "grad_norm": 0.2727130353450775, + "learning_rate": 2.3310639674771427e-09, + "loss": 0.1675, + "step": 17510 + }, + { + "epoch": 4.659659393294305, + "grad_norm": 0.3056371808052063, + "learning_rate": 2.3274401222108884e-09, + "loss": 0.1841, + "step": 17511 + }, + { + "epoch": 4.659925492283129, + "grad_norm": 0.30135515332221985, + "learning_rate": 2.3238190627645314e-09, + "loss": 0.1713, + "step": 17512 + }, + { + "epoch": 4.660191591271953, + "grad_norm": 0.29596370458602905, + "learning_rate": 2.3202007892413446e-09, + "loss": 0.1775, + "step": 17513 + }, + { + "epoch": 4.6604576902607775, + "grad_norm": 0.37793299555778503, + "learning_rate": 2.3165853017445226e-09, + "loss": 0.1733, + "step": 17514 + }, + { + "epoch": 4.660723789249601, + "grad_norm": 0.29609841108322144, + "learning_rate": 2.3129726003771945e-09, + "loss": 0.1773, + "step": 17515 + }, + { + "epoch": 4.660989888238425, + "grad_norm": 0.3355032205581665, + "learning_rate": 2.3093626852423886e-09, + "loss": 0.1684, + "step": 17516 + }, + { + "epoch": 4.661255987227248, + "grad_norm": 0.2999800443649292, + "learning_rate": 2.3057555564430896e-09, + "loss": 0.1656, + "step": 17517 + }, + { + "epoch": 4.661522086216072, + "grad_norm": 0.2836611270904541, + "learning_rate": 2.3021512140821485e-09, + "loss": 0.1803, + "step": 17518 + }, + { + "epoch": 4.661788185204896, + "grad_norm": 0.3245740234851837, + "learning_rate": 2.298549658262394e-09, + "loss": 0.1829, + "step": 17519 + }, + { + "epoch": 4.6620542841937205, + "grad_norm": 0.25741246342658997, + "learning_rate": 2.2949508890865443e-09, + "loss": 0.1731, + "step": 17520 + }, + { + "epoch": 4.662320383182544, + "grad_norm": 0.2900558114051819, + "learning_rate": 2.291354906657239e-09, + "loss": 0.1605, + "step": 17521 + }, + { + "epoch": 4.662586482171368, + "grad_norm": 0.32108214497566223, + "learning_rate": 2.2877617110770408e-09, + "loss": 0.1863, + "step": 17522 + }, + { + "epoch": 4.662852581160192, + "grad_norm": 0.28415319323539734, + "learning_rate": 2.2841713024484567e-09, + "loss": 0.1772, + "step": 17523 + }, + { + "epoch": 4.663118680149015, + "grad_norm": 0.2569797933101654, + "learning_rate": 2.28058368087386e-09, + "loss": 0.1645, + "step": 17524 + }, + { + "epoch": 4.663384779137839, + "grad_norm": 0.3322335183620453, + "learning_rate": 2.276998846455591e-09, + "loss": 0.1781, + "step": 17525 + }, + { + "epoch": 4.6636508781266635, + "grad_norm": 0.260166198015213, + "learning_rate": 2.273416799295902e-09, + "loss": 0.155, + "step": 17526 + }, + { + "epoch": 4.663916977115487, + "grad_norm": 0.2868058681488037, + "learning_rate": 2.269837539496955e-09, + "loss": 0.1682, + "step": 17527 + }, + { + "epoch": 4.664183076104311, + "grad_norm": 0.3056665062904358, + "learning_rate": 2.266261067160835e-09, + "loss": 0.2041, + "step": 17528 + }, + { + "epoch": 4.664449175093135, + "grad_norm": 0.29698964953422546, + "learning_rate": 2.262687382389572e-09, + "loss": 0.2008, + "step": 17529 + }, + { + "epoch": 4.664715274081958, + "grad_norm": 0.30339646339416504, + "learning_rate": 2.259116485285051e-09, + "loss": 0.1915, + "step": 17530 + }, + { + "epoch": 4.664981373070782, + "grad_norm": 0.27078673243522644, + "learning_rate": 2.2555483759491567e-09, + "loss": 0.1723, + "step": 17531 + }, + { + "epoch": 4.665247472059606, + "grad_norm": 0.2902340590953827, + "learning_rate": 2.251983054483664e-09, + "loss": 0.1907, + "step": 17532 + }, + { + "epoch": 4.66551357104843, + "grad_norm": 0.3014819920063019, + "learning_rate": 2.2484205209902128e-09, + "loss": 0.1882, + "step": 17533 + }, + { + "epoch": 4.665779670037254, + "grad_norm": 0.2892927825450897, + "learning_rate": 2.2448607755704786e-09, + "loss": 0.1663, + "step": 17534 + }, + { + "epoch": 4.666045769026078, + "grad_norm": 3.98738431930542, + "learning_rate": 2.241303818325957e-09, + "loss": 0.1934, + "step": 17535 + }, + { + "epoch": 4.666311868014901, + "grad_norm": 0.2739620506763458, + "learning_rate": 2.2377496493581115e-09, + "loss": 0.1769, + "step": 17536 + }, + { + "epoch": 4.666577967003725, + "grad_norm": 0.3880872130393982, + "learning_rate": 2.234198268768295e-09, + "loss": 0.1801, + "step": 17537 + }, + { + "epoch": 4.666844065992549, + "grad_norm": 0.2936553955078125, + "learning_rate": 2.2306496766578253e-09, + "loss": 0.1818, + "step": 17538 + }, + { + "epoch": 4.6671101649813735, + "grad_norm": 0.27792441844940186, + "learning_rate": 2.227103873127889e-09, + "loss": 0.1704, + "step": 17539 + }, + { + "epoch": 4.667376263970197, + "grad_norm": 0.38362541794776917, + "learning_rate": 2.2235608582796494e-09, + "loss": 0.188, + "step": 17540 + }, + { + "epoch": 4.667642362959021, + "grad_norm": 0.279319167137146, + "learning_rate": 2.2200206322141256e-09, + "loss": 0.1493, + "step": 17541 + }, + { + "epoch": 4.667908461947844, + "grad_norm": 0.3306357264518738, + "learning_rate": 2.216483195032337e-09, + "loss": 0.179, + "step": 17542 + }, + { + "epoch": 4.668174560936668, + "grad_norm": 0.29329603910446167, + "learning_rate": 2.212948546835136e-09, + "loss": 0.176, + "step": 17543 + }, + { + "epoch": 4.668440659925492, + "grad_norm": 0.3418359160423279, + "learning_rate": 2.2094166877233644e-09, + "loss": 0.1881, + "step": 17544 + }, + { + "epoch": 4.6687067589143165, + "grad_norm": 0.3732384145259857, + "learning_rate": 2.2058876177977415e-09, + "loss": 0.1803, + "step": 17545 + }, + { + "epoch": 4.66897285790314, + "grad_norm": 0.2762294411659241, + "learning_rate": 2.2023613371589423e-09, + "loss": 0.1698, + "step": 17546 + }, + { + "epoch": 4.669238956891964, + "grad_norm": 0.3623819947242737, + "learning_rate": 2.1988378459075197e-09, + "loss": 0.1763, + "step": 17547 + }, + { + "epoch": 4.669505055880788, + "grad_norm": 0.34258803725242615, + "learning_rate": 2.195317144143993e-09, + "loss": 0.1778, + "step": 17548 + }, + { + "epoch": 4.669771154869611, + "grad_norm": 0.25771570205688477, + "learning_rate": 2.1917992319687605e-09, + "loss": 0.1572, + "step": 17549 + }, + { + "epoch": 4.670037253858435, + "grad_norm": 0.2728486955165863, + "learning_rate": 2.1882841094821636e-09, + "loss": 0.1587, + "step": 17550 + }, + { + "epoch": 4.6703033528472595, + "grad_norm": 0.28160154819488525, + "learning_rate": 2.1847717767844886e-09, + "loss": 0.1834, + "step": 17551 + }, + { + "epoch": 4.670569451836083, + "grad_norm": 0.27978911995887756, + "learning_rate": 2.181262233975878e-09, + "loss": 0.1826, + "step": 17552 + }, + { + "epoch": 4.670835550824907, + "grad_norm": 0.32432088255882263, + "learning_rate": 2.1777554811564513e-09, + "loss": 0.1844, + "step": 17553 + }, + { + "epoch": 4.671101649813731, + "grad_norm": 0.31465768814086914, + "learning_rate": 2.174251518426218e-09, + "loss": 0.1783, + "step": 17554 + }, + { + "epoch": 4.671367748802554, + "grad_norm": 0.5477420687675476, + "learning_rate": 2.1707503458851196e-09, + "loss": 0.1708, + "step": 17555 + }, + { + "epoch": 4.671633847791378, + "grad_norm": 0.3826209008693695, + "learning_rate": 2.167251963633021e-09, + "loss": 0.1952, + "step": 17556 + }, + { + "epoch": 4.6718999467802025, + "grad_norm": 0.3555263578891754, + "learning_rate": 2.1637563717697204e-09, + "loss": 0.1807, + "step": 17557 + }, + { + "epoch": 4.672166045769026, + "grad_norm": 0.3358272612094879, + "learning_rate": 2.1602635703948825e-09, + "loss": 0.1691, + "step": 17558 + }, + { + "epoch": 4.67243214475785, + "grad_norm": 0.2733518183231354, + "learning_rate": 2.156773559608149e-09, + "loss": 0.1775, + "step": 17559 + }, + { + "epoch": 4.672698243746674, + "grad_norm": 0.3025569021701813, + "learning_rate": 2.153286339509064e-09, + "loss": 0.1792, + "step": 17560 + }, + { + "epoch": 4.672964342735497, + "grad_norm": 0.4358270466327667, + "learning_rate": 2.1498019101970797e-09, + "loss": 0.2083, + "step": 17561 + }, + { + "epoch": 4.673230441724321, + "grad_norm": 0.30516400933265686, + "learning_rate": 2.1463202717715957e-09, + "loss": 0.1752, + "step": 17562 + }, + { + "epoch": 4.6734965407131455, + "grad_norm": 0.3167216181755066, + "learning_rate": 2.1428414243319094e-09, + "loss": 0.1906, + "step": 17563 + }, + { + "epoch": 4.67376263970197, + "grad_norm": 0.28937670588493347, + "learning_rate": 2.1393653679772304e-09, + "loss": 0.1699, + "step": 17564 + }, + { + "epoch": 4.674028738690793, + "grad_norm": 0.2634050250053406, + "learning_rate": 2.1358921028067243e-09, + "loss": 0.1754, + "step": 17565 + }, + { + "epoch": 4.674294837679617, + "grad_norm": 0.2518257796764374, + "learning_rate": 2.132421628919434e-09, + "loss": 0.1539, + "step": 17566 + }, + { + "epoch": 4.67456093666844, + "grad_norm": 0.27956318855285645, + "learning_rate": 2.1289539464143468e-09, + "loss": 0.1767, + "step": 17567 + }, + { + "epoch": 4.674827035657264, + "grad_norm": 0.36781615018844604, + "learning_rate": 2.125489055390395e-09, + "loss": 0.1853, + "step": 17568 + }, + { + "epoch": 4.6750931346460884, + "grad_norm": 0.27600643038749695, + "learning_rate": 2.1220269559463767e-09, + "loss": 0.1704, + "step": 17569 + }, + { + "epoch": 4.675359233634913, + "grad_norm": 0.3471939265727997, + "learning_rate": 2.1185676481810465e-09, + "loss": 0.2089, + "step": 17570 + }, + { + "epoch": 4.675625332623736, + "grad_norm": 0.2783059775829315, + "learning_rate": 2.1151111321930705e-09, + "loss": 0.1646, + "step": 17571 + }, + { + "epoch": 4.67589143161256, + "grad_norm": 0.3170014023780823, + "learning_rate": 2.111657408081047e-09, + "loss": 0.1715, + "step": 17572 + }, + { + "epoch": 4.676157530601384, + "grad_norm": 0.2939298748970032, + "learning_rate": 2.108206475943464e-09, + "loss": 0.1715, + "step": 17573 + }, + { + "epoch": 4.676423629590207, + "grad_norm": 0.30406367778778076, + "learning_rate": 2.1047583358787646e-09, + "loss": 0.1786, + "step": 17574 + }, + { + "epoch": 4.676689728579031, + "grad_norm": 0.27266615629196167, + "learning_rate": 2.101312987985282e-09, + "loss": 0.1675, + "step": 17575 + }, + { + "epoch": 4.6769558275678556, + "grad_norm": 0.26478177309036255, + "learning_rate": 2.097870432361293e-09, + "loss": 0.1556, + "step": 17576 + }, + { + "epoch": 4.677221926556679, + "grad_norm": 0.2757377624511719, + "learning_rate": 2.094430669104996e-09, + "loss": 0.1817, + "step": 17577 + }, + { + "epoch": 4.677488025545503, + "grad_norm": 0.2764863669872284, + "learning_rate": 2.0909936983144805e-09, + "loss": 0.1757, + "step": 17578 + }, + { + "epoch": 4.677754124534327, + "grad_norm": 0.4093594551086426, + "learning_rate": 2.087559520087789e-09, + "loss": 0.1752, + "step": 17579 + }, + { + "epoch": 4.67802022352315, + "grad_norm": 0.2952505052089691, + "learning_rate": 2.0841281345228777e-09, + "loss": 0.1703, + "step": 17580 + }, + { + "epoch": 4.678286322511974, + "grad_norm": 0.38139617443084717, + "learning_rate": 2.0806995417175898e-09, + "loss": 0.1876, + "step": 17581 + }, + { + "epoch": 4.6785524215007985, + "grad_norm": 0.2890412509441376, + "learning_rate": 2.077273741769747e-09, + "loss": 0.2088, + "step": 17582 + }, + { + "epoch": 4.678818520489622, + "grad_norm": 0.26220566034317017, + "learning_rate": 2.073850734777027e-09, + "loss": 0.1695, + "step": 17583 + }, + { + "epoch": 4.679084619478446, + "grad_norm": 0.3262954354286194, + "learning_rate": 2.0704305208370854e-09, + "loss": 0.1723, + "step": 17584 + }, + { + "epoch": 4.67935071846727, + "grad_norm": 0.46361783146858215, + "learning_rate": 2.0670131000474765e-09, + "loss": 0.171, + "step": 17585 + }, + { + "epoch": 4.679616817456093, + "grad_norm": 0.27912601828575134, + "learning_rate": 2.0635984725056564e-09, + "loss": 0.1738, + "step": 17586 + }, + { + "epoch": 4.679882916444917, + "grad_norm": 0.2872318625450134, + "learning_rate": 2.060186638309025e-09, + "loss": 0.1673, + "step": 17587 + }, + { + "epoch": 4.6801490154337415, + "grad_norm": 0.31146642565727234, + "learning_rate": 2.056777597554893e-09, + "loss": 0.1815, + "step": 17588 + }, + { + "epoch": 4.680415114422566, + "grad_norm": 0.277842253446579, + "learning_rate": 2.0533713503405047e-09, + "loss": 0.1776, + "step": 17589 + }, + { + "epoch": 4.680681213411389, + "grad_norm": 0.2875843048095703, + "learning_rate": 2.049967896762994e-09, + "loss": 0.1775, + "step": 17590 + }, + { + "epoch": 4.680947312400213, + "grad_norm": 0.3100593686103821, + "learning_rate": 2.0465672369194388e-09, + "loss": 0.1968, + "step": 17591 + }, + { + "epoch": 4.681213411389036, + "grad_norm": 0.2728612720966339, + "learning_rate": 2.043169370906839e-09, + "loss": 0.1673, + "step": 17592 + }, + { + "epoch": 4.68147951037786, + "grad_norm": 0.2733800709247589, + "learning_rate": 2.0397742988221167e-09, + "loss": 0.1677, + "step": 17593 + }, + { + "epoch": 4.6817456093666845, + "grad_norm": 0.27531445026397705, + "learning_rate": 2.0363820207620842e-09, + "loss": 0.1713, + "step": 17594 + }, + { + "epoch": 4.682011708355509, + "grad_norm": 0.2961927354335785, + "learning_rate": 2.032992536823519e-09, + "loss": 0.1653, + "step": 17595 + }, + { + "epoch": 4.682277807344332, + "grad_norm": 0.4979577660560608, + "learning_rate": 2.0296058471030773e-09, + "loss": 0.1864, + "step": 17596 + }, + { + "epoch": 4.682543906333156, + "grad_norm": 0.27448561787605286, + "learning_rate": 2.02622195169736e-09, + "loss": 0.1583, + "step": 17597 + }, + { + "epoch": 4.68281000532198, + "grad_norm": 0.3633078336715698, + "learning_rate": 2.0228408507028784e-09, + "loss": 0.1909, + "step": 17598 + }, + { + "epoch": 4.683076104310803, + "grad_norm": 0.35122567415237427, + "learning_rate": 2.0194625442160884e-09, + "loss": 0.17, + "step": 17599 + }, + { + "epoch": 4.6833422032996275, + "grad_norm": 0.30745160579681396, + "learning_rate": 2.0160870323333247e-09, + "loss": 0.1946, + "step": 17600 + }, + { + "epoch": 4.683608302288452, + "grad_norm": 0.3212744891643524, + "learning_rate": 2.0127143151508873e-09, + "loss": 0.1917, + "step": 17601 + }, + { + "epoch": 4.683874401277275, + "grad_norm": 0.2864493727684021, + "learning_rate": 2.0093443927649446e-09, + "loss": 0.1552, + "step": 17602 + }, + { + "epoch": 4.684140500266099, + "grad_norm": 0.2795279920101166, + "learning_rate": 2.00597726527163e-09, + "loss": 0.1622, + "step": 17603 + }, + { + "epoch": 4.684406599254923, + "grad_norm": 0.3316841125488281, + "learning_rate": 2.002612932766967e-09, + "loss": 0.1833, + "step": 17604 + }, + { + "epoch": 4.684672698243746, + "grad_norm": 0.2860625088214874, + "learning_rate": 1.9992513953469347e-09, + "loss": 0.1771, + "step": 17605 + }, + { + "epoch": 4.6849387972325705, + "grad_norm": 0.42993414402008057, + "learning_rate": 1.9958926531073894e-09, + "loss": 0.1814, + "step": 17606 + }, + { + "epoch": 4.685204896221395, + "grad_norm": 0.3476696312427521, + "learning_rate": 1.9925367061441433e-09, + "loss": 0.1786, + "step": 17607 + }, + { + "epoch": 4.685470995210218, + "grad_norm": 0.28632915019989014, + "learning_rate": 1.9891835545529202e-09, + "loss": 0.1744, + "step": 17608 + }, + { + "epoch": 4.685737094199042, + "grad_norm": 0.3385772407054901, + "learning_rate": 1.985833198429343e-09, + "loss": 0.167, + "step": 17609 + }, + { + "epoch": 4.686003193187866, + "grad_norm": 0.315306693315506, + "learning_rate": 1.9824856378689804e-09, + "loss": 0.1857, + "step": 17610 + }, + { + "epoch": 4.686269292176689, + "grad_norm": 0.3179927170276642, + "learning_rate": 1.9791408729672997e-09, + "loss": 0.1765, + "step": 17611 + }, + { + "epoch": 4.686535391165513, + "grad_norm": 0.3082359731197357, + "learning_rate": 1.9757989038197143e-09, + "loss": 0.1824, + "step": 17612 + }, + { + "epoch": 4.686801490154338, + "grad_norm": 0.30607154965400696, + "learning_rate": 1.9724597305215364e-09, + "loss": 0.18, + "step": 17613 + }, + { + "epoch": 4.687067589143162, + "grad_norm": 0.31207916140556335, + "learning_rate": 1.969123353168023e-09, + "loss": 0.1909, + "step": 17614 + }, + { + "epoch": 4.687333688131985, + "grad_norm": 0.2803654968738556, + "learning_rate": 1.9657897718542983e-09, + "loss": 0.1783, + "step": 17615 + }, + { + "epoch": 4.687599787120809, + "grad_norm": 0.28431034088134766, + "learning_rate": 1.962458986675486e-09, + "loss": 0.1678, + "step": 17616 + }, + { + "epoch": 4.687865886109632, + "grad_norm": 0.34267115592956543, + "learning_rate": 1.959130997726555e-09, + "loss": 0.1784, + "step": 17617 + }, + { + "epoch": 4.688131985098456, + "grad_norm": 0.2819506525993347, + "learning_rate": 1.9558058051024396e-09, + "loss": 0.1803, + "step": 17618 + }, + { + "epoch": 4.6883980840872805, + "grad_norm": 0.44167521595954895, + "learning_rate": 1.9524834088979646e-09, + "loss": 0.1645, + "step": 17619 + }, + { + "epoch": 4.688664183076105, + "grad_norm": 0.2999723553657532, + "learning_rate": 1.9491638092079098e-09, + "loss": 0.1913, + "step": 17620 + }, + { + "epoch": 4.688930282064928, + "grad_norm": 0.37005189061164856, + "learning_rate": 1.945847006126955e-09, + "loss": 0.1866, + "step": 17621 + }, + { + "epoch": 4.689196381053752, + "grad_norm": 0.4148271679878235, + "learning_rate": 1.942532999749702e-09, + "loss": 0.1918, + "step": 17622 + }, + { + "epoch": 4.689462480042576, + "grad_norm": 0.2792099416255951, + "learning_rate": 1.9392217901706752e-09, + "loss": 0.1585, + "step": 17623 + }, + { + "epoch": 4.689728579031399, + "grad_norm": 0.2591613829135895, + "learning_rate": 1.9359133774842997e-09, + "loss": 0.1634, + "step": 17624 + }, + { + "epoch": 4.6899946780202235, + "grad_norm": 0.33153462409973145, + "learning_rate": 1.9326077617849656e-09, + "loss": 0.1805, + "step": 17625 + }, + { + "epoch": 4.690260777009048, + "grad_norm": 0.27648216485977173, + "learning_rate": 1.9293049431669206e-09, + "loss": 0.1785, + "step": 17626 + }, + { + "epoch": 4.690526875997871, + "grad_norm": 0.3610570728778839, + "learning_rate": 1.9260049217244e-09, + "loss": 0.1859, + "step": 17627 + }, + { + "epoch": 4.690792974986695, + "grad_norm": 0.2705322206020355, + "learning_rate": 1.9227076975515065e-09, + "loss": 0.1611, + "step": 17628 + }, + { + "epoch": 4.691059073975519, + "grad_norm": 0.2835841178894043, + "learning_rate": 1.9194132707423094e-09, + "loss": 0.1862, + "step": 17629 + }, + { + "epoch": 4.691325172964342, + "grad_norm": 0.2863401472568512, + "learning_rate": 1.916121641390744e-09, + "loss": 0.1845, + "step": 17630 + }, + { + "epoch": 4.6915912719531665, + "grad_norm": 0.5018600225448608, + "learning_rate": 1.912832809590703e-09, + "loss": 0.2156, + "step": 17631 + }, + { + "epoch": 4.691857370941991, + "grad_norm": 0.29091939330101013, + "learning_rate": 1.9095467754359996e-09, + "loss": 0.1792, + "step": 17632 + }, + { + "epoch": 4.692123469930815, + "grad_norm": 0.31018635630607605, + "learning_rate": 1.9062635390203473e-09, + "loss": 0.1935, + "step": 17633 + }, + { + "epoch": 4.692389568919638, + "grad_norm": 0.2795073390007019, + "learning_rate": 1.9029831004373942e-09, + "loss": 0.17, + "step": 17634 + }, + { + "epoch": 4.692655667908462, + "grad_norm": 0.31270408630371094, + "learning_rate": 1.8997054597807093e-09, + "loss": 0.1873, + "step": 17635 + }, + { + "epoch": 4.692921766897285, + "grad_norm": 0.2929442822933197, + "learning_rate": 1.896430617143763e-09, + "loss": 0.1914, + "step": 17636 + }, + { + "epoch": 4.6931878658861095, + "grad_norm": 0.2756897211074829, + "learning_rate": 1.8931585726199905e-09, + "loss": 0.1799, + "step": 17637 + }, + { + "epoch": 4.693453964874934, + "grad_norm": 0.26355987787246704, + "learning_rate": 1.8898893263026847e-09, + "loss": 0.1685, + "step": 17638 + }, + { + "epoch": 4.693720063863758, + "grad_norm": 0.2897646725177765, + "learning_rate": 1.886622878285116e-09, + "loss": 0.1745, + "step": 17639 + }, + { + "epoch": 4.693986162852581, + "grad_norm": 0.2946001887321472, + "learning_rate": 1.8833592286604195e-09, + "loss": 0.1683, + "step": 17640 + }, + { + "epoch": 4.694252261841405, + "grad_norm": 0.29031768441200256, + "learning_rate": 1.8800983775217106e-09, + "loss": 0.174, + "step": 17641 + }, + { + "epoch": 4.694518360830229, + "grad_norm": 0.27300208806991577, + "learning_rate": 1.876840324961981e-09, + "loss": 0.1692, + "step": 17642 + }, + { + "epoch": 4.6947844598190525, + "grad_norm": 0.3847001791000366, + "learning_rate": 1.8735850710741686e-09, + "loss": 0.1804, + "step": 17643 + }, + { + "epoch": 4.695050558807877, + "grad_norm": 0.9943684339523315, + "learning_rate": 1.870332615951109e-09, + "loss": 0.1631, + "step": 17644 + }, + { + "epoch": 4.695316657796701, + "grad_norm": 0.3380570411682129, + "learning_rate": 1.867082959685573e-09, + "loss": 0.1784, + "step": 17645 + }, + { + "epoch": 4.695582756785524, + "grad_norm": 0.2788968086242676, + "learning_rate": 1.863836102370253e-09, + "loss": 0.1703, + "step": 17646 + }, + { + "epoch": 4.695848855774348, + "grad_norm": 0.26000547409057617, + "learning_rate": 1.8605920440977418e-09, + "loss": 0.156, + "step": 17647 + }, + { + "epoch": 4.696114954763172, + "grad_norm": 0.3578112721443176, + "learning_rate": 1.8573507849605653e-09, + "loss": 0.1862, + "step": 17648 + }, + { + "epoch": 4.696381053751995, + "grad_norm": 0.33038005232810974, + "learning_rate": 1.8541123250511936e-09, + "loss": 0.1967, + "step": 17649 + }, + { + "epoch": 4.69664715274082, + "grad_norm": 0.28363874554634094, + "learning_rate": 1.8508766644619867e-09, + "loss": 0.1728, + "step": 17650 + }, + { + "epoch": 4.696913251729644, + "grad_norm": 0.3272211253643036, + "learning_rate": 1.8476438032852037e-09, + "loss": 0.1803, + "step": 17651 + }, + { + "epoch": 4.697179350718467, + "grad_norm": 0.3598760664463043, + "learning_rate": 1.8444137416130933e-09, + "loss": 0.1856, + "step": 17652 + }, + { + "epoch": 4.697445449707291, + "grad_norm": 0.36096274852752686, + "learning_rate": 1.8411864795377596e-09, + "loss": 0.1909, + "step": 17653 + }, + { + "epoch": 4.697711548696115, + "grad_norm": 0.40317389369010925, + "learning_rate": 1.8379620171512622e-09, + "loss": 0.1809, + "step": 17654 + }, + { + "epoch": 4.697977647684938, + "grad_norm": 0.27542153000831604, + "learning_rate": 1.8347403545455497e-09, + "loss": 0.1831, + "step": 17655 + }, + { + "epoch": 4.6982437466737625, + "grad_norm": 0.289325475692749, + "learning_rate": 1.8315214918125266e-09, + "loss": 0.1668, + "step": 17656 + }, + { + "epoch": 4.698509845662587, + "grad_norm": 0.3866308629512787, + "learning_rate": 1.828305429043997e-09, + "loss": 0.1937, + "step": 17657 + }, + { + "epoch": 4.698775944651411, + "grad_norm": 0.3003455400466919, + "learning_rate": 1.8250921663316987e-09, + "loss": 0.175, + "step": 17658 + }, + { + "epoch": 4.699042043640234, + "grad_norm": 0.36937686800956726, + "learning_rate": 1.8218817037672697e-09, + "loss": 0.1735, + "step": 17659 + }, + { + "epoch": 4.699308142629058, + "grad_norm": 0.6377903819084167, + "learning_rate": 1.8186740414422807e-09, + "loss": 0.1707, + "step": 17660 + }, + { + "epoch": 4.699574241617881, + "grad_norm": 0.28088071942329407, + "learning_rate": 1.8154691794482257e-09, + "loss": 0.1749, + "step": 17661 + }, + { + "epoch": 4.6998403406067055, + "grad_norm": 0.4097791910171509, + "learning_rate": 1.812267117876498e-09, + "loss": 0.1875, + "step": 17662 + }, + { + "epoch": 4.70010643959553, + "grad_norm": 0.3111652433872223, + "learning_rate": 1.809067856818447e-09, + "loss": 0.1855, + "step": 17663 + }, + { + "epoch": 4.700372538584354, + "grad_norm": 0.3178160786628723, + "learning_rate": 1.8058713963653216e-09, + "loss": 0.1923, + "step": 17664 + }, + { + "epoch": 4.700638637573177, + "grad_norm": 0.362361878156662, + "learning_rate": 1.8026777366082825e-09, + "loss": 0.1592, + "step": 17665 + }, + { + "epoch": 4.700904736562001, + "grad_norm": 0.3787827789783478, + "learning_rate": 1.799486877638412e-09, + "loss": 0.166, + "step": 17666 + }, + { + "epoch": 4.701170835550825, + "grad_norm": 0.25980308651924133, + "learning_rate": 1.7962988195467377e-09, + "loss": 0.1527, + "step": 17667 + }, + { + "epoch": 4.7014369345396485, + "grad_norm": 0.270721971988678, + "learning_rate": 1.7931135624241756e-09, + "loss": 0.1784, + "step": 17668 + }, + { + "epoch": 4.701703033528473, + "grad_norm": 0.3686656355857849, + "learning_rate": 1.7899311063615863e-09, + "loss": 0.1806, + "step": 17669 + }, + { + "epoch": 4.701969132517297, + "grad_norm": 0.297717809677124, + "learning_rate": 1.7867514514497195e-09, + "loss": 0.1769, + "step": 17670 + }, + { + "epoch": 4.70223523150612, + "grad_norm": 0.2707907557487488, + "learning_rate": 1.7835745977792915e-09, + "loss": 0.1608, + "step": 17671 + }, + { + "epoch": 4.702501330494944, + "grad_norm": 0.3712916076183319, + "learning_rate": 1.7804005454408966e-09, + "loss": 0.1846, + "step": 17672 + }, + { + "epoch": 4.702767429483768, + "grad_norm": 0.32093334197998047, + "learning_rate": 1.7772292945250734e-09, + "loss": 0.1851, + "step": 17673 + }, + { + "epoch": 4.7030335284725915, + "grad_norm": 0.2756301164627075, + "learning_rate": 1.7740608451222717e-09, + "loss": 0.1567, + "step": 17674 + }, + { + "epoch": 4.703299627461416, + "grad_norm": 0.2886989414691925, + "learning_rate": 1.7708951973228526e-09, + "loss": 0.1671, + "step": 17675 + }, + { + "epoch": 4.70356572645024, + "grad_norm": 0.43924033641815186, + "learning_rate": 1.7677323512171216e-09, + "loss": 0.1877, + "step": 17676 + }, + { + "epoch": 4.703831825439063, + "grad_norm": 0.301803320646286, + "learning_rate": 1.7645723068952733e-09, + "loss": 0.1748, + "step": 17677 + }, + { + "epoch": 4.704097924427887, + "grad_norm": 0.36034631729125977, + "learning_rate": 1.7614150644474468e-09, + "loss": 0.1791, + "step": 17678 + }, + { + "epoch": 4.704364023416711, + "grad_norm": 0.34212571382522583, + "learning_rate": 1.758260623963692e-09, + "loss": 0.1615, + "step": 17679 + }, + { + "epoch": 4.7046301224055345, + "grad_norm": 0.30497506260871887, + "learning_rate": 1.755108985533993e-09, + "loss": 0.1739, + "step": 17680 + }, + { + "epoch": 4.704896221394359, + "grad_norm": 0.34941238164901733, + "learning_rate": 1.7519601492482106e-09, + "loss": 0.1658, + "step": 17681 + }, + { + "epoch": 4.705162320383183, + "grad_norm": 0.26293501257896423, + "learning_rate": 1.7488141151961956e-09, + "loss": 0.1717, + "step": 17682 + }, + { + "epoch": 4.705428419372007, + "grad_norm": 0.2994249165058136, + "learning_rate": 1.7456708834676425e-09, + "loss": 0.1776, + "step": 17683 + }, + { + "epoch": 4.70569451836083, + "grad_norm": 0.28634053468704224, + "learning_rate": 1.7425304541522245e-09, + "loss": 0.173, + "step": 17684 + }, + { + "epoch": 4.705960617349654, + "grad_norm": 0.2720731496810913, + "learning_rate": 1.7393928273395031e-09, + "loss": 0.1737, + "step": 17685 + }, + { + "epoch": 4.7062267163384774, + "grad_norm": 0.3486601710319519, + "learning_rate": 1.7362580031189734e-09, + "loss": 0.1795, + "step": 17686 + }, + { + "epoch": 4.706492815327302, + "grad_norm": 0.29293686151504517, + "learning_rate": 1.7331259815800526e-09, + "loss": 0.1934, + "step": 17687 + }, + { + "epoch": 4.706758914316126, + "grad_norm": 0.2848157286643982, + "learning_rate": 1.7299967628120692e-09, + "loss": 0.1643, + "step": 17688 + }, + { + "epoch": 4.70702501330495, + "grad_norm": 0.2889368236064911, + "learning_rate": 1.7268703469042633e-09, + "loss": 0.175, + "step": 17689 + }, + { + "epoch": 4.707291112293773, + "grad_norm": 0.2571830451488495, + "learning_rate": 1.723746733945819e-09, + "loss": 0.1669, + "step": 17690 + }, + { + "epoch": 4.707557211282597, + "grad_norm": 0.3309634029865265, + "learning_rate": 1.7206259240258203e-09, + "loss": 0.1702, + "step": 17691 + }, + { + "epoch": 4.707823310271421, + "grad_norm": 0.2942509949207306, + "learning_rate": 1.7175079172332852e-09, + "loss": 0.1768, + "step": 17692 + }, + { + "epoch": 4.7080894092602446, + "grad_norm": 0.31506454944610596, + "learning_rate": 1.7143927136571423e-09, + "loss": 0.155, + "step": 17693 + }, + { + "epoch": 4.708355508249069, + "grad_norm": 0.34612980484962463, + "learning_rate": 1.7112803133862541e-09, + "loss": 0.1699, + "step": 17694 + }, + { + "epoch": 4.708621607237893, + "grad_norm": 0.34769293665885925, + "learning_rate": 1.7081707165093828e-09, + "loss": 0.1903, + "step": 17695 + }, + { + "epoch": 4.708887706226716, + "grad_norm": 0.2880905568599701, + "learning_rate": 1.705063923115213e-09, + "loss": 0.1873, + "step": 17696 + }, + { + "epoch": 4.70915380521554, + "grad_norm": 0.28999775648117065, + "learning_rate": 1.701959933292374e-09, + "loss": 0.1868, + "step": 17697 + }, + { + "epoch": 4.709419904204364, + "grad_norm": 0.2600330114364624, + "learning_rate": 1.6988587471293725e-09, + "loss": 0.1523, + "step": 17698 + }, + { + "epoch": 4.7096860031931875, + "grad_norm": 0.34408310055732727, + "learning_rate": 1.6957603647146934e-09, + "loss": 0.165, + "step": 17699 + }, + { + "epoch": 4.709952102182012, + "grad_norm": 0.2992124557495117, + "learning_rate": 1.6926647861366772e-09, + "loss": 0.1675, + "step": 17700 + }, + { + "epoch": 4.710218201170836, + "grad_norm": 4.1468892097473145, + "learning_rate": 1.6895720114836531e-09, + "loss": 0.1985, + "step": 17701 + }, + { + "epoch": 4.710484300159659, + "grad_norm": 0.28776031732559204, + "learning_rate": 1.686482040843795e-09, + "loss": 0.1806, + "step": 17702 + }, + { + "epoch": 4.710750399148483, + "grad_norm": 0.29495733976364136, + "learning_rate": 1.6833948743052661e-09, + "loss": 0.1775, + "step": 17703 + }, + { + "epoch": 4.711016498137307, + "grad_norm": 0.3129133880138397, + "learning_rate": 1.6803105119560846e-09, + "loss": 0.1716, + "step": 17704 + }, + { + "epoch": 4.7112825971261305, + "grad_norm": 0.26638445258140564, + "learning_rate": 1.6772289538842576e-09, + "loss": 0.1495, + "step": 17705 + }, + { + "epoch": 4.711548696114955, + "grad_norm": 0.3843393325805664, + "learning_rate": 1.6741502001776597e-09, + "loss": 0.1667, + "step": 17706 + }, + { + "epoch": 4.711814795103779, + "grad_norm": 0.24765735864639282, + "learning_rate": 1.6710742509240983e-09, + "loss": 0.1526, + "step": 17707 + }, + { + "epoch": 4.712080894092603, + "grad_norm": 0.3453879952430725, + "learning_rate": 1.668001106211314e-09, + "loss": 0.161, + "step": 17708 + }, + { + "epoch": 4.712346993081426, + "grad_norm": 0.3042939007282257, + "learning_rate": 1.6649307661269707e-09, + "loss": 0.1704, + "step": 17709 + }, + { + "epoch": 4.71261309207025, + "grad_norm": 0.2995397448539734, + "learning_rate": 1.66186323075862e-09, + "loss": 0.1796, + "step": 17710 + }, + { + "epoch": 4.7128791910590735, + "grad_norm": 0.29681238532066345, + "learning_rate": 1.6587985001937698e-09, + "loss": 0.1752, + "step": 17711 + }, + { + "epoch": 4.713145290047898, + "grad_norm": 0.26836448907852173, + "learning_rate": 1.6557365745198172e-09, + "loss": 0.165, + "step": 17712 + }, + { + "epoch": 4.713411389036722, + "grad_norm": 0.28919246792793274, + "learning_rate": 1.6526774538240917e-09, + "loss": 0.1772, + "step": 17713 + }, + { + "epoch": 4.713677488025546, + "grad_norm": 0.3090716302394867, + "learning_rate": 1.6496211381938686e-09, + "loss": 0.1743, + "step": 17714 + }, + { + "epoch": 4.713943587014369, + "grad_norm": 0.2870621085166931, + "learning_rate": 1.6465676277163109e-09, + "loss": 0.1698, + "step": 17715 + }, + { + "epoch": 4.714209686003193, + "grad_norm": 0.26933443546295166, + "learning_rate": 1.6435169224785051e-09, + "loss": 0.1807, + "step": 17716 + }, + { + "epoch": 4.714475784992017, + "grad_norm": 0.2782144844532013, + "learning_rate": 1.64046902256747e-09, + "loss": 0.1601, + "step": 17717 + }, + { + "epoch": 4.714741883980841, + "grad_norm": 0.42266416549682617, + "learning_rate": 1.6374239280701364e-09, + "loss": 0.1823, + "step": 17718 + }, + { + "epoch": 4.715007982969665, + "grad_norm": 0.28811630606651306, + "learning_rate": 1.6343816390733455e-09, + "loss": 0.1754, + "step": 17719 + }, + { + "epoch": 4.715274081958489, + "grad_norm": 0.30324631929397583, + "learning_rate": 1.631342155663884e-09, + "loss": 0.1727, + "step": 17720 + }, + { + "epoch": 4.715540180947312, + "grad_norm": 0.3846995532512665, + "learning_rate": 1.6283054779284377e-09, + "loss": 0.1987, + "step": 17721 + }, + { + "epoch": 4.715806279936136, + "grad_norm": 0.28672224283218384, + "learning_rate": 1.6252716059536153e-09, + "loss": 0.1516, + "step": 17722 + }, + { + "epoch": 4.71607237892496, + "grad_norm": 0.424733966588974, + "learning_rate": 1.6222405398259586e-09, + "loss": 0.1835, + "step": 17723 + }, + { + "epoch": 4.716338477913784, + "grad_norm": 0.2779199182987213, + "learning_rate": 1.6192122796319208e-09, + "loss": 0.1632, + "step": 17724 + }, + { + "epoch": 4.716604576902608, + "grad_norm": 0.39627349376678467, + "learning_rate": 1.6161868254578548e-09, + "loss": 0.1864, + "step": 17725 + }, + { + "epoch": 4.716870675891432, + "grad_norm": 0.33906289935112, + "learning_rate": 1.6131641773900806e-09, + "loss": 0.1896, + "step": 17726 + }, + { + "epoch": 4.717136774880255, + "grad_norm": 0.27162423729896545, + "learning_rate": 1.6101443355147849e-09, + "loss": 0.1672, + "step": 17727 + }, + { + "epoch": 4.717402873869079, + "grad_norm": 0.2953760027885437, + "learning_rate": 1.6071272999181095e-09, + "loss": 0.1667, + "step": 17728 + }, + { + "epoch": 4.717668972857903, + "grad_norm": 0.3776039779186249, + "learning_rate": 1.6041130706861082e-09, + "loss": 0.1792, + "step": 17729 + }, + { + "epoch": 4.717935071846727, + "grad_norm": 0.40893277525901794, + "learning_rate": 1.601101647904768e-09, + "loss": 0.1729, + "step": 17730 + }, + { + "epoch": 4.718201170835551, + "grad_norm": 0.2793261706829071, + "learning_rate": 1.5980930316599528e-09, + "loss": 0.1646, + "step": 17731 + }, + { + "epoch": 4.718467269824375, + "grad_norm": 0.38611891865730286, + "learning_rate": 1.5950872220374833e-09, + "loss": 0.1976, + "step": 17732 + }, + { + "epoch": 4.718733368813199, + "grad_norm": 0.3761187493801117, + "learning_rate": 1.592084219123102e-09, + "loss": 0.1687, + "step": 17733 + }, + { + "epoch": 4.718999467802022, + "grad_norm": 0.2766231298446655, + "learning_rate": 1.5890840230024404e-09, + "loss": 0.1813, + "step": 17734 + }, + { + "epoch": 4.719265566790846, + "grad_norm": 0.272944837808609, + "learning_rate": 1.5860866337610967e-09, + "loss": 0.1648, + "step": 17735 + }, + { + "epoch": 4.7195316657796695, + "grad_norm": 0.29904982447624207, + "learning_rate": 1.5830920514845359e-09, + "loss": 0.1786, + "step": 17736 + }, + { + "epoch": 4.719797764768494, + "grad_norm": 0.34499871730804443, + "learning_rate": 1.5801002762582005e-09, + "loss": 0.188, + "step": 17737 + }, + { + "epoch": 4.720063863757318, + "grad_norm": 0.4432208240032196, + "learning_rate": 1.5771113081673892e-09, + "loss": 0.17, + "step": 17738 + }, + { + "epoch": 4.720329962746142, + "grad_norm": 0.3658180832862854, + "learning_rate": 1.5741251472973781e-09, + "loss": 0.18, + "step": 17739 + }, + { + "epoch": 4.720596061734965, + "grad_norm": 0.28705984354019165, + "learning_rate": 1.5711417937333216e-09, + "loss": 0.193, + "step": 17740 + }, + { + "epoch": 4.720862160723789, + "grad_norm": 0.32938987016677856, + "learning_rate": 1.5681612475603289e-09, + "loss": 0.1791, + "step": 17741 + }, + { + "epoch": 4.721128259712613, + "grad_norm": 0.2751769423484802, + "learning_rate": 1.565183508863388e-09, + "loss": 0.1663, + "step": 17742 + }, + { + "epoch": 4.721394358701437, + "grad_norm": 0.2550356984138489, + "learning_rate": 1.5622085777274418e-09, + "loss": 0.1663, + "step": 17743 + }, + { + "epoch": 4.721660457690261, + "grad_norm": 0.36674872040748596, + "learning_rate": 1.5592364542373448e-09, + "loss": 0.1911, + "step": 17744 + }, + { + "epoch": 4.721926556679085, + "grad_norm": 0.3151037395000458, + "learning_rate": 1.5562671384778736e-09, + "loss": 0.2021, + "step": 17745 + }, + { + "epoch": 4.722192655667908, + "grad_norm": 0.27500197291374207, + "learning_rate": 1.5533006305336937e-09, + "loss": 0.1736, + "step": 17746 + }, + { + "epoch": 4.722458754656732, + "grad_norm": 0.5361608862876892, + "learning_rate": 1.5503369304894488e-09, + "loss": 0.1659, + "step": 17747 + }, + { + "epoch": 4.722724853645556, + "grad_norm": 0.37194082140922546, + "learning_rate": 1.5473760384296485e-09, + "loss": 0.17, + "step": 17748 + }, + { + "epoch": 4.72299095263438, + "grad_norm": 0.2773572504520416, + "learning_rate": 1.5444179544387592e-09, + "loss": 0.1652, + "step": 17749 + }, + { + "epoch": 4.723257051623204, + "grad_norm": 0.3641378879547119, + "learning_rate": 1.5414626786011242e-09, + "loss": 0.18, + "step": 17750 + }, + { + "epoch": 4.723523150612028, + "grad_norm": 0.3064199388027191, + "learning_rate": 1.538510211001054e-09, + "loss": 0.1921, + "step": 17751 + }, + { + "epoch": 4.723789249600852, + "grad_norm": 0.3588404953479767, + "learning_rate": 1.5355605517227587e-09, + "loss": 0.1906, + "step": 17752 + }, + { + "epoch": 4.724055348589675, + "grad_norm": 0.34262916445732117, + "learning_rate": 1.5326137008503715e-09, + "loss": 0.1709, + "step": 17753 + }, + { + "epoch": 4.724321447578499, + "grad_norm": 0.3457978069782257, + "learning_rate": 1.5296696584679359e-09, + "loss": 0.1733, + "step": 17754 + }, + { + "epoch": 4.724587546567323, + "grad_norm": 0.3051776587963104, + "learning_rate": 1.5267284246594182e-09, + "loss": 0.1836, + "step": 17755 + }, + { + "epoch": 4.724853645556147, + "grad_norm": 0.3677484393119812, + "learning_rate": 1.5237899995087178e-09, + "loss": 0.1776, + "step": 17756 + }, + { + "epoch": 4.725119744544971, + "grad_norm": 0.3516346216201782, + "learning_rate": 1.5208543830996346e-09, + "loss": 0.1663, + "step": 17757 + }, + { + "epoch": 4.725385843533795, + "grad_norm": 0.25572025775909424, + "learning_rate": 1.5179215755159013e-09, + "loss": 0.1572, + "step": 17758 + }, + { + "epoch": 4.725651942522618, + "grad_norm": 0.3813213109970093, + "learning_rate": 1.5149915768411736e-09, + "loss": 0.1948, + "step": 17759 + }, + { + "epoch": 4.725918041511442, + "grad_norm": 0.37774455547332764, + "learning_rate": 1.5120643871590177e-09, + "loss": 0.1844, + "step": 17760 + }, + { + "epoch": 4.7261841405002665, + "grad_norm": 0.27475208044052124, + "learning_rate": 1.5091400065529226e-09, + "loss": 0.1801, + "step": 17761 + }, + { + "epoch": 4.72645023948909, + "grad_norm": 0.3131062686443329, + "learning_rate": 1.506218435106299e-09, + "loss": 0.1933, + "step": 17762 + }, + { + "epoch": 4.726716338477914, + "grad_norm": 0.2709468603134155, + "learning_rate": 1.5032996729024694e-09, + "loss": 0.1672, + "step": 17763 + }, + { + "epoch": 4.726982437466738, + "grad_norm": 0.4142789840698242, + "learning_rate": 1.5003837200246894e-09, + "loss": 0.1764, + "step": 17764 + }, + { + "epoch": 4.727248536455561, + "grad_norm": 0.2729603052139282, + "learning_rate": 1.4974705765561256e-09, + "loss": 0.178, + "step": 17765 + }, + { + "epoch": 4.727514635444385, + "grad_norm": 0.30984994769096375, + "learning_rate": 1.4945602425798786e-09, + "loss": 0.1682, + "step": 17766 + }, + { + "epoch": 4.7277807344332095, + "grad_norm": 0.34583303332328796, + "learning_rate": 1.4916527181789373e-09, + "loss": 0.1627, + "step": 17767 + }, + { + "epoch": 4.728046833422033, + "grad_norm": 0.43235349655151367, + "learning_rate": 1.4887480034362355e-09, + "loss": 0.2018, + "step": 17768 + }, + { + "epoch": 4.728312932410857, + "grad_norm": 0.2665811777114868, + "learning_rate": 1.4858460984346399e-09, + "loss": 0.1657, + "step": 17769 + }, + { + "epoch": 4.728579031399681, + "grad_norm": 0.2774021327495575, + "learning_rate": 1.4829470032568847e-09, + "loss": 0.1755, + "step": 17770 + }, + { + "epoch": 4.728845130388504, + "grad_norm": 0.28952789306640625, + "learning_rate": 1.4800507179856926e-09, + "loss": 0.1577, + "step": 17771 + }, + { + "epoch": 4.729111229377328, + "grad_norm": 0.3529317080974579, + "learning_rate": 1.4771572427036417e-09, + "loss": 0.18, + "step": 17772 + }, + { + "epoch": 4.729377328366152, + "grad_norm": 0.3169468343257904, + "learning_rate": 1.4742665774932883e-09, + "loss": 0.164, + "step": 17773 + }, + { + "epoch": 4.729643427354976, + "grad_norm": 0.37129929661750793, + "learning_rate": 1.4713787224370555e-09, + "loss": 0.184, + "step": 17774 + }, + { + "epoch": 4.7299095263438, + "grad_norm": 0.268008828163147, + "learning_rate": 1.468493677617333e-09, + "loss": 0.1729, + "step": 17775 + }, + { + "epoch": 4.730175625332624, + "grad_norm": 0.2757715880870819, + "learning_rate": 1.4656114431163768e-09, + "loss": 0.1857, + "step": 17776 + }, + { + "epoch": 4.730441724321448, + "grad_norm": 0.2793898582458496, + "learning_rate": 1.4627320190164327e-09, + "loss": 0.1819, + "step": 17777 + }, + { + "epoch": 4.730707823310271, + "grad_norm": 0.280086874961853, + "learning_rate": 1.4598554053996015e-09, + "loss": 0.1706, + "step": 17778 + }, + { + "epoch": 4.730973922299095, + "grad_norm": 0.25854912400245667, + "learning_rate": 1.4569816023479397e-09, + "loss": 0.1545, + "step": 17779 + }, + { + "epoch": 4.731240021287919, + "grad_norm": 0.2715533375740051, + "learning_rate": 1.4541106099434042e-09, + "loss": 0.1752, + "step": 17780 + }, + { + "epoch": 4.731506120276743, + "grad_norm": 0.2790507376194, + "learning_rate": 1.451242428267896e-09, + "loss": 0.1775, + "step": 17781 + }, + { + "epoch": 4.731772219265567, + "grad_norm": 0.28007325530052185, + "learning_rate": 1.4483770574032162e-09, + "loss": 0.172, + "step": 17782 + }, + { + "epoch": 4.732038318254391, + "grad_norm": 0.27082329988479614, + "learning_rate": 1.4455144974310883e-09, + "loss": 0.1689, + "step": 17783 + }, + { + "epoch": 4.732304417243214, + "grad_norm": 0.314773291349411, + "learning_rate": 1.4426547484331587e-09, + "loss": 0.1797, + "step": 17784 + }, + { + "epoch": 4.732570516232038, + "grad_norm": 0.42654120922088623, + "learning_rate": 1.4397978104909947e-09, + "loss": 0.1599, + "step": 17785 + }, + { + "epoch": 4.7328366152208625, + "grad_norm": 0.2715577483177185, + "learning_rate": 1.4369436836860759e-09, + "loss": 0.1769, + "step": 17786 + }, + { + "epoch": 4.733102714209686, + "grad_norm": 0.33580508828163147, + "learning_rate": 1.434092368099804e-09, + "loss": 0.1637, + "step": 17787 + }, + { + "epoch": 4.73336881319851, + "grad_norm": 0.6024379134178162, + "learning_rate": 1.4312438638135249e-09, + "loss": 0.1712, + "step": 17788 + }, + { + "epoch": 4.733634912187334, + "grad_norm": 0.31424078345298767, + "learning_rate": 1.4283981709084735e-09, + "loss": 0.1536, + "step": 17789 + }, + { + "epoch": 4.733901011176157, + "grad_norm": 0.28871244192123413, + "learning_rate": 1.4255552894658184e-09, + "loss": 0.1815, + "step": 17790 + }, + { + "epoch": 4.734167110164981, + "grad_norm": 0.44698256254196167, + "learning_rate": 1.4227152195666281e-09, + "loss": 0.1745, + "step": 17791 + }, + { + "epoch": 4.7344332091538055, + "grad_norm": 0.287891685962677, + "learning_rate": 1.4198779612919265e-09, + "loss": 0.1904, + "step": 17792 + }, + { + "epoch": 4.734699308142629, + "grad_norm": 0.26727887988090515, + "learning_rate": 1.4170435147226157e-09, + "loss": 0.1648, + "step": 17793 + }, + { + "epoch": 4.734965407131453, + "grad_norm": 0.3026300072669983, + "learning_rate": 1.4142118799395642e-09, + "loss": 0.1853, + "step": 17794 + }, + { + "epoch": 4.735231506120277, + "grad_norm": 0.3512625992298126, + "learning_rate": 1.4113830570235297e-09, + "loss": 0.1903, + "step": 17795 + }, + { + "epoch": 4.7354976051091, + "grad_norm": 0.34810179471969604, + "learning_rate": 1.408557046055192e-09, + "loss": 0.1917, + "step": 17796 + }, + { + "epoch": 4.735763704097924, + "grad_norm": 0.4088067412376404, + "learning_rate": 1.4057338471151425e-09, + "loss": 0.1835, + "step": 17797 + }, + { + "epoch": 4.7360298030867485, + "grad_norm": 0.28854915499687195, + "learning_rate": 1.4029134602839388e-09, + "loss": 0.1631, + "step": 17798 + }, + { + "epoch": 4.736295902075572, + "grad_norm": 0.3155415654182434, + "learning_rate": 1.400095885641983e-09, + "loss": 0.1691, + "step": 17799 + }, + { + "epoch": 4.736562001064396, + "grad_norm": 0.3465333580970764, + "learning_rate": 1.3972811232696669e-09, + "loss": 0.1753, + "step": 17800 + }, + { + "epoch": 4.73682810005322, + "grad_norm": 0.37767279148101807, + "learning_rate": 1.394469173247248e-09, + "loss": 0.2016, + "step": 17801 + }, + { + "epoch": 4.737094199042044, + "grad_norm": 0.44992366433143616, + "learning_rate": 1.3916600356549624e-09, + "loss": 0.1867, + "step": 17802 + }, + { + "epoch": 4.737360298030867, + "grad_norm": 0.3038089871406555, + "learning_rate": 1.3888537105729014e-09, + "loss": 0.1684, + "step": 17803 + }, + { + "epoch": 4.7376263970196915, + "grad_norm": 0.37486493587493896, + "learning_rate": 1.3860501980811346e-09, + "loss": 0.1964, + "step": 17804 + }, + { + "epoch": 4.737892496008515, + "grad_norm": 0.36093375086784363, + "learning_rate": 1.3832494982595977e-09, + "loss": 0.1823, + "step": 17805 + }, + { + "epoch": 4.738158594997339, + "grad_norm": 0.27546244859695435, + "learning_rate": 1.3804516111881826e-09, + "loss": 0.1843, + "step": 17806 + }, + { + "epoch": 4.738424693986163, + "grad_norm": 0.26822367310523987, + "learning_rate": 1.377656536946703e-09, + "loss": 0.1919, + "step": 17807 + }, + { + "epoch": 4.738690792974987, + "grad_norm": 0.2878105938434601, + "learning_rate": 1.374864275614862e-09, + "loss": 0.1863, + "step": 17808 + }, + { + "epoch": 4.73895689196381, + "grad_norm": 0.3339576721191406, + "learning_rate": 1.372074827272307e-09, + "loss": 0.1714, + "step": 17809 + }, + { + "epoch": 4.739222990952634, + "grad_norm": 0.33117491006851196, + "learning_rate": 1.3692881919985965e-09, + "loss": 0.18, + "step": 17810 + }, + { + "epoch": 4.739489089941459, + "grad_norm": 0.2682839334011078, + "learning_rate": 1.3665043698732226e-09, + "loss": 0.1691, + "step": 17811 + }, + { + "epoch": 4.739755188930282, + "grad_norm": 0.27253732085227966, + "learning_rate": 1.363723360975566e-09, + "loss": 0.1632, + "step": 17812 + }, + { + "epoch": 4.740021287919106, + "grad_norm": 0.3684050738811493, + "learning_rate": 1.3609451653849746e-09, + "loss": 0.1874, + "step": 17813 + }, + { + "epoch": 4.74028738690793, + "grad_norm": 0.41336578130722046, + "learning_rate": 1.3581697831806517e-09, + "loss": 0.171, + "step": 17814 + }, + { + "epoch": 4.740553485896753, + "grad_norm": 0.3713347911834717, + "learning_rate": 1.3553972144417892e-09, + "loss": 0.1775, + "step": 17815 + }, + { + "epoch": 4.740819584885577, + "grad_norm": 0.2777341902256012, + "learning_rate": 1.3526274592474463e-09, + "loss": 0.1682, + "step": 17816 + }, + { + "epoch": 4.7410856838744015, + "grad_norm": 0.33536291122436523, + "learning_rate": 1.3498605176766376e-09, + "loss": 0.1836, + "step": 17817 + }, + { + "epoch": 4.741351782863225, + "grad_norm": 0.28101158142089844, + "learning_rate": 1.3470963898082666e-09, + "loss": 0.1692, + "step": 17818 + }, + { + "epoch": 4.741617881852049, + "grad_norm": 0.43144792318344116, + "learning_rate": 1.3443350757211814e-09, + "loss": 0.2061, + "step": 17819 + }, + { + "epoch": 4.741883980840873, + "grad_norm": 0.28937000036239624, + "learning_rate": 1.3415765754941411e-09, + "loss": 0.2, + "step": 17820 + }, + { + "epoch": 4.742150079829696, + "grad_norm": 0.28382959961891174, + "learning_rate": 1.3388208892058162e-09, + "loss": 0.1756, + "step": 17821 + }, + { + "epoch": 4.74241617881852, + "grad_norm": 0.30103224515914917, + "learning_rate": 1.3360680169348105e-09, + "loss": 0.184, + "step": 17822 + }, + { + "epoch": 4.7426822778073445, + "grad_norm": 0.4137514531612396, + "learning_rate": 1.3333179587596389e-09, + "loss": 0.1799, + "step": 17823 + }, + { + "epoch": 4.742948376796168, + "grad_norm": 0.2506806254386902, + "learning_rate": 1.3305707147587274e-09, + "loss": 0.1633, + "step": 17824 + }, + { + "epoch": 4.743214475784992, + "grad_norm": 0.3835969865322113, + "learning_rate": 1.3278262850104583e-09, + "loss": 0.1796, + "step": 17825 + }, + { + "epoch": 4.743480574773816, + "grad_norm": 0.36111679673194885, + "learning_rate": 1.3250846695931016e-09, + "loss": 0.1683, + "step": 17826 + }, + { + "epoch": 4.74374667376264, + "grad_norm": 0.3484448492527008, + "learning_rate": 1.3223458685848287e-09, + "loss": 0.2084, + "step": 17827 + }, + { + "epoch": 4.744012772751463, + "grad_norm": 0.35561302304267883, + "learning_rate": 1.319609882063788e-09, + "loss": 0.1901, + "step": 17828 + }, + { + "epoch": 4.7442788717402875, + "grad_norm": 0.32116201519966125, + "learning_rate": 1.3168767101080057e-09, + "loss": 0.1661, + "step": 17829 + }, + { + "epoch": 4.744544970729111, + "grad_norm": 0.2740694582462311, + "learning_rate": 1.31414635279542e-09, + "loss": 0.1716, + "step": 17830 + }, + { + "epoch": 4.744811069717935, + "grad_norm": 0.25807005167007446, + "learning_rate": 1.3114188102039237e-09, + "loss": 0.1484, + "step": 17831 + }, + { + "epoch": 4.745077168706759, + "grad_norm": 0.29819729924201965, + "learning_rate": 1.3086940824113102e-09, + "loss": 0.1916, + "step": 17832 + }, + { + "epoch": 4.745343267695583, + "grad_norm": 0.32215452194213867, + "learning_rate": 1.3059721694952841e-09, + "loss": 0.1718, + "step": 17833 + }, + { + "epoch": 4.745609366684406, + "grad_norm": 0.2928013503551483, + "learning_rate": 1.3032530715335054e-09, + "loss": 0.1647, + "step": 17834 + }, + { + "epoch": 4.7458754656732305, + "grad_norm": 0.4106881022453308, + "learning_rate": 1.3005367886034901e-09, + "loss": 0.1936, + "step": 17835 + }, + { + "epoch": 4.746141564662055, + "grad_norm": 0.34319502115249634, + "learning_rate": 1.2978233207827539e-09, + "loss": 0.1539, + "step": 17836 + }, + { + "epoch": 4.746407663650878, + "grad_norm": 0.36955735087394714, + "learning_rate": 1.2951126681486569e-09, + "loss": 0.1784, + "step": 17837 + }, + { + "epoch": 4.746673762639702, + "grad_norm": 0.2637960910797119, + "learning_rate": 1.2924048307785151e-09, + "loss": 0.1653, + "step": 17838 + }, + { + "epoch": 4.746939861628526, + "grad_norm": 0.3043431043624878, + "learning_rate": 1.2896998087495781e-09, + "loss": 0.1862, + "step": 17839 + }, + { + "epoch": 4.747205960617349, + "grad_norm": 0.33565375208854675, + "learning_rate": 1.286997602138995e-09, + "loss": 0.1806, + "step": 17840 + }, + { + "epoch": 4.7474720596061735, + "grad_norm": 0.25753605365753174, + "learning_rate": 1.2842982110238375e-09, + "loss": 0.1586, + "step": 17841 + }, + { + "epoch": 4.747738158594998, + "grad_norm": 0.3896840810775757, + "learning_rate": 1.2816016354810888e-09, + "loss": 0.1853, + "step": 17842 + }, + { + "epoch": 4.748004257583821, + "grad_norm": 0.26700296998023987, + "learning_rate": 1.2789078755876759e-09, + "loss": 0.1556, + "step": 17843 + }, + { + "epoch": 4.748270356572645, + "grad_norm": 0.35201096534729004, + "learning_rate": 1.276216931420404e-09, + "loss": 0.1932, + "step": 17844 + }, + { + "epoch": 4.748536455561469, + "grad_norm": 0.26877519488334656, + "learning_rate": 1.2735288030560565e-09, + "loss": 0.1825, + "step": 17845 + }, + { + "epoch": 4.748802554550292, + "grad_norm": 0.38033327460289, + "learning_rate": 1.2708434905712827e-09, + "loss": 0.1801, + "step": 17846 + }, + { + "epoch": 4.7490686535391164, + "grad_norm": 0.29658856987953186, + "learning_rate": 1.2681609940426775e-09, + "loss": 0.1792, + "step": 17847 + }, + { + "epoch": 4.749334752527941, + "grad_norm": 0.26994261145591736, + "learning_rate": 1.2654813135467568e-09, + "loss": 0.1681, + "step": 17848 + }, + { + "epoch": 4.749600851516764, + "grad_norm": 0.2949330806732178, + "learning_rate": 1.2628044491599598e-09, + "loss": 0.182, + "step": 17849 + }, + { + "epoch": 4.749866950505588, + "grad_norm": 0.27710822224617004, + "learning_rate": 1.2601304009586033e-09, + "loss": 0.1884, + "step": 17850 + }, + { + "epoch": 4.750133049494412, + "grad_norm": 0.31626781821250916, + "learning_rate": 1.2574591690189929e-09, + "loss": 0.1631, + "step": 17851 + }, + { + "epoch": 4.750399148483236, + "grad_norm": 0.2742205262184143, + "learning_rate": 1.2547907534172785e-09, + "loss": 0.1792, + "step": 17852 + }, + { + "epoch": 4.750665247472059, + "grad_norm": 0.28727295994758606, + "learning_rate": 1.2521251542296106e-09, + "loss": 0.2033, + "step": 17853 + }, + { + "epoch": 4.7509313464608836, + "grad_norm": 0.29191920161247253, + "learning_rate": 1.2494623715319951e-09, + "loss": 0.1831, + "step": 17854 + }, + { + "epoch": 4.751197445449708, + "grad_norm": 0.28292301297187805, + "learning_rate": 1.2468024054003934e-09, + "loss": 0.1756, + "step": 17855 + }, + { + "epoch": 4.751463544438531, + "grad_norm": 0.2898082435131073, + "learning_rate": 1.2441452559106447e-09, + "loss": 0.1646, + "step": 17856 + }, + { + "epoch": 4.751729643427355, + "grad_norm": 0.2864111065864563, + "learning_rate": 1.2414909231385662e-09, + "loss": 0.1757, + "step": 17857 + }, + { + "epoch": 4.751995742416179, + "grad_norm": 0.3270157277584076, + "learning_rate": 1.238839407159853e-09, + "loss": 0.1814, + "step": 17858 + }, + { + "epoch": 4.752261841405002, + "grad_norm": 0.4033474922180176, + "learning_rate": 1.2361907080501222e-09, + "loss": 0.1666, + "step": 17859 + }, + { + "epoch": 4.7525279403938265, + "grad_norm": 0.2827843725681305, + "learning_rate": 1.2335448258849358e-09, + "loss": 0.1669, + "step": 17860 + }, + { + "epoch": 4.752794039382651, + "grad_norm": 0.28087881207466125, + "learning_rate": 1.2309017607397554e-09, + "loss": 0.1845, + "step": 17861 + }, + { + "epoch": 4.753060138371474, + "grad_norm": 0.2836396396160126, + "learning_rate": 1.2282615126899655e-09, + "loss": 0.1947, + "step": 17862 + }, + { + "epoch": 4.753326237360298, + "grad_norm": 0.36048951745033264, + "learning_rate": 1.225624081810872e-09, + "loss": 0.1762, + "step": 17863 + }, + { + "epoch": 4.753592336349122, + "grad_norm": 0.2754611074924469, + "learning_rate": 1.2229894681777042e-09, + "loss": 0.1823, + "step": 17864 + }, + { + "epoch": 4.753858435337945, + "grad_norm": 0.3639529347419739, + "learning_rate": 1.2203576718655906e-09, + "loss": 0.185, + "step": 17865 + }, + { + "epoch": 4.7541245343267695, + "grad_norm": 0.3687724471092224, + "learning_rate": 1.2177286929496045e-09, + "loss": 0.1944, + "step": 17866 + }, + { + "epoch": 4.754390633315594, + "grad_norm": 0.3605091869831085, + "learning_rate": 1.2151025315047414e-09, + "loss": 0.2034, + "step": 17867 + }, + { + "epoch": 4.754656732304417, + "grad_norm": 0.27737540006637573, + "learning_rate": 1.212479187605897e-09, + "loss": 0.1904, + "step": 17868 + }, + { + "epoch": 4.754922831293241, + "grad_norm": 0.2928125858306885, + "learning_rate": 1.2098586613278782e-09, + "loss": 0.175, + "step": 17869 + }, + { + "epoch": 4.755188930282065, + "grad_norm": 0.3797127604484558, + "learning_rate": 1.2072409527454585e-09, + "loss": 0.1615, + "step": 17870 + }, + { + "epoch": 4.755455029270889, + "grad_norm": 0.2645503580570221, + "learning_rate": 1.2046260619332671e-09, + "loss": 0.1664, + "step": 17871 + }, + { + "epoch": 4.7557211282597125, + "grad_norm": 0.40284034609794617, + "learning_rate": 1.202013988965911e-09, + "loss": 0.1843, + "step": 17872 + }, + { + "epoch": 4.755987227248537, + "grad_norm": 0.28975802659988403, + "learning_rate": 1.1994047339178748e-09, + "loss": 0.1918, + "step": 17873 + }, + { + "epoch": 4.75625332623736, + "grad_norm": 0.2835935354232788, + "learning_rate": 1.1967982968635992e-09, + "loss": 0.1868, + "step": 17874 + }, + { + "epoch": 4.756519425226184, + "grad_norm": 0.4113791286945343, + "learning_rate": 1.1941946778774026e-09, + "loss": 0.1933, + "step": 17875 + }, + { + "epoch": 4.756785524215008, + "grad_norm": 0.37162384390830994, + "learning_rate": 1.1915938770335698e-09, + "loss": 0.1864, + "step": 17876 + }, + { + "epoch": 4.757051623203832, + "grad_norm": 0.2937052249908447, + "learning_rate": 1.1889958944062528e-09, + "loss": 0.1885, + "step": 17877 + }, + { + "epoch": 4.7573177221926555, + "grad_norm": 0.3825890123844147, + "learning_rate": 1.18640073006957e-09, + "loss": 0.1878, + "step": 17878 + }, + { + "epoch": 4.75758382118148, + "grad_norm": 0.3327023983001709, + "learning_rate": 1.183808384097551e-09, + "loss": 0.1762, + "step": 17879 + }, + { + "epoch": 4.757849920170304, + "grad_norm": 0.360503613948822, + "learning_rate": 1.1812188565641145e-09, + "loss": 0.1815, + "step": 17880 + }, + { + "epoch": 4.758116019159127, + "grad_norm": 0.36162763833999634, + "learning_rate": 1.1786321475431126e-09, + "loss": 0.174, + "step": 17881 + }, + { + "epoch": 4.758382118147951, + "grad_norm": 0.27665069699287415, + "learning_rate": 1.1760482571083418e-09, + "loss": 0.187, + "step": 17882 + }, + { + "epoch": 4.758648217136775, + "grad_norm": 0.3771215081214905, + "learning_rate": 1.1734671853335098e-09, + "loss": 0.1886, + "step": 17883 + }, + { + "epoch": 4.7589143161255985, + "grad_norm": 0.3024269640445709, + "learning_rate": 1.1708889322922021e-09, + "loss": 0.1847, + "step": 17884 + }, + { + "epoch": 4.759180415114423, + "grad_norm": 0.8521347641944885, + "learning_rate": 1.1683134980579823e-09, + "loss": 0.1924, + "step": 17885 + }, + { + "epoch": 4.759446514103247, + "grad_norm": 0.33915990591049194, + "learning_rate": 1.1657408827042914e-09, + "loss": 0.189, + "step": 17886 + }, + { + "epoch": 4.75971261309207, + "grad_norm": 0.2826217710971832, + "learning_rate": 1.1631710863045263e-09, + "loss": 0.1616, + "step": 17887 + }, + { + "epoch": 4.759978712080894, + "grad_norm": 0.30181175470352173, + "learning_rate": 1.1606041089319506e-09, + "loss": 0.1675, + "step": 17888 + }, + { + "epoch": 4.760244811069718, + "grad_norm": 0.3977470099925995, + "learning_rate": 1.1580399506597949e-09, + "loss": 0.1923, + "step": 17889 + }, + { + "epoch": 4.760510910058541, + "grad_norm": 0.408793181180954, + "learning_rate": 1.1554786115612003e-09, + "loss": 0.2106, + "step": 17890 + }, + { + "epoch": 4.760777009047366, + "grad_norm": 0.4769190847873688, + "learning_rate": 1.1529200917092308e-09, + "loss": 0.1798, + "step": 17891 + }, + { + "epoch": 4.76104310803619, + "grad_norm": 0.28526219725608826, + "learning_rate": 1.1503643911768279e-09, + "loss": 0.1775, + "step": 17892 + }, + { + "epoch": 4.761309207025013, + "grad_norm": 0.259052038192749, + "learning_rate": 1.1478115100369223e-09, + "loss": 0.1593, + "step": 17893 + }, + { + "epoch": 4.761575306013837, + "grad_norm": 0.37700992822647095, + "learning_rate": 1.1452614483623001e-09, + "loss": 0.2018, + "step": 17894 + }, + { + "epoch": 4.761841405002661, + "grad_norm": 0.3047180771827698, + "learning_rate": 1.1427142062256923e-09, + "loss": 0.1925, + "step": 17895 + }, + { + "epoch": 4.762107503991485, + "grad_norm": 0.3106410503387451, + "learning_rate": 1.1401697836997736e-09, + "loss": 0.1921, + "step": 17896 + }, + { + "epoch": 4.7623736029803085, + "grad_norm": 0.32427188754081726, + "learning_rate": 1.1376281808571087e-09, + "loss": 0.1937, + "step": 17897 + }, + { + "epoch": 4.762639701969133, + "grad_norm": 0.2536241412162781, + "learning_rate": 1.1350893977701836e-09, + "loss": 0.162, + "step": 17898 + }, + { + "epoch": 4.762905800957956, + "grad_norm": 0.39098262786865234, + "learning_rate": 1.1325534345114184e-09, + "loss": 0.1642, + "step": 17899 + }, + { + "epoch": 4.76317189994678, + "grad_norm": 0.3408176600933075, + "learning_rate": 1.130020291153133e-09, + "loss": 0.1794, + "step": 17900 + }, + { + "epoch": 4.763437998935604, + "grad_norm": 0.269589364528656, + "learning_rate": 1.1274899677675808e-09, + "loss": 0.1659, + "step": 17901 + }, + { + "epoch": 4.763704097924428, + "grad_norm": 0.269794225692749, + "learning_rate": 1.1249624644269263e-09, + "loss": 0.1724, + "step": 17902 + }, + { + "epoch": 4.7639701969132515, + "grad_norm": 0.2648991048336029, + "learning_rate": 1.1224377812032671e-09, + "loss": 0.1658, + "step": 17903 + }, + { + "epoch": 4.764236295902076, + "grad_norm": 0.254517138004303, + "learning_rate": 1.1199159181686236e-09, + "loss": 0.1654, + "step": 17904 + }, + { + "epoch": 4.7645023948909, + "grad_norm": 0.32046228647232056, + "learning_rate": 1.1173968753949048e-09, + "loss": 0.1794, + "step": 17905 + }, + { + "epoch": 4.764768493879723, + "grad_norm": 0.26117268204689026, + "learning_rate": 1.1148806529539757e-09, + "loss": 0.1682, + "step": 17906 + }, + { + "epoch": 4.765034592868547, + "grad_norm": 0.29071635007858276, + "learning_rate": 1.1123672509175785e-09, + "loss": 0.1894, + "step": 17907 + }, + { + "epoch": 4.765300691857371, + "grad_norm": 0.32097968459129333, + "learning_rate": 1.109856669357423e-09, + "loss": 0.1687, + "step": 17908 + }, + { + "epoch": 4.7655667908461945, + "grad_norm": 0.2586449980735779, + "learning_rate": 1.1073489083451183e-09, + "loss": 0.1701, + "step": 17909 + }, + { + "epoch": 4.765832889835019, + "grad_norm": 0.277692049741745, + "learning_rate": 1.1048439679521626e-09, + "loss": 0.1716, + "step": 17910 + }, + { + "epoch": 4.766098988823843, + "grad_norm": 0.5864109992980957, + "learning_rate": 1.1023418482500323e-09, + "loss": 0.1761, + "step": 17911 + }, + { + "epoch": 4.766365087812666, + "grad_norm": 0.2947007417678833, + "learning_rate": 1.0998425493100927e-09, + "loss": 0.1835, + "step": 17912 + }, + { + "epoch": 4.76663118680149, + "grad_norm": 0.28880172967910767, + "learning_rate": 1.0973460712036086e-09, + "loss": 0.1714, + "step": 17913 + }, + { + "epoch": 4.766897285790314, + "grad_norm": 0.2713455557823181, + "learning_rate": 1.09485241400179e-09, + "loss": 0.1742, + "step": 17914 + }, + { + "epoch": 4.7671633847791375, + "grad_norm": 0.32984599471092224, + "learning_rate": 1.092361577775769e-09, + "loss": 0.175, + "step": 17915 + }, + { + "epoch": 4.767429483767962, + "grad_norm": 0.2715993821620941, + "learning_rate": 1.0898735625965882e-09, + "loss": 0.1746, + "step": 17916 + }, + { + "epoch": 4.767695582756786, + "grad_norm": 0.38885611295700073, + "learning_rate": 1.0873883685352026e-09, + "loss": 0.1788, + "step": 17917 + }, + { + "epoch": 4.767961681745609, + "grad_norm": 0.4150637686252594, + "learning_rate": 1.0849059956624995e-09, + "loss": 0.1838, + "step": 17918 + }, + { + "epoch": 4.768227780734433, + "grad_norm": 0.2890927493572235, + "learning_rate": 1.0824264440493004e-09, + "loss": 0.1846, + "step": 17919 + }, + { + "epoch": 4.768493879723257, + "grad_norm": 0.29485172033309937, + "learning_rate": 1.0799497137662926e-09, + "loss": 0.1912, + "step": 17920 + }, + { + "epoch": 4.768759978712081, + "grad_norm": 0.38673126697540283, + "learning_rate": 1.0774758048841425e-09, + "loss": 0.1811, + "step": 17921 + }, + { + "epoch": 4.769026077700905, + "grad_norm": 0.2479974776506424, + "learning_rate": 1.0750047174733934e-09, + "loss": 0.1715, + "step": 17922 + }, + { + "epoch": 4.769292176689729, + "grad_norm": 0.2788880467414856, + "learning_rate": 1.0725364516045443e-09, + "loss": 0.1901, + "step": 17923 + }, + { + "epoch": 4.769558275678552, + "grad_norm": 0.41831734776496887, + "learning_rate": 1.0700710073479834e-09, + "loss": 0.2045, + "step": 17924 + }, + { + "epoch": 4.769824374667376, + "grad_norm": 0.35295382142066956, + "learning_rate": 1.0676083847740214e-09, + "loss": 0.1811, + "step": 17925 + }, + { + "epoch": 4.7700904736562, + "grad_norm": 0.27431729435920715, + "learning_rate": 1.065148583952924e-09, + "loss": 0.1787, + "step": 17926 + }, + { + "epoch": 4.770356572645024, + "grad_norm": 0.3791525363922119, + "learning_rate": 1.062691604954824e-09, + "loss": 0.1782, + "step": 17927 + }, + { + "epoch": 4.770622671633848, + "grad_norm": 0.39720115065574646, + "learning_rate": 1.060237447849821e-09, + "loss": 0.1799, + "step": 17928 + }, + { + "epoch": 4.770888770622672, + "grad_norm": 0.26998472213745117, + "learning_rate": 1.0577861127078923e-09, + "loss": 0.1637, + "step": 17929 + }, + { + "epoch": 4.771154869611496, + "grad_norm": 0.289828360080719, + "learning_rate": 1.05533759959896e-09, + "loss": 0.1747, + "step": 17930 + }, + { + "epoch": 4.771420968600319, + "grad_norm": 0.3700976073741913, + "learning_rate": 1.0528919085928679e-09, + "loss": 0.1822, + "step": 17931 + }, + { + "epoch": 4.771687067589143, + "grad_norm": 0.27768653631210327, + "learning_rate": 1.0504490397593712e-09, + "loss": 0.1823, + "step": 17932 + }, + { + "epoch": 4.771953166577967, + "grad_norm": 0.32754769921302795, + "learning_rate": 1.0480089931681479e-09, + "loss": 0.167, + "step": 17933 + }, + { + "epoch": 4.7722192655667905, + "grad_norm": 0.32236695289611816, + "learning_rate": 1.0455717688887866e-09, + "loss": 0.1862, + "step": 17934 + }, + { + "epoch": 4.772485364555615, + "grad_norm": 0.36016398668289185, + "learning_rate": 1.0431373669907983e-09, + "loss": 0.1941, + "step": 17935 + }, + { + "epoch": 4.772751463544439, + "grad_norm": 0.2762526273727417, + "learning_rate": 1.0407057875436275e-09, + "loss": 0.1637, + "step": 17936 + }, + { + "epoch": 4.773017562533262, + "grad_norm": 0.41714945435523987, + "learning_rate": 1.0382770306166078e-09, + "loss": 0.1911, + "step": 17937 + }, + { + "epoch": 4.773283661522086, + "grad_norm": 0.3492903411388397, + "learning_rate": 1.03585109627905e-09, + "loss": 0.1752, + "step": 17938 + }, + { + "epoch": 4.77354976051091, + "grad_norm": 0.2952713966369629, + "learning_rate": 1.0334279846001103e-09, + "loss": 0.1726, + "step": 17939 + }, + { + "epoch": 4.7738158594997335, + "grad_norm": 0.42240241169929504, + "learning_rate": 1.031007695648911e-09, + "loss": 0.2067, + "step": 17940 + }, + { + "epoch": 4.774081958488558, + "grad_norm": 0.2677404582500458, + "learning_rate": 1.028590229494486e-09, + "loss": 0.1729, + "step": 17941 + }, + { + "epoch": 4.774348057477382, + "grad_norm": 0.27627065777778625, + "learning_rate": 1.026175586205802e-09, + "loss": 0.1685, + "step": 17942 + }, + { + "epoch": 4.774614156466205, + "grad_norm": 0.3675800561904907, + "learning_rate": 1.023763765851704e-09, + "loss": 0.1676, + "step": 17943 + }, + { + "epoch": 4.774880255455029, + "grad_norm": 1.241326093673706, + "learning_rate": 1.0213547685009927e-09, + "loss": 0.149, + "step": 17944 + }, + { + "epoch": 4.775146354443853, + "grad_norm": 0.2957425117492676, + "learning_rate": 1.0189485942223796e-09, + "loss": 0.1817, + "step": 17945 + }, + { + "epoch": 4.775412453432677, + "grad_norm": 0.30163952708244324, + "learning_rate": 1.016545243084488e-09, + "loss": 0.1761, + "step": 17946 + }, + { + "epoch": 4.775678552421501, + "grad_norm": 0.3870004117488861, + "learning_rate": 1.0141447151558735e-09, + "loss": 0.1856, + "step": 17947 + }, + { + "epoch": 4.775944651410325, + "grad_norm": 0.35562676191329956, + "learning_rate": 1.0117470105050042e-09, + "loss": 0.1665, + "step": 17948 + }, + { + "epoch": 4.776210750399148, + "grad_norm": 0.36631789803504944, + "learning_rate": 1.0093521292002582e-09, + "loss": 0.1868, + "step": 17949 + }, + { + "epoch": 4.776476849387972, + "grad_norm": 0.29961204528808594, + "learning_rate": 1.006960071309948e-09, + "loss": 0.1802, + "step": 17950 + }, + { + "epoch": 4.776742948376796, + "grad_norm": 0.2676502466201782, + "learning_rate": 1.0045708369023077e-09, + "loss": 0.1626, + "step": 17951 + }, + { + "epoch": 4.77700904736562, + "grad_norm": 0.28212210536003113, + "learning_rate": 1.0021844260454605e-09, + "loss": 0.1859, + "step": 17952 + }, + { + "epoch": 4.777275146354444, + "grad_norm": 0.34504666924476624, + "learning_rate": 9.998008388074963e-10, + "loss": 0.1775, + "step": 17953 + }, + { + "epoch": 4.777541245343268, + "grad_norm": 0.3679888844490051, + "learning_rate": 9.974200752563833e-10, + "loss": 0.1829, + "step": 17954 + }, + { + "epoch": 4.777807344332092, + "grad_norm": 0.29696857929229736, + "learning_rate": 9.950421354600448e-10, + "loss": 0.1744, + "step": 17955 + }, + { + "epoch": 4.778073443320915, + "grad_norm": 0.25178107619285583, + "learning_rate": 9.92667019486293e-10, + "loss": 0.1421, + "step": 17956 + }, + { + "epoch": 4.778339542309739, + "grad_norm": 0.39056074619293213, + "learning_rate": 9.902947274028629e-10, + "loss": 0.1732, + "step": 17957 + }, + { + "epoch": 4.778605641298563, + "grad_norm": 0.34277722239494324, + "learning_rate": 9.879252592774223e-10, + "loss": 0.2007, + "step": 17958 + }, + { + "epoch": 4.778871740287387, + "grad_norm": 0.4378242790699005, + "learning_rate": 9.85558615177562e-10, + "loss": 0.1944, + "step": 17959 + }, + { + "epoch": 4.779137839276211, + "grad_norm": 0.2899444103240967, + "learning_rate": 9.83194795170772e-10, + "loss": 0.1895, + "step": 17960 + }, + { + "epoch": 4.779403938265035, + "grad_norm": 0.282821387052536, + "learning_rate": 9.808337993244763e-10, + "loss": 0.1607, + "step": 17961 + }, + { + "epoch": 4.779670037253858, + "grad_norm": 0.27145034074783325, + "learning_rate": 9.784756277060213e-10, + "loss": 0.1674, + "step": 17962 + }, + { + "epoch": 4.779936136242682, + "grad_norm": 0.25930291414260864, + "learning_rate": 9.76120280382664e-10, + "loss": 0.1598, + "step": 17963 + }, + { + "epoch": 4.780202235231506, + "grad_norm": 0.30908575654029846, + "learning_rate": 9.737677574215841e-10, + "loss": 0.1651, + "step": 17964 + }, + { + "epoch": 4.7804683342203305, + "grad_norm": 0.3389740288257599, + "learning_rate": 9.714180588898723e-10, + "loss": 0.1805, + "step": 17965 + }, + { + "epoch": 4.780734433209154, + "grad_norm": 0.4307357966899872, + "learning_rate": 9.690711848545641e-10, + "loss": 0.1974, + "step": 17966 + }, + { + "epoch": 4.781000532197978, + "grad_norm": 0.3361338973045349, + "learning_rate": 9.667271353825723e-10, + "loss": 0.1805, + "step": 17967 + }, + { + "epoch": 4.781266631186801, + "grad_norm": 0.256766140460968, + "learning_rate": 9.643859105407658e-10, + "loss": 0.1661, + "step": 17968 + }, + { + "epoch": 4.781532730175625, + "grad_norm": 0.38640159368515015, + "learning_rate": 9.620475103959359e-10, + "loss": 0.1883, + "step": 17969 + }, + { + "epoch": 4.781798829164449, + "grad_norm": 0.3705112338066101, + "learning_rate": 9.59711935014762e-10, + "loss": 0.1754, + "step": 17970 + }, + { + "epoch": 4.782064928153273, + "grad_norm": 0.3833259344100952, + "learning_rate": 9.57379184463858e-10, + "loss": 0.1863, + "step": 17971 + }, + { + "epoch": 4.782331027142097, + "grad_norm": 1.9345262050628662, + "learning_rate": 9.550492588097704e-10, + "loss": 0.176, + "step": 17972 + }, + { + "epoch": 4.782597126130921, + "grad_norm": 0.2785209119319916, + "learning_rate": 9.527221581189459e-10, + "loss": 0.1648, + "step": 17973 + }, + { + "epoch": 4.782863225119745, + "grad_norm": 0.36931905150413513, + "learning_rate": 9.503978824577652e-10, + "loss": 0.1907, + "step": 17974 + }, + { + "epoch": 4.783129324108568, + "grad_norm": 0.31815093755722046, + "learning_rate": 9.480764318925083e-10, + "loss": 0.1814, + "step": 17975 + }, + { + "epoch": 4.783395423097392, + "grad_norm": 0.3955938220024109, + "learning_rate": 9.45757806489389e-10, + "loss": 0.1734, + "step": 17976 + }, + { + "epoch": 4.783661522086216, + "grad_norm": 0.29465994238853455, + "learning_rate": 9.434420063145432e-10, + "loss": 0.162, + "step": 17977 + }, + { + "epoch": 4.78392762107504, + "grad_norm": 0.3393709361553192, + "learning_rate": 9.411290314340403e-10, + "loss": 0.1871, + "step": 17978 + }, + { + "epoch": 4.784193720063864, + "grad_norm": 0.27062734961509705, + "learning_rate": 9.388188819138276e-10, + "loss": 0.1687, + "step": 17979 + }, + { + "epoch": 4.784459819052688, + "grad_norm": 0.2578332722187042, + "learning_rate": 9.365115578197968e-10, + "loss": 0.1563, + "step": 17980 + }, + { + "epoch": 4.784725918041511, + "grad_norm": 0.30066823959350586, + "learning_rate": 9.34207059217762e-10, + "loss": 0.1734, + "step": 17981 + }, + { + "epoch": 4.784992017030335, + "grad_norm": 0.2608277201652527, + "learning_rate": 9.319053861734595e-10, + "loss": 0.1672, + "step": 17982 + }, + { + "epoch": 4.785258116019159, + "grad_norm": 0.30984681844711304, + "learning_rate": 9.296065387525254e-10, + "loss": 0.1721, + "step": 17983 + }, + { + "epoch": 4.785524215007983, + "grad_norm": 0.3696594536304474, + "learning_rate": 9.273105170205408e-10, + "loss": 0.1952, + "step": 17984 + }, + { + "epoch": 4.785790313996807, + "grad_norm": 0.2846361994743347, + "learning_rate": 9.250173210429868e-10, + "loss": 0.1591, + "step": 17985 + }, + { + "epoch": 4.786056412985631, + "grad_norm": 0.3768218159675598, + "learning_rate": 9.227269508852664e-10, + "loss": 0.1943, + "step": 17986 + }, + { + "epoch": 4.786322511974454, + "grad_norm": 0.2735759913921356, + "learning_rate": 9.204394066127053e-10, + "loss": 0.1677, + "step": 17987 + }, + { + "epoch": 4.786588610963278, + "grad_norm": 0.2782868444919586, + "learning_rate": 9.181546882905511e-10, + "loss": 0.1852, + "step": 17988 + }, + { + "epoch": 4.786854709952102, + "grad_norm": 0.28093385696411133, + "learning_rate": 9.15872795983974e-10, + "loss": 0.1564, + "step": 17989 + }, + { + "epoch": 4.7871208089409265, + "grad_norm": 0.2572181224822998, + "learning_rate": 9.135937297580331e-10, + "loss": 0.1564, + "step": 17990 + }, + { + "epoch": 4.78738690792975, + "grad_norm": 0.371281236410141, + "learning_rate": 9.113174896777764e-10, + "loss": 0.194, + "step": 17991 + }, + { + "epoch": 4.787653006918574, + "grad_norm": 0.4020276665687561, + "learning_rate": 9.090440758080964e-10, + "loss": 0.1881, + "step": 17992 + }, + { + "epoch": 4.787919105907397, + "grad_norm": 0.3449333906173706, + "learning_rate": 9.067734882138412e-10, + "loss": 0.1813, + "step": 17993 + }, + { + "epoch": 4.788185204896221, + "grad_norm": 0.46357759833335876, + "learning_rate": 9.045057269597701e-10, + "loss": 0.1841, + "step": 17994 + }, + { + "epoch": 4.788451303885045, + "grad_norm": 0.26005834341049194, + "learning_rate": 9.022407921105757e-10, + "loss": 0.1617, + "step": 17995 + }, + { + "epoch": 4.7887174028738695, + "grad_norm": 0.2865428328514099, + "learning_rate": 8.999786837308399e-10, + "loss": 0.191, + "step": 17996 + }, + { + "epoch": 4.788983501862693, + "grad_norm": 0.2764632999897003, + "learning_rate": 8.977194018850997e-10, + "loss": 0.1865, + "step": 17997 + }, + { + "epoch": 4.789249600851517, + "grad_norm": 0.281294047832489, + "learning_rate": 8.954629466377816e-10, + "loss": 0.1757, + "step": 17998 + }, + { + "epoch": 4.789515699840341, + "grad_norm": 0.31661680340766907, + "learning_rate": 8.932093180532563e-10, + "loss": 0.1773, + "step": 17999 + }, + { + "epoch": 4.789781798829164, + "grad_norm": 0.33413267135620117, + "learning_rate": 8.909585161957944e-10, + "loss": 0.1655, + "step": 18000 + }, + { + "epoch": 4.790047897817988, + "grad_norm": 0.28705185651779175, + "learning_rate": 8.887105411296003e-10, + "loss": 0.1519, + "step": 18001 + }, + { + "epoch": 4.7903139968068125, + "grad_norm": 0.3926074504852295, + "learning_rate": 8.86465392918767e-10, + "loss": 0.1945, + "step": 18002 + }, + { + "epoch": 4.790580095795636, + "grad_norm": 0.31096935272216797, + "learning_rate": 8.842230716273658e-10, + "loss": 0.1949, + "step": 18003 + }, + { + "epoch": 4.79084619478446, + "grad_norm": 0.26832929253578186, + "learning_rate": 8.819835773193229e-10, + "loss": 0.1683, + "step": 18004 + }, + { + "epoch": 4.791112293773284, + "grad_norm": 0.4481515884399414, + "learning_rate": 8.797469100585431e-10, + "loss": 0.1498, + "step": 18005 + }, + { + "epoch": 4.791378392762107, + "grad_norm": 0.33855095505714417, + "learning_rate": 8.775130699087863e-10, + "loss": 0.1785, + "step": 18006 + }, + { + "epoch": 4.791644491750931, + "grad_norm": 0.3245139420032501, + "learning_rate": 8.752820569337904e-10, + "loss": 0.1713, + "step": 18007 + }, + { + "epoch": 4.7919105907397554, + "grad_norm": 0.2619127929210663, + "learning_rate": 8.730538711971936e-10, + "loss": 0.1525, + "step": 18008 + }, + { + "epoch": 4.792176689728579, + "grad_norm": 0.3227079510688782, + "learning_rate": 8.708285127625115e-10, + "loss": 0.1797, + "step": 18009 + }, + { + "epoch": 4.792442788717403, + "grad_norm": 0.2907863259315491, + "learning_rate": 8.686059816932601e-10, + "loss": 0.1752, + "step": 18010 + }, + { + "epoch": 4.792708887706227, + "grad_norm": 0.40364375710487366, + "learning_rate": 8.663862780527997e-10, + "loss": 0.1962, + "step": 18011 + }, + { + "epoch": 4.79297498669505, + "grad_norm": 0.27521324157714844, + "learning_rate": 8.641694019044577e-10, + "loss": 0.1737, + "step": 18012 + }, + { + "epoch": 4.793241085683874, + "grad_norm": 0.31705766916275024, + "learning_rate": 8.619553533114498e-10, + "loss": 0.179, + "step": 18013 + }, + { + "epoch": 4.793507184672698, + "grad_norm": 0.3786308169364929, + "learning_rate": 8.59744132336948e-10, + "loss": 0.1648, + "step": 18014 + }, + { + "epoch": 4.7937732836615226, + "grad_norm": 0.27572616934776306, + "learning_rate": 8.575357390439908e-10, + "loss": 0.1585, + "step": 18015 + }, + { + "epoch": 4.794039382650346, + "grad_norm": 0.3598576486110687, + "learning_rate": 8.553301734955942e-10, + "loss": 0.1827, + "step": 18016 + }, + { + "epoch": 4.79430548163917, + "grad_norm": 0.3515278398990631, + "learning_rate": 8.531274357546525e-10, + "loss": 0.169, + "step": 18017 + }, + { + "epoch": 4.794571580627993, + "grad_norm": 0.2856527268886566, + "learning_rate": 8.509275258839821e-10, + "loss": 0.1684, + "step": 18018 + }, + { + "epoch": 4.794837679616817, + "grad_norm": 0.2998988926410675, + "learning_rate": 8.487304439463439e-10, + "loss": 0.1702, + "step": 18019 + }, + { + "epoch": 4.795103778605641, + "grad_norm": 0.3332345187664032, + "learning_rate": 8.465361900043988e-10, + "loss": 0.1785, + "step": 18020 + }, + { + "epoch": 4.7953698775944655, + "grad_norm": 0.3037926256656647, + "learning_rate": 8.443447641207302e-10, + "loss": 0.1707, + "step": 18021 + }, + { + "epoch": 4.795635976583289, + "grad_norm": 0.4124709665775299, + "learning_rate": 8.421561663578435e-10, + "loss": 0.1739, + "step": 18022 + }, + { + "epoch": 4.795902075572113, + "grad_norm": 0.40807220339775085, + "learning_rate": 8.399703967781558e-10, + "loss": 0.1753, + "step": 18023 + }, + { + "epoch": 4.796168174560937, + "grad_norm": 0.26048630475997925, + "learning_rate": 8.377874554440279e-10, + "loss": 0.1654, + "step": 18024 + }, + { + "epoch": 4.79643427354976, + "grad_norm": 0.37888485193252563, + "learning_rate": 8.35607342417699e-10, + "loss": 0.1861, + "step": 18025 + }, + { + "epoch": 4.796700372538584, + "grad_norm": 0.2862781584262848, + "learning_rate": 8.33430057761364e-10, + "loss": 0.1833, + "step": 18026 + }, + { + "epoch": 4.7969664715274085, + "grad_norm": 0.3548813462257385, + "learning_rate": 8.312556015371175e-10, + "loss": 0.1707, + "step": 18027 + }, + { + "epoch": 4.797232570516232, + "grad_norm": 0.3430313169956207, + "learning_rate": 8.290839738069766e-10, + "loss": 0.1992, + "step": 18028 + }, + { + "epoch": 4.797498669505056, + "grad_norm": 0.2993917167186737, + "learning_rate": 8.269151746329028e-10, + "loss": 0.1696, + "step": 18029 + }, + { + "epoch": 4.79776476849388, + "grad_norm": 0.2812010943889618, + "learning_rate": 8.247492040767245e-10, + "loss": 0.177, + "step": 18030 + }, + { + "epoch": 4.798030867482703, + "grad_norm": 0.24078454077243805, + "learning_rate": 8.225860622002367e-10, + "loss": 0.1565, + "step": 18031 + }, + { + "epoch": 4.798296966471527, + "grad_norm": 0.36108624935150146, + "learning_rate": 8.204257490651346e-10, + "loss": 0.1724, + "step": 18032 + }, + { + "epoch": 4.7985630654603515, + "grad_norm": 0.5713040828704834, + "learning_rate": 8.182682647330241e-10, + "loss": 0.1746, + "step": 18033 + }, + { + "epoch": 4.798829164449175, + "grad_norm": 0.2767219841480255, + "learning_rate": 8.161136092654563e-10, + "loss": 0.1435, + "step": 18034 + }, + { + "epoch": 4.799095263437999, + "grad_norm": 0.41606804728507996, + "learning_rate": 8.139617827238821e-10, + "loss": 0.1802, + "step": 18035 + }, + { + "epoch": 4.799361362426823, + "grad_norm": 0.2640732228755951, + "learning_rate": 8.118127851696632e-10, + "loss": 0.1804, + "step": 18036 + }, + { + "epoch": 4.799627461415646, + "grad_norm": 0.2991763949394226, + "learning_rate": 8.096666166641175e-10, + "loss": 0.1793, + "step": 18037 + }, + { + "epoch": 4.79989356040447, + "grad_norm": 0.2784406244754791, + "learning_rate": 8.075232772684293e-10, + "loss": 0.1887, + "step": 18038 + }, + { + "epoch": 4.8001596593932945, + "grad_norm": 0.2870814800262451, + "learning_rate": 8.053827670437607e-10, + "loss": 0.1813, + "step": 18039 + }, + { + "epoch": 4.800425758382119, + "grad_norm": 0.3020758330821991, + "learning_rate": 8.032450860511408e-10, + "loss": 0.1781, + "step": 18040 + }, + { + "epoch": 4.800691857370942, + "grad_norm": 0.29763495922088623, + "learning_rate": 8.01110234351543e-10, + "loss": 0.1834, + "step": 18041 + }, + { + "epoch": 4.800957956359766, + "grad_norm": 0.2843228578567505, + "learning_rate": 7.98978212005863e-10, + "loss": 0.1706, + "step": 18042 + }, + { + "epoch": 4.801224055348589, + "grad_norm": 0.39042794704437256, + "learning_rate": 7.968490190749188e-10, + "loss": 0.16, + "step": 18043 + }, + { + "epoch": 4.801490154337413, + "grad_norm": 0.29991674423217773, + "learning_rate": 7.947226556194287e-10, + "loss": 0.2017, + "step": 18044 + }, + { + "epoch": 4.8017562533262375, + "grad_norm": 0.2527492046356201, + "learning_rate": 7.925991217000439e-10, + "loss": 0.1523, + "step": 18045 + }, + { + "epoch": 4.802022352315062, + "grad_norm": 0.2922094166278839, + "learning_rate": 7.904784173773382e-10, + "loss": 0.1708, + "step": 18046 + }, + { + "epoch": 4.802288451303885, + "grad_norm": 0.3786606788635254, + "learning_rate": 7.883605427117857e-10, + "loss": 0.1729, + "step": 18047 + }, + { + "epoch": 4.802554550292709, + "grad_norm": 0.3897441327571869, + "learning_rate": 7.862454977637934e-10, + "loss": 0.1809, + "step": 18048 + }, + { + "epoch": 4.802820649281533, + "grad_norm": 0.400331050157547, + "learning_rate": 7.841332825936909e-10, + "loss": 0.1702, + "step": 18049 + }, + { + "epoch": 4.803086748270356, + "grad_norm": 0.34916141629219055, + "learning_rate": 7.820238972617299e-10, + "loss": 0.2073, + "step": 18050 + }, + { + "epoch": 4.80335284725918, + "grad_norm": 0.3572029173374176, + "learning_rate": 7.799173418280625e-10, + "loss": 0.1741, + "step": 18051 + }, + { + "epoch": 4.803618946248005, + "grad_norm": 0.27472323179244995, + "learning_rate": 7.778136163527848e-10, + "loss": 0.1856, + "step": 18052 + }, + { + "epoch": 4.803885045236828, + "grad_norm": 0.28928351402282715, + "learning_rate": 7.757127208958824e-10, + "loss": 0.1467, + "step": 18053 + }, + { + "epoch": 4.804151144225652, + "grad_norm": 0.28196993470191956, + "learning_rate": 7.736146555172963e-10, + "loss": 0.1788, + "step": 18054 + }, + { + "epoch": 4.804417243214476, + "grad_norm": 0.33022525906562805, + "learning_rate": 7.715194202768449e-10, + "loss": 0.1795, + "step": 18055 + }, + { + "epoch": 4.804683342203299, + "grad_norm": 0.25220492482185364, + "learning_rate": 7.69427015234314e-10, + "loss": 0.1631, + "step": 18056 + }, + { + "epoch": 4.804949441192123, + "grad_norm": 0.29037341475486755, + "learning_rate": 7.673374404493671e-10, + "loss": 0.1893, + "step": 18057 + }, + { + "epoch": 4.8052155401809475, + "grad_norm": 0.2719508111476898, + "learning_rate": 7.652506959816118e-10, + "loss": 0.158, + "step": 18058 + }, + { + "epoch": 4.805481639169771, + "grad_norm": 0.38308921456336975, + "learning_rate": 7.631667818905562e-10, + "loss": 0.1828, + "step": 18059 + }, + { + "epoch": 4.805747738158595, + "grad_norm": 0.2615853250026703, + "learning_rate": 7.610856982356417e-10, + "loss": 0.1666, + "step": 18060 + }, + { + "epoch": 4.806013837147419, + "grad_norm": 0.260436475276947, + "learning_rate": 7.590074450762207e-10, + "loss": 0.1598, + "step": 18061 + }, + { + "epoch": 4.806279936136242, + "grad_norm": 0.2589226961135864, + "learning_rate": 7.569320224715792e-10, + "loss": 0.1602, + "step": 18062 + }, + { + "epoch": 4.806546035125066, + "grad_norm": 0.2600536048412323, + "learning_rate": 7.548594304809142e-10, + "loss": 0.1668, + "step": 18063 + }, + { + "epoch": 4.8068121341138905, + "grad_norm": 0.3744671046733856, + "learning_rate": 7.527896691633229e-10, + "loss": 0.1905, + "step": 18064 + }, + { + "epoch": 4.807078233102715, + "grad_norm": 0.308859258890152, + "learning_rate": 7.50722738577858e-10, + "loss": 0.1764, + "step": 18065 + }, + { + "epoch": 4.807344332091538, + "grad_norm": 0.2756747603416443, + "learning_rate": 7.486586387834615e-10, + "loss": 0.1669, + "step": 18066 + }, + { + "epoch": 4.807610431080362, + "grad_norm": 0.27520349621772766, + "learning_rate": 7.465973698390193e-10, + "loss": 0.1724, + "step": 18067 + }, + { + "epoch": 4.807876530069185, + "grad_norm": 0.27755191922187805, + "learning_rate": 7.445389318033069e-10, + "loss": 0.1823, + "step": 18068 + }, + { + "epoch": 4.808142629058009, + "grad_norm": 0.3710086941719055, + "learning_rate": 7.424833247350326e-10, + "loss": 0.1789, + "step": 18069 + }, + { + "epoch": 4.8084087280468335, + "grad_norm": 0.30246445536613464, + "learning_rate": 7.404305486928386e-10, + "loss": 0.1684, + "step": 18070 + }, + { + "epoch": 4.808674827035658, + "grad_norm": 0.6287772059440613, + "learning_rate": 7.38380603735278e-10, + "loss": 0.1725, + "step": 18071 + }, + { + "epoch": 4.808940926024481, + "grad_norm": 0.46407413482666016, + "learning_rate": 7.36333489920804e-10, + "loss": 0.1621, + "step": 18072 + }, + { + "epoch": 4.809207025013305, + "grad_norm": 0.24830232560634613, + "learning_rate": 7.342892073078144e-10, + "loss": 0.1552, + "step": 18073 + }, + { + "epoch": 4.809473124002129, + "grad_norm": 0.3061382472515106, + "learning_rate": 7.322477559546181e-10, + "loss": 0.1756, + "step": 18074 + }, + { + "epoch": 4.809739222990952, + "grad_norm": 0.29925107955932617, + "learning_rate": 7.302091359194351e-10, + "loss": 0.1781, + "step": 18075 + }, + { + "epoch": 4.8100053219797765, + "grad_norm": 0.2813846170902252, + "learning_rate": 7.281733472604079e-10, + "loss": 0.173, + "step": 18076 + }, + { + "epoch": 4.810271420968601, + "grad_norm": 0.2764217257499695, + "learning_rate": 7.261403900356122e-10, + "loss": 0.1655, + "step": 18077 + }, + { + "epoch": 4.810537519957424, + "grad_norm": 0.27725750207901, + "learning_rate": 7.24110264303024e-10, + "loss": 0.1932, + "step": 18078 + }, + { + "epoch": 4.810803618946248, + "grad_norm": 0.26362788677215576, + "learning_rate": 7.220829701205522e-10, + "loss": 0.1581, + "step": 18079 + }, + { + "epoch": 4.811069717935072, + "grad_norm": 0.2829144597053528, + "learning_rate": 7.200585075460175e-10, + "loss": 0.1844, + "step": 18080 + }, + { + "epoch": 4.811335816923895, + "grad_norm": 0.29137471318244934, + "learning_rate": 7.180368766371514e-10, + "loss": 0.1718, + "step": 18081 + }, + { + "epoch": 4.8116019159127195, + "grad_norm": 0.3439844250679016, + "learning_rate": 7.160180774516411e-10, + "loss": 0.1788, + "step": 18082 + }, + { + "epoch": 4.811868014901544, + "grad_norm": 0.38809412717819214, + "learning_rate": 7.140021100470295e-10, + "loss": 0.1707, + "step": 18083 + }, + { + "epoch": 4.812134113890368, + "grad_norm": 0.31311649084091187, + "learning_rate": 7.119889744808482e-10, + "loss": 0.1702, + "step": 18084 + }, + { + "epoch": 4.812400212879191, + "grad_norm": 0.3443812429904938, + "learning_rate": 7.099786708105071e-10, + "loss": 0.1755, + "step": 18085 + }, + { + "epoch": 4.812666311868015, + "grad_norm": 0.2897148132324219, + "learning_rate": 7.079711990933379e-10, + "loss": 0.1861, + "step": 18086 + }, + { + "epoch": 4.812932410856838, + "grad_norm": 0.4273642599582672, + "learning_rate": 7.059665593865949e-10, + "loss": 0.2054, + "step": 18087 + }, + { + "epoch": 4.813198509845662, + "grad_norm": 0.29217734932899475, + "learning_rate": 7.039647517474767e-10, + "loss": 0.1794, + "step": 18088 + }, + { + "epoch": 4.813464608834487, + "grad_norm": 0.3192504644393921, + "learning_rate": 7.019657762330489e-10, + "loss": 0.1688, + "step": 18089 + }, + { + "epoch": 4.813730707823311, + "grad_norm": 0.2611043453216553, + "learning_rate": 6.999696329003435e-10, + "loss": 0.1623, + "step": 18090 + }, + { + "epoch": 4.813996806812134, + "grad_norm": 0.30351728200912476, + "learning_rate": 6.979763218062928e-10, + "loss": 0.1867, + "step": 18091 + }, + { + "epoch": 4.814262905800958, + "grad_norm": 0.3230254054069519, + "learning_rate": 6.959858430077514e-10, + "loss": 0.1883, + "step": 18092 + }, + { + "epoch": 4.814529004789782, + "grad_norm": 0.30692651867866516, + "learning_rate": 6.939981965614738e-10, + "loss": 0.1931, + "step": 18093 + }, + { + "epoch": 4.814795103778605, + "grad_norm": 0.28584128618240356, + "learning_rate": 6.920133825241925e-10, + "loss": 0.178, + "step": 18094 + }, + { + "epoch": 4.8150612027674295, + "grad_norm": 0.45335885882377625, + "learning_rate": 6.900314009524732e-10, + "loss": 0.1894, + "step": 18095 + }, + { + "epoch": 4.815327301756254, + "grad_norm": 0.26249560713768005, + "learning_rate": 6.880522519028709e-10, + "loss": 0.1683, + "step": 18096 + }, + { + "epoch": 4.815593400745077, + "grad_norm": 0.28124895691871643, + "learning_rate": 6.860759354318291e-10, + "loss": 0.1796, + "step": 18097 + }, + { + "epoch": 4.815859499733901, + "grad_norm": 0.3007746934890747, + "learning_rate": 6.84102451595725e-10, + "loss": 0.168, + "step": 18098 + }, + { + "epoch": 4.816125598722725, + "grad_norm": 0.27024808526039124, + "learning_rate": 6.821318004508248e-10, + "loss": 0.1815, + "step": 18099 + }, + { + "epoch": 4.816391697711548, + "grad_norm": 0.3563704192638397, + "learning_rate": 6.801639820533611e-10, + "loss": 0.1719, + "step": 18100 + }, + { + "epoch": 4.8166577967003725, + "grad_norm": 0.3197290897369385, + "learning_rate": 6.781989964594448e-10, + "loss": 0.1761, + "step": 18101 + }, + { + "epoch": 4.816923895689197, + "grad_norm": 0.29435887932777405, + "learning_rate": 6.762368437251198e-10, + "loss": 0.1816, + "step": 18102 + }, + { + "epoch": 4.81718999467802, + "grad_norm": 0.29335036873817444, + "learning_rate": 6.742775239063636e-10, + "loss": 0.1674, + "step": 18103 + }, + { + "epoch": 4.817456093666844, + "grad_norm": 0.32403630018234253, + "learning_rate": 6.723210370590537e-10, + "loss": 0.1681, + "step": 18104 + }, + { + "epoch": 4.817722192655668, + "grad_norm": 0.251022070646286, + "learning_rate": 6.7036738323899e-10, + "loss": 0.158, + "step": 18105 + }, + { + "epoch": 4.817988291644491, + "grad_norm": 0.3607088029384613, + "learning_rate": 6.684165625018834e-10, + "loss": 0.1819, + "step": 18106 + }, + { + "epoch": 4.8182543906333155, + "grad_norm": 0.36988139152526855, + "learning_rate": 6.664685749034004e-10, + "loss": 0.1575, + "step": 18107 + }, + { + "epoch": 4.81852048962214, + "grad_norm": 0.2935183346271515, + "learning_rate": 6.645234204990858e-10, + "loss": 0.167, + "step": 18108 + }, + { + "epoch": 4.818786588610964, + "grad_norm": 0.2933616638183594, + "learning_rate": 6.62581099344417e-10, + "loss": 0.1758, + "step": 18109 + }, + { + "epoch": 4.819052687599787, + "grad_norm": 0.2918505370616913, + "learning_rate": 6.606416114948055e-10, + "loss": 0.1725, + "step": 18110 + }, + { + "epoch": 4.819318786588611, + "grad_norm": 0.3684498369693756, + "learning_rate": 6.587049570055625e-10, + "loss": 0.172, + "step": 18111 + }, + { + "epoch": 4.819584885577434, + "grad_norm": 0.3827940821647644, + "learning_rate": 6.567711359319217e-10, + "loss": 0.1852, + "step": 18112 + }, + { + "epoch": 4.8198509845662585, + "grad_norm": 0.2901495397090912, + "learning_rate": 6.5484014832905e-10, + "loss": 0.1704, + "step": 18113 + }, + { + "epoch": 4.820117083555083, + "grad_norm": 0.2888476252555847, + "learning_rate": 6.529119942520034e-10, + "loss": 0.1669, + "step": 18114 + }, + { + "epoch": 4.820383182543907, + "grad_norm": 0.3452639579772949, + "learning_rate": 6.509866737557934e-10, + "loss": 0.1764, + "step": 18115 + }, + { + "epoch": 4.82064928153273, + "grad_norm": 0.3621864914894104, + "learning_rate": 6.490641868953428e-10, + "loss": 0.1811, + "step": 18116 + }, + { + "epoch": 4.820915380521554, + "grad_norm": 0.3702150881290436, + "learning_rate": 6.471445337254633e-10, + "loss": 0.1791, + "step": 18117 + }, + { + "epoch": 4.821181479510378, + "grad_norm": 0.3642030656337738, + "learning_rate": 6.452277143009222e-10, + "loss": 0.1746, + "step": 18118 + }, + { + "epoch": 4.8214475784992015, + "grad_norm": 0.3114771544933319, + "learning_rate": 6.433137286763757e-10, + "loss": 0.1672, + "step": 18119 + }, + { + "epoch": 4.821713677488026, + "grad_norm": 0.2764360308647156, + "learning_rate": 6.414025769064246e-10, + "loss": 0.1804, + "step": 18120 + }, + { + "epoch": 4.82197977647685, + "grad_norm": 0.28401046991348267, + "learning_rate": 6.39494259045581e-10, + "loss": 0.1797, + "step": 18121 + }, + { + "epoch": 4.822245875465673, + "grad_norm": 0.26049214601516724, + "learning_rate": 6.375887751482789e-10, + "loss": 0.1594, + "step": 18122 + }, + { + "epoch": 4.822511974454497, + "grad_norm": 0.34412235021591187, + "learning_rate": 6.356861252688417e-10, + "loss": 0.1824, + "step": 18123 + }, + { + "epoch": 4.822778073443321, + "grad_norm": 0.2904921770095825, + "learning_rate": 6.337863094615703e-10, + "loss": 0.1553, + "step": 18124 + }, + { + "epoch": 4.8230441724321444, + "grad_norm": 0.3651595711708069, + "learning_rate": 6.318893277806325e-10, + "loss": 0.2042, + "step": 18125 + }, + { + "epoch": 4.823310271420969, + "grad_norm": 0.40352678298950195, + "learning_rate": 6.299951802801407e-10, + "loss": 0.1807, + "step": 18126 + }, + { + "epoch": 4.823576370409793, + "grad_norm": 0.2764662206172943, + "learning_rate": 6.28103867014107e-10, + "loss": 0.1777, + "step": 18127 + }, + { + "epoch": 4.823842469398616, + "grad_norm": 0.29732292890548706, + "learning_rate": 6.262153880364773e-10, + "loss": 0.1871, + "step": 18128 + }, + { + "epoch": 4.82410856838744, + "grad_norm": 0.45409095287323, + "learning_rate": 6.243297434011308e-10, + "loss": 0.1774, + "step": 18129 + }, + { + "epoch": 4.824374667376264, + "grad_norm": 3.539318323135376, + "learning_rate": 6.224469331618465e-10, + "loss": 0.1807, + "step": 18130 + }, + { + "epoch": 4.824640766365087, + "grad_norm": 0.28063759207725525, + "learning_rate": 6.20566957372315e-10, + "loss": 0.1754, + "step": 18131 + }, + { + "epoch": 4.8249068653539116, + "grad_norm": 0.28054583072662354, + "learning_rate": 6.186898160861709e-10, + "loss": 0.1599, + "step": 18132 + }, + { + "epoch": 4.825172964342736, + "grad_norm": 0.29270848631858826, + "learning_rate": 6.168155093569271e-10, + "loss": 0.1959, + "step": 18133 + }, + { + "epoch": 4.82543906333156, + "grad_norm": 0.2535054087638855, + "learning_rate": 6.149440372380743e-10, + "loss": 0.1417, + "step": 18134 + }, + { + "epoch": 4.825705162320383, + "grad_norm": 0.3018147945404053, + "learning_rate": 6.130753997829696e-10, + "loss": 0.1615, + "step": 18135 + }, + { + "epoch": 4.825971261309207, + "grad_norm": 0.3652266561985016, + "learning_rate": 6.11209597044926e-10, + "loss": 0.2056, + "step": 18136 + }, + { + "epoch": 4.82623736029803, + "grad_norm": 0.3383673429489136, + "learning_rate": 6.093466290771565e-10, + "loss": 0.1607, + "step": 18137 + }, + { + "epoch": 4.8265034592868545, + "grad_norm": 0.4036216139793396, + "learning_rate": 6.074864959327852e-10, + "loss": 0.1725, + "step": 18138 + }, + { + "epoch": 4.826769558275679, + "grad_norm": 0.3430623412132263, + "learning_rate": 6.056291976648809e-10, + "loss": 0.1593, + "step": 18139 + }, + { + "epoch": 4.827035657264503, + "grad_norm": 0.32670527696609497, + "learning_rate": 6.037747343264011e-10, + "loss": 0.1759, + "step": 18140 + }, + { + "epoch": 4.827301756253326, + "grad_norm": 0.34950941801071167, + "learning_rate": 6.01923105970259e-10, + "loss": 0.1671, + "step": 18141 + }, + { + "epoch": 4.82756785524215, + "grad_norm": 0.3576698303222656, + "learning_rate": 6.000743126492458e-10, + "loss": 0.2002, + "step": 18142 + }, + { + "epoch": 4.827833954230974, + "grad_norm": 0.36139050126075745, + "learning_rate": 5.982283544161082e-10, + "loss": 0.1719, + "step": 18143 + }, + { + "epoch": 4.8281000532197975, + "grad_norm": 0.3792920708656311, + "learning_rate": 5.963852313234929e-10, + "loss": 0.1818, + "step": 18144 + }, + { + "epoch": 4.828366152208622, + "grad_norm": 0.3457352817058563, + "learning_rate": 5.945449434239691e-10, + "loss": 0.179, + "step": 18145 + }, + { + "epoch": 4.828632251197446, + "grad_norm": 0.27386870980262756, + "learning_rate": 5.92707490770017e-10, + "loss": 0.1804, + "step": 18146 + }, + { + "epoch": 4.828898350186269, + "grad_norm": 0.29378652572631836, + "learning_rate": 5.908728734140611e-10, + "loss": 0.1937, + "step": 18147 + }, + { + "epoch": 4.829164449175093, + "grad_norm": 0.28546756505966187, + "learning_rate": 5.890410914084265e-10, + "loss": 0.1625, + "step": 18148 + }, + { + "epoch": 4.829430548163917, + "grad_norm": 0.3863827884197235, + "learning_rate": 5.87212144805338e-10, + "loss": 0.1732, + "step": 18149 + }, + { + "epoch": 4.8296966471527405, + "grad_norm": 0.31872081756591797, + "learning_rate": 5.85386033656976e-10, + "loss": 0.1729, + "step": 18150 + }, + { + "epoch": 4.829962746141565, + "grad_norm": 0.3050846755504608, + "learning_rate": 5.835627580154435e-10, + "loss": 0.1646, + "step": 18151 + }, + { + "epoch": 4.830228845130389, + "grad_norm": 0.34707608819007874, + "learning_rate": 5.817423179327097e-10, + "loss": 0.1776, + "step": 18152 + }, + { + "epoch": 4.830494944119212, + "grad_norm": 0.30658799409866333, + "learning_rate": 5.79924713460711e-10, + "loss": 0.1801, + "step": 18153 + }, + { + "epoch": 4.830761043108036, + "grad_norm": 0.3729839622974396, + "learning_rate": 5.781099446512949e-10, + "loss": 0.196, + "step": 18154 + }, + { + "epoch": 4.83102714209686, + "grad_norm": 0.36572280526161194, + "learning_rate": 5.7629801155622e-10, + "loss": 0.1831, + "step": 18155 + }, + { + "epoch": 4.8312932410856835, + "grad_norm": 0.3503606915473938, + "learning_rate": 5.744889142271558e-10, + "loss": 0.1735, + "step": 18156 + }, + { + "epoch": 4.831559340074508, + "grad_norm": 0.3213267922401428, + "learning_rate": 5.726826527157169e-10, + "loss": 0.202, + "step": 18157 + }, + { + "epoch": 4.831825439063332, + "grad_norm": 0.5461715459823608, + "learning_rate": 5.708792270734175e-10, + "loss": 0.1878, + "step": 18158 + }, + { + "epoch": 4.832091538052156, + "grad_norm": 0.24477337300777435, + "learning_rate": 5.690786373516831e-10, + "loss": 0.1488, + "step": 18159 + }, + { + "epoch": 4.832357637040979, + "grad_norm": 0.3130444884300232, + "learning_rate": 5.672808836018949e-10, + "loss": 0.1759, + "step": 18160 + }, + { + "epoch": 4.832623736029803, + "grad_norm": 0.45510345697402954, + "learning_rate": 5.654859658753008e-10, + "loss": 0.1959, + "step": 18161 + }, + { + "epoch": 4.8328898350186265, + "grad_norm": 0.29054224491119385, + "learning_rate": 5.636938842231154e-10, + "loss": 0.1716, + "step": 18162 + }, + { + "epoch": 4.833155934007451, + "grad_norm": 0.2963685393333435, + "learning_rate": 5.61904638696431e-10, + "loss": 0.1724, + "step": 18163 + }, + { + "epoch": 4.833422032996275, + "grad_norm": 0.26626238226890564, + "learning_rate": 5.60118229346307e-10, + "loss": 0.1558, + "step": 18164 + }, + { + "epoch": 4.833688131985099, + "grad_norm": 0.30983972549438477, + "learning_rate": 5.583346562236691e-10, + "loss": 0.1807, + "step": 18165 + }, + { + "epoch": 4.833954230973922, + "grad_norm": 0.26758164167404175, + "learning_rate": 5.565539193794211e-10, + "loss": 0.1694, + "step": 18166 + }, + { + "epoch": 4.834220329962746, + "grad_norm": 0.2660999894142151, + "learning_rate": 5.547760188643225e-10, + "loss": 0.1758, + "step": 18167 + }, + { + "epoch": 4.83448642895157, + "grad_norm": 0.46392127871513367, + "learning_rate": 5.530009547290992e-10, + "loss": 0.1734, + "step": 18168 + }, + { + "epoch": 4.834752527940394, + "grad_norm": 0.31879180669784546, + "learning_rate": 5.512287270243665e-10, + "loss": 0.1754, + "step": 18169 + }, + { + "epoch": 4.835018626929218, + "grad_norm": 0.2910238802433014, + "learning_rate": 5.494593358006839e-10, + "loss": 0.1643, + "step": 18170 + }, + { + "epoch": 4.835284725918042, + "grad_norm": 0.2897898852825165, + "learning_rate": 5.476927811085108e-10, + "loss": 0.1531, + "step": 18171 + }, + { + "epoch": 4.835550824906865, + "grad_norm": 0.47645726799964905, + "learning_rate": 5.459290629982405e-10, + "loss": 0.1768, + "step": 18172 + }, + { + "epoch": 4.835816923895689, + "grad_norm": 0.28157684206962585, + "learning_rate": 5.441681815201771e-10, + "loss": 0.1778, + "step": 18173 + }, + { + "epoch": 4.836083022884513, + "grad_norm": 0.28602853417396545, + "learning_rate": 5.42410136724536e-10, + "loss": 0.1683, + "step": 18174 + }, + { + "epoch": 4.8363491218733365, + "grad_norm": 0.3796902298927307, + "learning_rate": 5.406549286614659e-10, + "loss": 0.1754, + "step": 18175 + }, + { + "epoch": 4.836615220862161, + "grad_norm": 0.8395867943763733, + "learning_rate": 5.389025573810269e-10, + "loss": 0.1916, + "step": 18176 + }, + { + "epoch": 4.836881319850985, + "grad_norm": 0.3798242211341858, + "learning_rate": 5.371530229332122e-10, + "loss": 0.1877, + "step": 18177 + }, + { + "epoch": 4.837147418839808, + "grad_norm": 0.4071282744407654, + "learning_rate": 5.354063253679042e-10, + "loss": 0.1736, + "step": 18178 + }, + { + "epoch": 4.837413517828632, + "grad_norm": 0.3952648341655731, + "learning_rate": 5.336624647349186e-10, + "loss": 0.1826, + "step": 18179 + }, + { + "epoch": 4.837679616817456, + "grad_norm": 0.3561742603778839, + "learning_rate": 5.319214410840156e-10, + "loss": 0.1851, + "step": 18180 + }, + { + "epoch": 4.8379457158062795, + "grad_norm": 0.35495346784591675, + "learning_rate": 5.301832544648444e-10, + "loss": 0.1664, + "step": 18181 + }, + { + "epoch": 4.838211814795104, + "grad_norm": 0.37261995673179626, + "learning_rate": 5.284479049269763e-10, + "loss": 0.1788, + "step": 18182 + }, + { + "epoch": 4.838477913783928, + "grad_norm": 0.2750149965286255, + "learning_rate": 5.267153925199164e-10, + "loss": 0.1772, + "step": 18183 + }, + { + "epoch": 4.838744012772752, + "grad_norm": 0.3015384376049042, + "learning_rate": 5.249857172930583e-10, + "loss": 0.1848, + "step": 18184 + }, + { + "epoch": 4.839010111761575, + "grad_norm": 0.26365214586257935, + "learning_rate": 5.232588792957626e-10, + "loss": 0.1686, + "step": 18185 + }, + { + "epoch": 4.839276210750399, + "grad_norm": 0.268608957529068, + "learning_rate": 5.215348785772567e-10, + "loss": 0.1569, + "step": 18186 + }, + { + "epoch": 4.8395423097392225, + "grad_norm": 0.2940000295639038, + "learning_rate": 5.198137151867344e-10, + "loss": 0.1789, + "step": 18187 + }, + { + "epoch": 4.839808408728047, + "grad_norm": 0.2580329179763794, + "learning_rate": 5.180953891732786e-10, + "loss": 0.1689, + "step": 18188 + }, + { + "epoch": 4.840074507716871, + "grad_norm": 0.29248011112213135, + "learning_rate": 5.16379900585906e-10, + "loss": 0.1685, + "step": 18189 + }, + { + "epoch": 4.840340606705695, + "grad_norm": 0.338885098695755, + "learning_rate": 5.146672494735327e-10, + "loss": 0.1738, + "step": 18190 + }, + { + "epoch": 4.840606705694518, + "grad_norm": 0.38430190086364746, + "learning_rate": 5.129574358850086e-10, + "loss": 0.17, + "step": 18191 + }, + { + "epoch": 4.840872804683342, + "grad_norm": 0.3398773670196533, + "learning_rate": 5.112504598691169e-10, + "loss": 0.1846, + "step": 18192 + }, + { + "epoch": 4.841138903672166, + "grad_norm": 0.2793777883052826, + "learning_rate": 5.095463214745188e-10, + "loss": 0.1807, + "step": 18193 + }, + { + "epoch": 4.84140500266099, + "grad_norm": 0.3680970370769501, + "learning_rate": 5.078450207498419e-10, + "loss": 0.1862, + "step": 18194 + }, + { + "epoch": 4.841671101649814, + "grad_norm": 0.28958743810653687, + "learning_rate": 5.06146557743603e-10, + "loss": 0.1884, + "step": 18195 + }, + { + "epoch": 4.841937200638638, + "grad_norm": 0.26446714997291565, + "learning_rate": 5.044509325042412e-10, + "loss": 0.1585, + "step": 18196 + }, + { + "epoch": 4.842203299627461, + "grad_norm": 0.27950385212898254, + "learning_rate": 5.027581450801288e-10, + "loss": 0.1646, + "step": 18197 + }, + { + "epoch": 4.842469398616285, + "grad_norm": 0.380510151386261, + "learning_rate": 5.010681955195384e-10, + "loss": 0.1922, + "step": 18198 + }, + { + "epoch": 4.842735497605109, + "grad_norm": 0.2720518112182617, + "learning_rate": 4.993810838706758e-10, + "loss": 0.1625, + "step": 18199 + }, + { + "epoch": 4.843001596593933, + "grad_norm": 0.29007983207702637, + "learning_rate": 4.976968101816581e-10, + "loss": 0.1792, + "step": 18200 + }, + { + "epoch": 4.843267695582757, + "grad_norm": 0.3151227533817291, + "learning_rate": 4.960153745005247e-10, + "loss": 0.1806, + "step": 18201 + }, + { + "epoch": 4.843533794571581, + "grad_norm": 0.29012331366539, + "learning_rate": 4.943367768752482e-10, + "loss": 0.1917, + "step": 18202 + }, + { + "epoch": 4.843799893560405, + "grad_norm": 0.29355326294898987, + "learning_rate": 4.926610173536794e-10, + "loss": 0.1858, + "step": 18203 + }, + { + "epoch": 4.844065992549228, + "grad_norm": 0.4514481723308563, + "learning_rate": 4.909880959836243e-10, + "loss": 0.1728, + "step": 18204 + }, + { + "epoch": 4.844332091538052, + "grad_norm": 0.32227855920791626, + "learning_rate": 4.893180128128116e-10, + "loss": 0.1643, + "step": 18205 + }, + { + "epoch": 4.844598190526876, + "grad_norm": 0.3999951481819153, + "learning_rate": 4.876507678888587e-10, + "loss": 0.1944, + "step": 18206 + }, + { + "epoch": 4.8448642895157, + "grad_norm": 0.2978564500808716, + "learning_rate": 4.859863612593162e-10, + "loss": 0.1708, + "step": 18207 + }, + { + "epoch": 4.845130388504524, + "grad_norm": 0.37594032287597656, + "learning_rate": 4.843247929716688e-10, + "loss": 0.1882, + "step": 18208 + }, + { + "epoch": 4.845396487493348, + "grad_norm": 0.47270694375038147, + "learning_rate": 4.826660630733115e-10, + "loss": 0.1812, + "step": 18209 + }, + { + "epoch": 4.845662586482171, + "grad_norm": 0.42149239778518677, + "learning_rate": 4.8101017161154e-10, + "loss": 0.1859, + "step": 18210 + }, + { + "epoch": 4.845928685470995, + "grad_norm": 0.31853097677230835, + "learning_rate": 4.793571186336053e-10, + "loss": 0.1753, + "step": 18211 + }, + { + "epoch": 4.846194784459819, + "grad_norm": 0.27779442071914673, + "learning_rate": 4.777069041866255e-10, + "loss": 0.1805, + "step": 18212 + }, + { + "epoch": 4.846460883448643, + "grad_norm": 0.261227011680603, + "learning_rate": 4.760595283176849e-10, + "loss": 0.1654, + "step": 18213 + }, + { + "epoch": 4.846726982437467, + "grad_norm": 0.3102070391178131, + "learning_rate": 4.744149910737794e-10, + "loss": 0.1757, + "step": 18214 + }, + { + "epoch": 4.846993081426291, + "grad_norm": 0.3374248147010803, + "learning_rate": 4.727732925017936e-10, + "loss": 0.1664, + "step": 18215 + }, + { + "epoch": 4.847259180415114, + "grad_norm": 0.2822561264038086, + "learning_rate": 4.711344326485567e-10, + "loss": 0.1729, + "step": 18216 + }, + { + "epoch": 4.847525279403938, + "grad_norm": 0.29700592160224915, + "learning_rate": 4.694984115608313e-10, + "loss": 0.1559, + "step": 18217 + }, + { + "epoch": 4.847791378392762, + "grad_norm": 0.2716532051563263, + "learning_rate": 4.678652292852691e-10, + "loss": 0.1756, + "step": 18218 + }, + { + "epoch": 4.848057477381586, + "grad_norm": 0.29896804690361023, + "learning_rate": 4.66234885868444e-10, + "loss": 0.1747, + "step": 18219 + }, + { + "epoch": 4.84832357637041, + "grad_norm": 0.3710137903690338, + "learning_rate": 4.6460738135685184e-10, + "loss": 0.1946, + "step": 18220 + }, + { + "epoch": 4.848589675359234, + "grad_norm": 0.30336177349090576, + "learning_rate": 4.629827157969335e-10, + "loss": 0.1883, + "step": 18221 + }, + { + "epoch": 4.848855774348057, + "grad_norm": 0.3894135653972626, + "learning_rate": 4.613608892350185e-10, + "loss": 0.2004, + "step": 18222 + }, + { + "epoch": 4.849121873336881, + "grad_norm": 0.33576500415802, + "learning_rate": 4.597419017173587e-10, + "loss": 0.1781, + "step": 18223 + }, + { + "epoch": 4.849387972325705, + "grad_norm": 0.2623612880706787, + "learning_rate": 4.5812575329013946e-10, + "loss": 0.1609, + "step": 18224 + }, + { + "epoch": 4.849654071314529, + "grad_norm": 0.35305142402648926, + "learning_rate": 4.565124439994461e-10, + "loss": 0.1974, + "step": 18225 + }, + { + "epoch": 4.849920170303353, + "grad_norm": 0.29641297459602356, + "learning_rate": 4.549019738913085e-10, + "loss": 0.1616, + "step": 18226 + }, + { + "epoch": 4.850186269292177, + "grad_norm": 0.2554256021976471, + "learning_rate": 4.532943430116454e-10, + "loss": 0.1712, + "step": 18227 + }, + { + "epoch": 4.850452368281001, + "grad_norm": 0.2814469039440155, + "learning_rate": 4.5168955140632014e-10, + "loss": 0.166, + "step": 18228 + }, + { + "epoch": 4.850718467269824, + "grad_norm": 0.34057822823524475, + "learning_rate": 4.5008759912109615e-10, + "loss": 0.1894, + "step": 18229 + }, + { + "epoch": 4.850984566258648, + "grad_norm": 0.2777947783470154, + "learning_rate": 4.4848848620167024e-10, + "loss": 0.1714, + "step": 18230 + }, + { + "epoch": 4.851250665247472, + "grad_norm": 0.33479854464530945, + "learning_rate": 4.468922126936503e-10, + "loss": 0.1826, + "step": 18231 + }, + { + "epoch": 4.851516764236296, + "grad_norm": 0.28151997923851013, + "learning_rate": 4.452987786425777e-10, + "loss": 0.1572, + "step": 18232 + }, + { + "epoch": 4.85178286322512, + "grad_norm": 0.4098174273967743, + "learning_rate": 4.4370818409387166e-10, + "loss": 0.1964, + "step": 18233 + }, + { + "epoch": 4.852048962213944, + "grad_norm": 0.4242161810398102, + "learning_rate": 4.421204290929292e-10, + "loss": 0.1924, + "step": 18234 + }, + { + "epoch": 4.852315061202767, + "grad_norm": 0.3353584408760071, + "learning_rate": 4.4053551368501417e-10, + "loss": 0.1782, + "step": 18235 + }, + { + "epoch": 4.852581160191591, + "grad_norm": 0.2675326466560364, + "learning_rate": 4.3895343791534587e-10, + "loss": 0.1662, + "step": 18236 + }, + { + "epoch": 4.8528472591804155, + "grad_norm": 0.2847924828529358, + "learning_rate": 4.373742018290327e-10, + "loss": 0.1618, + "step": 18237 + }, + { + "epoch": 4.853113358169239, + "grad_norm": 0.29408150911331177, + "learning_rate": 4.3579780547114975e-10, + "loss": 0.1567, + "step": 18238 + }, + { + "epoch": 4.853379457158063, + "grad_norm": 0.3848411440849304, + "learning_rate": 4.342242488866166e-10, + "loss": 0.1824, + "step": 18239 + }, + { + "epoch": 4.853645556146887, + "grad_norm": 0.3599056303501129, + "learning_rate": 4.326535321203528e-10, + "loss": 0.1658, + "step": 18240 + }, + { + "epoch": 4.85391165513571, + "grad_norm": 0.3678988814353943, + "learning_rate": 4.310856552171227e-10, + "loss": 0.1903, + "step": 18241 + }, + { + "epoch": 4.854177754124534, + "grad_norm": 0.36562007665634155, + "learning_rate": 4.295206182216793e-10, + "loss": 0.1774, + "step": 18242 + }, + { + "epoch": 4.8544438531133585, + "grad_norm": 0.29460805654525757, + "learning_rate": 4.2795842117863133e-10, + "loss": 0.1937, + "step": 18243 + }, + { + "epoch": 4.854709952102182, + "grad_norm": 0.3268750011920929, + "learning_rate": 4.263990641325543e-10, + "loss": 0.1829, + "step": 18244 + }, + { + "epoch": 4.854976051091006, + "grad_norm": 0.28630948066711426, + "learning_rate": 4.248425471279238e-10, + "loss": 0.1705, + "step": 18245 + }, + { + "epoch": 4.85524215007983, + "grad_norm": 0.35829752683639526, + "learning_rate": 4.232888702091264e-10, + "loss": 0.1709, + "step": 18246 + }, + { + "epoch": 4.855508249068653, + "grad_norm": 0.33073270320892334, + "learning_rate": 4.217380334204823e-10, + "loss": 0.1741, + "step": 18247 + }, + { + "epoch": 4.855774348057477, + "grad_norm": 0.3135761320590973, + "learning_rate": 4.2019003680622277e-10, + "loss": 0.1729, + "step": 18248 + }, + { + "epoch": 4.8560404470463014, + "grad_norm": 0.3265696167945862, + "learning_rate": 4.186448804105014e-10, + "loss": 0.1753, + "step": 18249 + }, + { + "epoch": 4.856306546035125, + "grad_norm": 0.49089333415031433, + "learning_rate": 4.171025642773718e-10, + "loss": 0.1815, + "step": 18250 + }, + { + "epoch": 4.856572645023949, + "grad_norm": 0.3937915563583374, + "learning_rate": 4.155630884508543e-10, + "loss": 0.1704, + "step": 18251 + }, + { + "epoch": 4.856838744012773, + "grad_norm": 0.27395910024642944, + "learning_rate": 4.140264529748361e-10, + "loss": 0.172, + "step": 18252 + }, + { + "epoch": 4.857104843001597, + "grad_norm": 0.30012163519859314, + "learning_rate": 4.124926578931598e-10, + "loss": 0.1712, + "step": 18253 + }, + { + "epoch": 4.85737094199042, + "grad_norm": 0.2984602451324463, + "learning_rate": 4.1096170324955714e-10, + "loss": 0.1962, + "step": 18254 + }, + { + "epoch": 4.857637040979244, + "grad_norm": 0.3478955030441284, + "learning_rate": 4.0943358908770433e-10, + "loss": 0.1699, + "step": 18255 + }, + { + "epoch": 4.857903139968068, + "grad_norm": 0.34119224548339844, + "learning_rate": 4.0790831545117753e-10, + "loss": 0.2031, + "step": 18256 + }, + { + "epoch": 4.858169238956892, + "grad_norm": 0.2925213873386383, + "learning_rate": 4.063858823834976e-10, + "loss": 0.1891, + "step": 18257 + }, + { + "epoch": 4.858435337945716, + "grad_norm": 0.2723603844642639, + "learning_rate": 4.0486628992806303e-10, + "loss": 0.1653, + "step": 18258 + }, + { + "epoch": 4.85870143693454, + "grad_norm": 0.33663210272789, + "learning_rate": 4.0334953812823924e-10, + "loss": 0.1874, + "step": 18259 + }, + { + "epoch": 4.858967535923363, + "grad_norm": 0.27479127049446106, + "learning_rate": 4.0183562702726937e-10, + "loss": 0.1567, + "step": 18260 + }, + { + "epoch": 4.859233634912187, + "grad_norm": 0.4073953330516815, + "learning_rate": 4.003245566683522e-10, + "loss": 0.1793, + "step": 18261 + }, + { + "epoch": 4.8594997339010115, + "grad_norm": 0.33524656295776367, + "learning_rate": 3.9881632709456435e-10, + "loss": 0.182, + "step": 18262 + }, + { + "epoch": 4.859765832889835, + "grad_norm": 0.41235601902008057, + "learning_rate": 3.97310938348927e-10, + "loss": 0.1931, + "step": 18263 + }, + { + "epoch": 4.860031931878659, + "grad_norm": 0.24593903124332428, + "learning_rate": 3.9580839047440583e-10, + "loss": 0.1608, + "step": 18264 + }, + { + "epoch": 4.860298030867483, + "grad_norm": 0.25542935729026794, + "learning_rate": 3.9430868351382206e-10, + "loss": 0.1565, + "step": 18265 + }, + { + "epoch": 4.860564129856306, + "grad_norm": 0.2904559373855591, + "learning_rate": 3.9281181750995263e-10, + "loss": 0.16, + "step": 18266 + }, + { + "epoch": 4.86083022884513, + "grad_norm": 0.5259557962417603, + "learning_rate": 3.913177925055189e-10, + "loss": 0.1619, + "step": 18267 + }, + { + "epoch": 4.8610963278339545, + "grad_norm": 0.2713858187198639, + "learning_rate": 3.89826608543109e-10, + "loss": 0.1742, + "step": 18268 + }, + { + "epoch": 4.861362426822778, + "grad_norm": 0.2796783447265625, + "learning_rate": 3.8833826566526673e-10, + "loss": 0.173, + "step": 18269 + }, + { + "epoch": 4.861628525811602, + "grad_norm": 0.3737056255340576, + "learning_rate": 3.868527639144359e-10, + "loss": 0.1999, + "step": 18270 + }, + { + "epoch": 4.861894624800426, + "grad_norm": 0.2915538549423218, + "learning_rate": 3.853701033329826e-10, + "loss": 0.172, + "step": 18271 + }, + { + "epoch": 4.862160723789249, + "grad_norm": 0.27116110920906067, + "learning_rate": 3.838902839631952e-10, + "loss": 0.1637, + "step": 18272 + }, + { + "epoch": 4.862426822778073, + "grad_norm": 0.2837781012058258, + "learning_rate": 3.824133058472956e-10, + "loss": 0.1913, + "step": 18273 + }, + { + "epoch": 4.8626929217668975, + "grad_norm": 0.2758031487464905, + "learning_rate": 3.8093916902740554e-10, + "loss": 0.1604, + "step": 18274 + }, + { + "epoch": 4.862959020755721, + "grad_norm": 0.2832943797111511, + "learning_rate": 3.7946787354555807e-10, + "loss": 0.1689, + "step": 18275 + }, + { + "epoch": 4.863225119744545, + "grad_norm": 0.33452358841896057, + "learning_rate": 3.779994194437308e-10, + "loss": 0.1839, + "step": 18276 + }, + { + "epoch": 4.863491218733369, + "grad_norm": 0.2932380437850952, + "learning_rate": 3.765338067638013e-10, + "loss": 0.1656, + "step": 18277 + }, + { + "epoch": 4.863757317722193, + "grad_norm": 0.2837543189525604, + "learning_rate": 3.7507103554756946e-10, + "loss": 0.1598, + "step": 18278 + }, + { + "epoch": 4.864023416711016, + "grad_norm": 0.27237144112586975, + "learning_rate": 3.736111058367575e-10, + "loss": 0.1759, + "step": 18279 + }, + { + "epoch": 4.8642895156998405, + "grad_norm": 0.2948760688304901, + "learning_rate": 3.721540176730098e-10, + "loss": 0.1668, + "step": 18280 + }, + { + "epoch": 4.864555614688664, + "grad_norm": 0.3400571644306183, + "learning_rate": 3.7069977109788207e-10, + "loss": 0.1858, + "step": 18281 + }, + { + "epoch": 4.864821713677488, + "grad_norm": 0.37162965536117554, + "learning_rate": 3.6924836615285227e-10, + "loss": 0.1809, + "step": 18282 + }, + { + "epoch": 4.865087812666312, + "grad_norm": 0.3198734223842621, + "learning_rate": 3.6779980287932053e-10, + "loss": 0.1761, + "step": 18283 + }, + { + "epoch": 4.865353911655136, + "grad_norm": 0.2649148106575012, + "learning_rate": 3.6635408131859834e-10, + "loss": 0.1776, + "step": 18284 + }, + { + "epoch": 4.865620010643959, + "grad_norm": 0.27444154024124146, + "learning_rate": 3.6491120151193045e-10, + "loss": 0.1932, + "step": 18285 + }, + { + "epoch": 4.8658861096327835, + "grad_norm": 0.46424201130867004, + "learning_rate": 3.634711635004617e-10, + "loss": 0.1905, + "step": 18286 + }, + { + "epoch": 4.866152208621608, + "grad_norm": 0.2792642116546631, + "learning_rate": 3.6203396732525927e-10, + "loss": 0.1735, + "step": 18287 + }, + { + "epoch": 4.866418307610431, + "grad_norm": 0.2725597321987152, + "learning_rate": 3.6059961302732367e-10, + "loss": 0.1761, + "step": 18288 + }, + { + "epoch": 4.866684406599255, + "grad_norm": 0.2785884439945221, + "learning_rate": 3.5916810064756664e-10, + "loss": 0.1706, + "step": 18289 + }, + { + "epoch": 4.866950505588079, + "grad_norm": 0.28378570079803467, + "learning_rate": 3.5773943022681105e-10, + "loss": 0.1803, + "step": 18290 + }, + { + "epoch": 4.867216604576902, + "grad_norm": 0.28638628125190735, + "learning_rate": 3.563136018058133e-10, + "loss": 0.1734, + "step": 18291 + }, + { + "epoch": 4.867482703565726, + "grad_norm": 0.3199619650840759, + "learning_rate": 3.548906154252407e-10, + "loss": 0.1967, + "step": 18292 + }, + { + "epoch": 4.867748802554551, + "grad_norm": 0.38343361020088196, + "learning_rate": 3.53470471125672e-10, + "loss": 0.1802, + "step": 18293 + }, + { + "epoch": 4.868014901543374, + "grad_norm": 0.3347615897655487, + "learning_rate": 3.5205316894761915e-10, + "loss": 0.1746, + "step": 18294 + }, + { + "epoch": 4.868281000532198, + "grad_norm": 0.3536832630634308, + "learning_rate": 3.506387089315055e-10, + "loss": 0.1627, + "step": 18295 + }, + { + "epoch": 4.868547099521022, + "grad_norm": 0.28120753169059753, + "learning_rate": 3.4922709111766536e-10, + "loss": 0.1844, + "step": 18296 + }, + { + "epoch": 4.868813198509845, + "grad_norm": 0.2791498601436615, + "learning_rate": 3.4781831554637765e-10, + "loss": 0.1672, + "step": 18297 + }, + { + "epoch": 4.869079297498669, + "grad_norm": 0.32586348056793213, + "learning_rate": 3.464123822578102e-10, + "loss": 0.1957, + "step": 18298 + }, + { + "epoch": 4.8693453964874935, + "grad_norm": 0.29450488090515137, + "learning_rate": 3.4500929129206436e-10, + "loss": 0.182, + "step": 18299 + }, + { + "epoch": 4.869611495476317, + "grad_norm": 0.28869980573654175, + "learning_rate": 3.436090426891636e-10, + "loss": 0.1662, + "step": 18300 + }, + { + "epoch": 4.869877594465141, + "grad_norm": 0.2785114049911499, + "learning_rate": 3.422116364890537e-10, + "loss": 0.1699, + "step": 18301 + }, + { + "epoch": 4.870143693453965, + "grad_norm": 0.3917093276977539, + "learning_rate": 3.408170727315696e-10, + "loss": 0.1953, + "step": 18302 + }, + { + "epoch": 4.870409792442789, + "grad_norm": 0.36233851313591003, + "learning_rate": 3.3942535145650154e-10, + "loss": 0.1645, + "step": 18303 + }, + { + "epoch": 4.870675891431612, + "grad_norm": 0.27952614426612854, + "learning_rate": 3.3803647270355116e-10, + "loss": 0.1719, + "step": 18304 + }, + { + "epoch": 4.8709419904204365, + "grad_norm": 0.4190031886100769, + "learning_rate": 3.366504365123091e-10, + "loss": 0.1859, + "step": 18305 + }, + { + "epoch": 4.87120808940926, + "grad_norm": 0.3468034565448761, + "learning_rate": 3.352672429223435e-10, + "loss": 0.1665, + "step": 18306 + }, + { + "epoch": 4.871474188398084, + "grad_norm": 0.418006032705307, + "learning_rate": 3.3388689197306753e-10, + "loss": 0.1876, + "step": 18307 + }, + { + "epoch": 4.871740287386908, + "grad_norm": 0.3299577832221985, + "learning_rate": 3.325093837038717e-10, + "loss": 0.186, + "step": 18308 + }, + { + "epoch": 4.872006386375732, + "grad_norm": 0.2689879536628723, + "learning_rate": 3.311347181540469e-10, + "loss": 0.1537, + "step": 18309 + }, + { + "epoch": 4.872272485364555, + "grad_norm": 0.25123557448387146, + "learning_rate": 3.297628953627951e-10, + "loss": 0.1446, + "step": 18310 + }, + { + "epoch": 4.8725385843533795, + "grad_norm": 0.25989383459091187, + "learning_rate": 3.2839391536924056e-10, + "loss": 0.178, + "step": 18311 + }, + { + "epoch": 4.872804683342204, + "grad_norm": 0.348666250705719, + "learning_rate": 3.2702777821244087e-10, + "loss": 0.1746, + "step": 18312 + }, + { + "epoch": 4.873070782331027, + "grad_norm": 0.28080734610557556, + "learning_rate": 3.2566448393134274e-10, + "loss": 0.1596, + "step": 18313 + }, + { + "epoch": 4.873336881319851, + "grad_norm": 0.29112616181373596, + "learning_rate": 3.2430403256485937e-10, + "loss": 0.1774, + "step": 18314 + }, + { + "epoch": 4.873602980308675, + "grad_norm": 0.35819995403289795, + "learning_rate": 3.229464241517599e-10, + "loss": 0.1878, + "step": 18315 + }, + { + "epoch": 4.873869079297498, + "grad_norm": 0.2858063876628876, + "learning_rate": 3.2159165873079095e-10, + "loss": 0.168, + "step": 18316 + }, + { + "epoch": 4.8741351782863225, + "grad_norm": 0.3319970369338989, + "learning_rate": 3.202397363405773e-10, + "loss": 0.1726, + "step": 18317 + }, + { + "epoch": 4.874401277275147, + "grad_norm": 0.337243914604187, + "learning_rate": 3.1889065701968807e-10, + "loss": 0.1908, + "step": 18318 + }, + { + "epoch": 4.87466737626397, + "grad_norm": 0.49201497435569763, + "learning_rate": 3.175444208065925e-10, + "loss": 0.1748, + "step": 18319 + }, + { + "epoch": 4.874933475252794, + "grad_norm": 0.29142114520072937, + "learning_rate": 3.1620102773970426e-10, + "loss": 0.1704, + "step": 18320 + }, + { + "epoch": 4.875199574241618, + "grad_norm": 0.31393274664878845, + "learning_rate": 3.1486047785732604e-10, + "loss": 0.1558, + "step": 18321 + }, + { + "epoch": 4.875465673230442, + "grad_norm": 0.269946813583374, + "learning_rate": 3.135227711977051e-10, + "loss": 0.1704, + "step": 18322 + }, + { + "epoch": 4.8757317722192655, + "grad_norm": 0.38719096779823303, + "learning_rate": 3.121879077989775e-10, + "loss": 0.2054, + "step": 18323 + }, + { + "epoch": 4.87599787120809, + "grad_norm": 0.28126269578933716, + "learning_rate": 3.10855887699224e-10, + "loss": 0.1771, + "step": 18324 + }, + { + "epoch": 4.876263970196913, + "grad_norm": 0.2589380145072937, + "learning_rate": 3.0952671093644744e-10, + "loss": 0.1799, + "step": 18325 + }, + { + "epoch": 4.876530069185737, + "grad_norm": 0.3111386299133301, + "learning_rate": 3.082003775485398e-10, + "loss": 0.1811, + "step": 18326 + }, + { + "epoch": 4.876796168174561, + "grad_norm": 0.33380764722824097, + "learning_rate": 3.068768875733596e-10, + "loss": 0.1668, + "step": 18327 + }, + { + "epoch": 4.877062267163385, + "grad_norm": 0.27680903673171997, + "learning_rate": 3.0555624104862123e-10, + "loss": 0.174, + "step": 18328 + }, + { + "epoch": 4.877328366152208, + "grad_norm": 0.3058173656463623, + "learning_rate": 3.0423843801200556e-10, + "loss": 0.1648, + "step": 18329 + }, + { + "epoch": 4.877594465141033, + "grad_norm": 0.26737555861473083, + "learning_rate": 3.029234785011048e-10, + "loss": 0.1743, + "step": 18330 + }, + { + "epoch": 4.877860564129857, + "grad_norm": 0.3766164183616638, + "learning_rate": 3.0161136255342225e-10, + "loss": 0.158, + "step": 18331 + }, + { + "epoch": 4.87812666311868, + "grad_norm": 0.29646605253219604, + "learning_rate": 3.003020902063835e-10, + "loss": 0.191, + "step": 18332 + }, + { + "epoch": 4.878392762107504, + "grad_norm": 0.35430020093917847, + "learning_rate": 2.989956614973255e-10, + "loss": 0.1911, + "step": 18333 + }, + { + "epoch": 4.878658861096328, + "grad_norm": 0.29045844078063965, + "learning_rate": 2.9769207646350716e-10, + "loss": 0.1848, + "step": 18334 + }, + { + "epoch": 4.878924960085151, + "grad_norm": 0.25991004705429077, + "learning_rate": 2.96391335142121e-10, + "loss": 0.1668, + "step": 18335 + }, + { + "epoch": 4.8791910590739755, + "grad_norm": 0.33315104246139526, + "learning_rate": 2.950934375702707e-10, + "loss": 0.1811, + "step": 18336 + }, + { + "epoch": 4.8794571580628, + "grad_norm": 0.2583266496658325, + "learning_rate": 2.937983837849489e-10, + "loss": 0.1737, + "step": 18337 + }, + { + "epoch": 4.879723257051623, + "grad_norm": 0.39822056889533997, + "learning_rate": 2.9250617382311496e-10, + "loss": 0.1922, + "step": 18338 + }, + { + "epoch": 4.879989356040447, + "grad_norm": 0.27339521050453186, + "learning_rate": 2.9121680772162816e-10, + "loss": 0.1739, + "step": 18339 + }, + { + "epoch": 4.880255455029271, + "grad_norm": 0.3974146246910095, + "learning_rate": 2.8993028551724806e-10, + "loss": 0.1822, + "step": 18340 + }, + { + "epoch": 4.880521554018094, + "grad_norm": 0.3681298494338989, + "learning_rate": 2.8864660724667864e-10, + "loss": 0.1969, + "step": 18341 + }, + { + "epoch": 4.8807876530069185, + "grad_norm": 0.2677922248840332, + "learning_rate": 2.873657729465351e-10, + "loss": 0.1617, + "step": 18342 + }, + { + "epoch": 4.881053751995743, + "grad_norm": 0.2806972563266754, + "learning_rate": 2.8608778265334366e-10, + "loss": 0.1788, + "step": 18343 + }, + { + "epoch": 4.881319850984566, + "grad_norm": 0.29096758365631104, + "learning_rate": 2.84812636403553e-10, + "loss": 0.171, + "step": 18344 + }, + { + "epoch": 4.88158594997339, + "grad_norm": 0.2886067032814026, + "learning_rate": 2.835403342335452e-10, + "loss": 0.1824, + "step": 18345 + }, + { + "epoch": 4.881852048962214, + "grad_norm": 0.36309289932250977, + "learning_rate": 2.822708761795911e-10, + "loss": 0.1815, + "step": 18346 + }, + { + "epoch": 4.882118147951038, + "grad_norm": 0.2744152843952179, + "learning_rate": 2.810042622779063e-10, + "loss": 0.1645, + "step": 18347 + }, + { + "epoch": 4.8823842469398615, + "grad_norm": 0.28333133459091187, + "learning_rate": 2.7974049256462853e-10, + "loss": 0.1795, + "step": 18348 + }, + { + "epoch": 4.882650345928686, + "grad_norm": 0.26053208112716675, + "learning_rate": 2.7847956707578445e-10, + "loss": 0.1647, + "step": 18349 + }, + { + "epoch": 4.882916444917509, + "grad_norm": 0.48119089007377625, + "learning_rate": 2.772214858473565e-10, + "loss": 0.1899, + "step": 18350 + }, + { + "epoch": 4.883182543906333, + "grad_norm": 0.4013840854167938, + "learning_rate": 2.7596624891521593e-10, + "loss": 0.1808, + "step": 18351 + }, + { + "epoch": 4.883448642895157, + "grad_norm": 0.2725149095058441, + "learning_rate": 2.747138563151563e-10, + "loss": 0.1723, + "step": 18352 + }, + { + "epoch": 4.883714741883981, + "grad_norm": 0.3406542241573334, + "learning_rate": 2.7346430808291574e-10, + "loss": 0.1694, + "step": 18353 + }, + { + "epoch": 4.8839808408728045, + "grad_norm": 0.30983924865722656, + "learning_rate": 2.7221760425413243e-10, + "loss": 0.1686, + "step": 18354 + }, + { + "epoch": 4.884246939861629, + "grad_norm": 0.3771633803844452, + "learning_rate": 2.7097374486435565e-10, + "loss": 0.1745, + "step": 18355 + }, + { + "epoch": 4.884513038850453, + "grad_norm": 0.3207329213619232, + "learning_rate": 2.6973272994906813e-10, + "loss": 0.1528, + "step": 18356 + }, + { + "epoch": 4.884779137839276, + "grad_norm": 0.4620004892349243, + "learning_rate": 2.6849455954366386e-10, + "loss": 0.1962, + "step": 18357 + }, + { + "epoch": 4.8850452368281, + "grad_norm": 0.31118714809417725, + "learning_rate": 2.6725923368344783e-10, + "loss": 0.1657, + "step": 18358 + }, + { + "epoch": 4.885311335816924, + "grad_norm": 0.3382824957370758, + "learning_rate": 2.660267524036808e-10, + "loss": 0.1808, + "step": 18359 + }, + { + "epoch": 4.8855774348057475, + "grad_norm": 0.27484962344169617, + "learning_rate": 2.6479711573949014e-10, + "loss": 0.175, + "step": 18360 + }, + { + "epoch": 4.885843533794572, + "grad_norm": 0.2752825617790222, + "learning_rate": 2.63570323725959e-10, + "loss": 0.1758, + "step": 18361 + }, + { + "epoch": 4.886109632783396, + "grad_norm": 0.29865774512290955, + "learning_rate": 2.6234637639808155e-10, + "loss": 0.1742, + "step": 18362 + }, + { + "epoch": 4.886375731772219, + "grad_norm": 0.2793433666229248, + "learning_rate": 2.611252737907632e-10, + "loss": 0.1635, + "step": 18363 + }, + { + "epoch": 4.886641830761043, + "grad_norm": 0.27863308787345886, + "learning_rate": 2.599070159388206e-10, + "loss": 0.1628, + "step": 18364 + }, + { + "epoch": 4.886907929749867, + "grad_norm": 0.2784312665462494, + "learning_rate": 2.5869160287702584e-10, + "loss": 0.1939, + "step": 18365 + }, + { + "epoch": 4.8871740287386904, + "grad_norm": 0.3835919201374054, + "learning_rate": 2.5747903464001794e-10, + "loss": 0.1787, + "step": 18366 + }, + { + "epoch": 4.887440127727515, + "grad_norm": 0.40257227420806885, + "learning_rate": 2.562693112624026e-10, + "loss": 0.1977, + "step": 18367 + }, + { + "epoch": 4.887706226716339, + "grad_norm": 0.3275109529495239, + "learning_rate": 2.550624327786632e-10, + "loss": 0.1734, + "step": 18368 + }, + { + "epoch": 4.887972325705162, + "grad_norm": 0.28834688663482666, + "learning_rate": 2.5385839922325016e-10, + "loss": 0.1759, + "step": 18369 + }, + { + "epoch": 4.888238424693986, + "grad_norm": 0.2672900855541229, + "learning_rate": 2.526572106304803e-10, + "loss": 0.1777, + "step": 18370 + }, + { + "epoch": 4.88850452368281, + "grad_norm": 0.34272462129592896, + "learning_rate": 2.5145886703462627e-10, + "loss": 0.1779, + "step": 18371 + }, + { + "epoch": 4.888770622671634, + "grad_norm": 0.29315468668937683, + "learning_rate": 2.502633684698607e-10, + "loss": 0.151, + "step": 18372 + }, + { + "epoch": 4.8890367216604576, + "grad_norm": 0.35381630063056946, + "learning_rate": 2.4907071497028977e-10, + "loss": 0.1752, + "step": 18373 + }, + { + "epoch": 4.889302820649282, + "grad_norm": 0.3585895597934723, + "learning_rate": 2.4788090656991945e-10, + "loss": 0.1771, + "step": 18374 + }, + { + "epoch": 4.889568919638105, + "grad_norm": 0.26764366030693054, + "learning_rate": 2.4669394330270045e-10, + "loss": 0.1746, + "step": 18375 + }, + { + "epoch": 4.889835018626929, + "grad_norm": 0.25638437271118164, + "learning_rate": 2.455098252024723e-10, + "loss": 0.1638, + "step": 18376 + }, + { + "epoch": 4.890101117615753, + "grad_norm": 0.27293115854263306, + "learning_rate": 2.443285523030192e-10, + "loss": 0.1759, + "step": 18377 + }, + { + "epoch": 4.890367216604577, + "grad_norm": 0.3628344237804413, + "learning_rate": 2.431501246380252e-10, + "loss": 0.1838, + "step": 18378 + }, + { + "epoch": 4.8906333155934005, + "grad_norm": 0.2751172184944153, + "learning_rate": 2.4197454224110794e-10, + "loss": 0.1673, + "step": 18379 + }, + { + "epoch": 4.890899414582225, + "grad_norm": 0.3797107934951782, + "learning_rate": 2.408018051457961e-10, + "loss": 0.1769, + "step": 18380 + }, + { + "epoch": 4.891165513571049, + "grad_norm": 0.29602518677711487, + "learning_rate": 2.396319133855296e-10, + "loss": 0.1733, + "step": 18381 + }, + { + "epoch": 4.891431612559872, + "grad_norm": 0.2844385504722595, + "learning_rate": 2.384648669936928e-10, + "loss": 0.1709, + "step": 18382 + }, + { + "epoch": 4.891697711548696, + "grad_norm": 0.48374152183532715, + "learning_rate": 2.3730066600354815e-10, + "loss": 0.2002, + "step": 18383 + }, + { + "epoch": 4.89196381053752, + "grad_norm": 0.3358643651008606, + "learning_rate": 2.3613931044833556e-10, + "loss": 0.1808, + "step": 18384 + }, + { + "epoch": 4.8922299095263435, + "grad_norm": 0.39886873960494995, + "learning_rate": 2.349808003611509e-10, + "loss": 0.1818, + "step": 18385 + }, + { + "epoch": 4.892496008515168, + "grad_norm": 0.3249102532863617, + "learning_rate": 2.338251357750454e-10, + "loss": 0.1799, + "step": 18386 + }, + { + "epoch": 4.892762107503992, + "grad_norm": 0.29249417781829834, + "learning_rate": 2.326723167229816e-10, + "loss": 0.187, + "step": 18387 + }, + { + "epoch": 4.893028206492815, + "grad_norm": 0.44637298583984375, + "learning_rate": 2.3152234323783325e-10, + "loss": 0.1646, + "step": 18388 + }, + { + "epoch": 4.893294305481639, + "grad_norm": 0.4462771415710449, + "learning_rate": 2.3037521535241856e-10, + "loss": 0.1845, + "step": 18389 + }, + { + "epoch": 4.893560404470463, + "grad_norm": 0.3550979197025299, + "learning_rate": 2.2923093309943353e-10, + "loss": 0.2014, + "step": 18390 + }, + { + "epoch": 4.8938265034592865, + "grad_norm": 0.2972887456417084, + "learning_rate": 2.2808949651154098e-10, + "loss": 0.1854, + "step": 18391 + }, + { + "epoch": 4.894092602448111, + "grad_norm": 0.2810679078102112, + "learning_rate": 2.269509056212704e-10, + "loss": 0.1623, + "step": 18392 + }, + { + "epoch": 4.894358701436935, + "grad_norm": 0.28779783844947815, + "learning_rate": 2.2581516046111804e-10, + "loss": 0.1551, + "step": 18393 + }, + { + "epoch": 4.894624800425758, + "grad_norm": 0.28134506940841675, + "learning_rate": 2.2468226106345801e-10, + "loss": 0.1747, + "step": 18394 + }, + { + "epoch": 4.894890899414582, + "grad_norm": 0.32235556840896606, + "learning_rate": 2.2355220746061998e-10, + "loss": 0.1731, + "step": 18395 + }, + { + "epoch": 4.895156998403406, + "grad_norm": 0.41634106636047363, + "learning_rate": 2.2242499968482264e-10, + "loss": 0.1941, + "step": 18396 + }, + { + "epoch": 4.89542309739223, + "grad_norm": 0.36888810992240906, + "learning_rate": 2.2130063776822916e-10, + "loss": 0.1784, + "step": 18397 + }, + { + "epoch": 4.895689196381054, + "grad_norm": 0.42570099234580994, + "learning_rate": 2.2017912174289167e-10, + "loss": 0.1728, + "step": 18398 + }, + { + "epoch": 4.895955295369878, + "grad_norm": 0.34078121185302734, + "learning_rate": 2.1906045164081788e-10, + "loss": 0.1631, + "step": 18399 + }, + { + "epoch": 4.896221394358701, + "grad_norm": 0.2550407648086548, + "learning_rate": 2.1794462749389343e-10, + "loss": 0.1481, + "step": 18400 + }, + { + "epoch": 4.896487493347525, + "grad_norm": 0.4754488468170166, + "learning_rate": 2.1683164933397058e-10, + "loss": 0.1901, + "step": 18401 + }, + { + "epoch": 4.896753592336349, + "grad_norm": 0.28173521161079407, + "learning_rate": 2.1572151719276844e-10, + "loss": 0.1739, + "step": 18402 + }, + { + "epoch": 4.897019691325173, + "grad_norm": 0.2903049886226654, + "learning_rate": 2.1461423110196163e-10, + "loss": 0.1928, + "step": 18403 + }, + { + "epoch": 4.897285790313997, + "grad_norm": 0.27414941787719727, + "learning_rate": 2.1350979109312496e-10, + "loss": 0.1708, + "step": 18404 + }, + { + "epoch": 4.897551889302821, + "grad_norm": 0.4191405177116394, + "learning_rate": 2.1240819719776648e-10, + "loss": 0.1939, + "step": 18405 + }, + { + "epoch": 4.897817988291645, + "grad_norm": 0.3717360198497772, + "learning_rate": 2.1130944944731666e-10, + "loss": 0.1796, + "step": 18406 + }, + { + "epoch": 4.898084087280468, + "grad_norm": 0.34948214888572693, + "learning_rate": 2.1021354787308376e-10, + "loss": 0.1787, + "step": 18407 + }, + { + "epoch": 4.898350186269292, + "grad_norm": 0.2609736919403076, + "learning_rate": 2.0912049250635388e-10, + "loss": 0.1688, + "step": 18408 + }, + { + "epoch": 4.898616285258116, + "grad_norm": 0.296810507774353, + "learning_rate": 2.08030283378291e-10, + "loss": 0.1782, + "step": 18409 + }, + { + "epoch": 4.89888238424694, + "grad_norm": 0.33781781792640686, + "learning_rate": 2.069429205199924e-10, + "loss": 0.1854, + "step": 18410 + }, + { + "epoch": 4.899148483235764, + "grad_norm": 0.2588849663734436, + "learning_rate": 2.0585840396246666e-10, + "loss": 0.1602, + "step": 18411 + }, + { + "epoch": 4.899414582224588, + "grad_norm": 0.28917503356933594, + "learning_rate": 2.047767337366557e-10, + "loss": 0.1896, + "step": 18412 + }, + { + "epoch": 4.899680681213411, + "grad_norm": 0.3001542091369629, + "learning_rate": 2.036979098734015e-10, + "loss": 0.1781, + "step": 18413 + }, + { + "epoch": 4.899946780202235, + "grad_norm": 0.29672500491142273, + "learning_rate": 2.0262193240347947e-10, + "loss": 0.1755, + "step": 18414 + }, + { + "epoch": 4.900212879191059, + "grad_norm": 0.40150395035743713, + "learning_rate": 2.0154880135758722e-10, + "loss": 0.1615, + "step": 18415 + }, + { + "epoch": 4.9004789781798825, + "grad_norm": 0.32085829973220825, + "learning_rate": 2.0047851676631145e-10, + "loss": 0.1833, + "step": 18416 + }, + { + "epoch": 4.900745077168707, + "grad_norm": 0.2782307267189026, + "learning_rate": 1.9941107866019435e-10, + "loss": 0.1776, + "step": 18417 + }, + { + "epoch": 4.901011176157531, + "grad_norm": 0.29732292890548706, + "learning_rate": 1.9834648706967827e-10, + "loss": 0.1838, + "step": 18418 + }, + { + "epoch": 4.901277275146354, + "grad_norm": 0.30598053336143494, + "learning_rate": 1.9728474202512778e-10, + "loss": 0.1877, + "step": 18419 + }, + { + "epoch": 4.901543374135178, + "grad_norm": 0.28148066997528076, + "learning_rate": 1.962258435568187e-10, + "loss": 0.1815, + "step": 18420 + }, + { + "epoch": 4.901809473124002, + "grad_norm": 0.31817835569381714, + "learning_rate": 1.9516979169497127e-10, + "loss": 0.1749, + "step": 18421 + }, + { + "epoch": 4.902075572112826, + "grad_norm": 0.25433608889579773, + "learning_rate": 1.9411658646968365e-10, + "loss": 0.171, + "step": 18422 + }, + { + "epoch": 4.90234167110165, + "grad_norm": 0.41494736075401306, + "learning_rate": 1.9306622791100956e-10, + "loss": 0.2004, + "step": 18423 + }, + { + "epoch": 4.902607770090474, + "grad_norm": 0.43654564023017883, + "learning_rate": 1.9201871604890285e-10, + "loss": 0.1782, + "step": 18424 + }, + { + "epoch": 4.902873869079297, + "grad_norm": 0.3142329752445221, + "learning_rate": 1.909740509132507e-10, + "loss": 0.1794, + "step": 18425 + }, + { + "epoch": 4.903139968068121, + "grad_norm": 0.38531285524368286, + "learning_rate": 1.8993223253382929e-10, + "loss": 0.1813, + "step": 18426 + }, + { + "epoch": 4.903406067056945, + "grad_norm": 0.38482794165611267, + "learning_rate": 1.8889326094037038e-10, + "loss": 0.1792, + "step": 18427 + }, + { + "epoch": 4.903672166045769, + "grad_norm": 0.2766588032245636, + "learning_rate": 1.8785713616250587e-10, + "loss": 0.1779, + "step": 18428 + }, + { + "epoch": 4.903938265034593, + "grad_norm": 0.2718169689178467, + "learning_rate": 1.8682385822977875e-10, + "loss": 0.1672, + "step": 18429 + }, + { + "epoch": 4.904204364023417, + "grad_norm": 0.3610142767429352, + "learning_rate": 1.8579342717165436e-10, + "loss": 0.1819, + "step": 18430 + }, + { + "epoch": 4.904470463012241, + "grad_norm": 0.28238165378570557, + "learning_rate": 1.847658430175536e-10, + "loss": 0.1592, + "step": 18431 + }, + { + "epoch": 4.904736562001064, + "grad_norm": 0.27387917041778564, + "learning_rate": 1.8374110579675307e-10, + "loss": 0.1665, + "step": 18432 + }, + { + "epoch": 4.905002660989888, + "grad_norm": 0.36254823207855225, + "learning_rate": 1.82719215538496e-10, + "loss": 0.1693, + "step": 18433 + }, + { + "epoch": 4.905268759978712, + "grad_norm": 0.2928624153137207, + "learning_rate": 1.817001722719147e-10, + "loss": 0.1656, + "step": 18434 + }, + { + "epoch": 4.905534858967536, + "grad_norm": 0.2909259796142578, + "learning_rate": 1.8068397602609696e-10, + "loss": 0.1776, + "step": 18435 + }, + { + "epoch": 4.90580095795636, + "grad_norm": 0.3898829519748688, + "learning_rate": 1.7967062683001965e-10, + "loss": 0.1849, + "step": 18436 + }, + { + "epoch": 4.906067056945184, + "grad_norm": 0.2878948152065277, + "learning_rate": 1.7866012471257076e-10, + "loss": 0.1731, + "step": 18437 + }, + { + "epoch": 4.906333155934007, + "grad_norm": 0.26932382583618164, + "learning_rate": 1.7765246970258273e-10, + "loss": 0.1776, + "step": 18438 + }, + { + "epoch": 4.906599254922831, + "grad_norm": 0.3720378875732422, + "learning_rate": 1.7664766182879932e-10, + "loss": 0.1901, + "step": 18439 + }, + { + "epoch": 4.906865353911655, + "grad_norm": 0.2898809015750885, + "learning_rate": 1.756457011198753e-10, + "loss": 0.1785, + "step": 18440 + }, + { + "epoch": 4.9071314529004795, + "grad_norm": 0.2838614583015442, + "learning_rate": 1.7464658760438788e-10, + "loss": 0.181, + "step": 18441 + }, + { + "epoch": 4.907397551889303, + "grad_norm": 0.2842010259628296, + "learning_rate": 1.7365032131082536e-10, + "loss": 0.179, + "step": 18442 + }, + { + "epoch": 4.907663650878127, + "grad_norm": 0.2761198580265045, + "learning_rate": 1.7265690226762052e-10, + "loss": 0.178, + "step": 18443 + }, + { + "epoch": 4.90792974986695, + "grad_norm": 0.35202908515930176, + "learning_rate": 1.716663305031063e-10, + "loss": 0.1945, + "step": 18444 + }, + { + "epoch": 4.908195848855774, + "grad_norm": 0.27362585067749023, + "learning_rate": 1.7067860604551565e-10, + "loss": 0.1794, + "step": 18445 + }, + { + "epoch": 4.908461947844598, + "grad_norm": 0.31670325994491577, + "learning_rate": 1.6969372892303714e-10, + "loss": 0.1487, + "step": 18446 + }, + { + "epoch": 4.9087280468334225, + "grad_norm": 0.3267209231853485, + "learning_rate": 1.6871169916377047e-10, + "loss": 0.1895, + "step": 18447 + }, + { + "epoch": 4.908994145822246, + "grad_norm": 0.2762942612171173, + "learning_rate": 1.6773251679570443e-10, + "loss": 0.1619, + "step": 18448 + }, + { + "epoch": 4.90926024481107, + "grad_norm": 0.34348416328430176, + "learning_rate": 1.667561818467722e-10, + "loss": 0.1848, + "step": 18449 + }, + { + "epoch": 4.909526343799894, + "grad_norm": 0.33167514204978943, + "learning_rate": 1.6578269434482928e-10, + "loss": 0.1673, + "step": 18450 + }, + { + "epoch": 4.909792442788717, + "grad_norm": 0.2873489260673523, + "learning_rate": 1.6481205431763124e-10, + "loss": 0.1763, + "step": 18451 + }, + { + "epoch": 4.910058541777541, + "grad_norm": 0.38123786449432373, + "learning_rate": 1.6384426179287813e-10, + "loss": 0.1727, + "step": 18452 + }, + { + "epoch": 4.910324640766365, + "grad_norm": 0.299567848443985, + "learning_rate": 1.6287931679815904e-10, + "loss": 0.177, + "step": 18453 + }, + { + "epoch": 4.910590739755189, + "grad_norm": 0.2850651443004608, + "learning_rate": 1.6191721936099633e-10, + "loss": 0.1753, + "step": 18454 + }, + { + "epoch": 4.910856838744013, + "grad_norm": 0.26993927359580994, + "learning_rate": 1.6095796950883477e-10, + "loss": 0.1657, + "step": 18455 + }, + { + "epoch": 4.911122937732837, + "grad_norm": 0.2747349739074707, + "learning_rate": 1.6000156726904134e-10, + "loss": 0.1779, + "step": 18456 + }, + { + "epoch": 4.91138903672166, + "grad_norm": 0.2635691463947296, + "learning_rate": 1.5904801266888314e-10, + "loss": 0.1562, + "step": 18457 + }, + { + "epoch": 4.911655135710484, + "grad_norm": 0.2614465057849884, + "learning_rate": 1.5809730573556058e-10, + "loss": 0.1677, + "step": 18458 + }, + { + "epoch": 4.911921234699308, + "grad_norm": 0.2811394929885864, + "learning_rate": 1.5714944649619644e-10, + "loss": 0.1855, + "step": 18459 + }, + { + "epoch": 4.912187333688132, + "grad_norm": 0.35411062836647034, + "learning_rate": 1.5620443497782465e-10, + "loss": 0.1797, + "step": 18460 + }, + { + "epoch": 4.912453432676956, + "grad_norm": 0.3053078055381775, + "learning_rate": 1.5526227120737923e-10, + "loss": 0.1982, + "step": 18461 + }, + { + "epoch": 4.91271953166578, + "grad_norm": 0.30274468660354614, + "learning_rate": 1.5432295521176087e-10, + "loss": 0.1833, + "step": 18462 + }, + { + "epoch": 4.912985630654603, + "grad_norm": 0.3031100332736969, + "learning_rate": 1.5338648701773705e-10, + "loss": 0.1679, + "step": 18463 + }, + { + "epoch": 4.913251729643427, + "grad_norm": 0.3646046221256256, + "learning_rate": 1.5245286665204193e-10, + "loss": 0.1748, + "step": 18464 + }, + { + "epoch": 4.913517828632251, + "grad_norm": 0.2965335547924042, + "learning_rate": 1.5152209414128758e-10, + "loss": 0.1485, + "step": 18465 + }, + { + "epoch": 4.9137839276210755, + "grad_norm": 0.2879486382007599, + "learning_rate": 1.505941695120194e-10, + "loss": 0.1821, + "step": 18466 + }, + { + "epoch": 4.914050026609899, + "grad_norm": 0.30645909905433655, + "learning_rate": 1.4966909279071627e-10, + "loss": 0.1845, + "step": 18467 + }, + { + "epoch": 4.914316125598723, + "grad_norm": 0.2725645899772644, + "learning_rate": 1.4874686400374592e-10, + "loss": 0.1775, + "step": 18468 + }, + { + "epoch": 4.914582224587546, + "grad_norm": 0.28764232993125916, + "learning_rate": 1.4782748317743176e-10, + "loss": 0.1814, + "step": 18469 + }, + { + "epoch": 4.91484832357637, + "grad_norm": 0.2780200242996216, + "learning_rate": 1.4691095033797507e-10, + "loss": 0.1837, + "step": 18470 + }, + { + "epoch": 4.915114422565194, + "grad_norm": 0.2729853689670563, + "learning_rate": 1.459972655115438e-10, + "loss": 0.1705, + "step": 18471 + }, + { + "epoch": 4.9153805215540185, + "grad_norm": 0.39010825753211975, + "learning_rate": 1.4508642872417266e-10, + "loss": 0.1616, + "step": 18472 + }, + { + "epoch": 4.915646620542842, + "grad_norm": 0.3657337725162506, + "learning_rate": 1.4417844000186309e-10, + "loss": 0.1724, + "step": 18473 + }, + { + "epoch": 4.915912719531666, + "grad_norm": 0.27849817276000977, + "learning_rate": 1.4327329937049436e-10, + "loss": 0.1661, + "step": 18474 + }, + { + "epoch": 4.91617881852049, + "grad_norm": 0.39383405447006226, + "learning_rate": 1.4237100685589031e-10, + "loss": 0.193, + "step": 18475 + }, + { + "epoch": 4.916444917509313, + "grad_norm": 0.2947171628475189, + "learning_rate": 1.414715624837859e-10, + "loss": 0.174, + "step": 18476 + }, + { + "epoch": 4.916711016498137, + "grad_norm": 0.3429816961288452, + "learning_rate": 1.4057496627982724e-10, + "loss": 0.1657, + "step": 18477 + }, + { + "epoch": 4.9169771154869615, + "grad_norm": 0.2745802104473114, + "learning_rate": 1.39681218269605e-10, + "loss": 0.1688, + "step": 18478 + }, + { + "epoch": 4.917243214475785, + "grad_norm": 0.3395518958568573, + "learning_rate": 1.387903184785988e-10, + "loss": 0.1795, + "step": 18479 + }, + { + "epoch": 4.917509313464609, + "grad_norm": 0.2910093665122986, + "learning_rate": 1.3790226693222163e-10, + "loss": 0.1752, + "step": 18480 + }, + { + "epoch": 4.917775412453433, + "grad_norm": 0.27795013785362244, + "learning_rate": 1.3701706365579768e-10, + "loss": 0.1563, + "step": 18481 + }, + { + "epoch": 4.918041511442256, + "grad_norm": 0.2718280851840973, + "learning_rate": 1.3613470867457344e-10, + "loss": 0.1615, + "step": 18482 + }, + { + "epoch": 4.91830761043108, + "grad_norm": 0.2988283634185791, + "learning_rate": 1.3525520201372876e-10, + "loss": 0.1712, + "step": 18483 + }, + { + "epoch": 4.9185737094199045, + "grad_norm": 0.27930572628974915, + "learning_rate": 1.3437854369833247e-10, + "loss": 0.178, + "step": 18484 + }, + { + "epoch": 4.918839808408728, + "grad_norm": 0.30769726634025574, + "learning_rate": 1.3350473375339788e-10, + "loss": 0.1924, + "step": 18485 + }, + { + "epoch": 4.919105907397552, + "grad_norm": 0.38184860348701477, + "learning_rate": 1.3263377220386062e-10, + "loss": 0.1801, + "step": 18486 + }, + { + "epoch": 4.919372006386376, + "grad_norm": 0.3549209237098694, + "learning_rate": 1.3176565907453418e-10, + "loss": 0.1855, + "step": 18487 + }, + { + "epoch": 4.919638105375199, + "grad_norm": 0.28554221987724304, + "learning_rate": 1.3090039439019874e-10, + "loss": 0.1712, + "step": 18488 + }, + { + "epoch": 4.919904204364023, + "grad_norm": 0.4695762097835541, + "learning_rate": 1.3003797817552342e-10, + "loss": 0.1804, + "step": 18489 + }, + { + "epoch": 4.920170303352847, + "grad_norm": 0.3356362581253052, + "learning_rate": 1.291784104551108e-10, + "loss": 0.1926, + "step": 18490 + }, + { + "epoch": 4.920436402341672, + "grad_norm": 0.27518612146377563, + "learning_rate": 1.2832169125348568e-10, + "loss": 0.1779, + "step": 18491 + }, + { + "epoch": 4.920702501330495, + "grad_norm": 0.2755756080150604, + "learning_rate": 1.274678205950619e-10, + "loss": 0.1705, + "step": 18492 + }, + { + "epoch": 4.920968600319319, + "grad_norm": 0.27468451857566833, + "learning_rate": 1.266167985042199e-10, + "loss": 0.1737, + "step": 18493 + }, + { + "epoch": 4.921234699308142, + "grad_norm": 0.27556490898132324, + "learning_rate": 1.2576862500520702e-10, + "loss": 0.1667, + "step": 18494 + }, + { + "epoch": 4.921500798296966, + "grad_norm": 0.3538787066936493, + "learning_rate": 1.2492330012223717e-10, + "loss": 0.1739, + "step": 18495 + }, + { + "epoch": 4.92176689728579, + "grad_norm": 0.2916152775287628, + "learning_rate": 1.2408082387940222e-10, + "loss": 0.1924, + "step": 18496 + }, + { + "epoch": 4.9220329962746145, + "grad_norm": 0.27534401416778564, + "learning_rate": 1.2324119630074958e-10, + "loss": 0.1666, + "step": 18497 + }, + { + "epoch": 4.922299095263438, + "grad_norm": 0.26887714862823486, + "learning_rate": 1.2240441741020457e-10, + "loss": 0.1641, + "step": 18498 + }, + { + "epoch": 4.922565194252262, + "grad_norm": 0.269654244184494, + "learning_rate": 1.215704872316592e-10, + "loss": 0.1645, + "step": 18499 + }, + { + "epoch": 4.922831293241086, + "grad_norm": 0.37262874841690063, + "learning_rate": 1.2073940578888331e-10, + "loss": 0.1967, + "step": 18500 + }, + { + "epoch": 4.923097392229909, + "grad_norm": 0.32620659470558167, + "learning_rate": 1.199111731055802e-10, + "loss": 0.1747, + "step": 18501 + }, + { + "epoch": 4.923363491218733, + "grad_norm": 0.3157995343208313, + "learning_rate": 1.1908578920537537e-10, + "loss": 0.1722, + "step": 18502 + }, + { + "epoch": 4.9236295902075575, + "grad_norm": 0.6205373406410217, + "learning_rate": 1.182632541118167e-10, + "loss": 0.1823, + "step": 18503 + }, + { + "epoch": 4.923895689196381, + "grad_norm": 0.2954105734825134, + "learning_rate": 1.1744356784835208e-10, + "loss": 0.1577, + "step": 18504 + }, + { + "epoch": 4.924161788185205, + "grad_norm": 0.3194320499897003, + "learning_rate": 1.166267304383739e-10, + "loss": 0.1633, + "step": 18505 + }, + { + "epoch": 4.924427887174029, + "grad_norm": 0.258806973695755, + "learning_rate": 1.1581274190517464e-10, + "loss": 0.1738, + "step": 18506 + }, + { + "epoch": 4.924693986162852, + "grad_norm": 0.270866334438324, + "learning_rate": 1.1500160227196909e-10, + "loss": 0.1688, + "step": 18507 + }, + { + "epoch": 4.924960085151676, + "grad_norm": 0.25443536043167114, + "learning_rate": 1.1419331156189427e-10, + "loss": 0.157, + "step": 18508 + }, + { + "epoch": 4.9252261841405005, + "grad_norm": 0.34692010283470154, + "learning_rate": 1.1338786979799842e-10, + "loss": 0.1946, + "step": 18509 + }, + { + "epoch": 4.925492283129324, + "grad_norm": 0.2918321490287781, + "learning_rate": 1.1258527700326314e-10, + "loss": 0.1698, + "step": 18510 + }, + { + "epoch": 4.925758382118148, + "grad_norm": 0.2795330882072449, + "learning_rate": 1.1178553320058126e-10, + "loss": 0.1601, + "step": 18511 + }, + { + "epoch": 4.926024481106972, + "grad_norm": 0.28974848985671997, + "learning_rate": 1.1098863841274564e-10, + "loss": 0.1872, + "step": 18512 + }, + { + "epoch": 4.926290580095795, + "grad_norm": 0.2888753116130829, + "learning_rate": 1.1019459266251585e-10, + "loss": 0.1628, + "step": 18513 + }, + { + "epoch": 4.926556679084619, + "grad_norm": 0.27858999371528625, + "learning_rate": 1.0940339597250714e-10, + "loss": 0.1905, + "step": 18514 + }, + { + "epoch": 4.9268227780734435, + "grad_norm": 0.281956285238266, + "learning_rate": 1.0861504836531255e-10, + "loss": 0.1919, + "step": 18515 + }, + { + "epoch": 4.927088877062268, + "grad_norm": 0.28579258918762207, + "learning_rate": 1.0782954986340298e-10, + "loss": 0.1731, + "step": 18516 + }, + { + "epoch": 4.927354976051091, + "grad_norm": 0.29829055070877075, + "learning_rate": 1.0704690048918275e-10, + "loss": 0.189, + "step": 18517 + }, + { + "epoch": 4.927621075039915, + "grad_norm": 0.3261210024356842, + "learning_rate": 1.0626710026497842e-10, + "loss": 0.1889, + "step": 18518 + }, + { + "epoch": 4.927887174028738, + "grad_norm": 0.33816346526145935, + "learning_rate": 1.054901492130278e-10, + "loss": 0.1718, + "step": 18519 + }, + { + "epoch": 4.928153273017562, + "grad_norm": 0.3206266760826111, + "learning_rate": 1.0471604735549089e-10, + "loss": 0.1819, + "step": 18520 + }, + { + "epoch": 4.9284193720063865, + "grad_norm": 0.4024772346019745, + "learning_rate": 1.0394479471445005e-10, + "loss": 0.1808, + "step": 18521 + }, + { + "epoch": 4.928685470995211, + "grad_norm": 0.270435631275177, + "learning_rate": 1.0317639131190992e-10, + "loss": 0.1797, + "step": 18522 + }, + { + "epoch": 4.928951569984034, + "grad_norm": 0.37287816405296326, + "learning_rate": 1.0241083716977516e-10, + "loss": 0.1902, + "step": 18523 + }, + { + "epoch": 4.929217668972858, + "grad_norm": 0.26216697692871094, + "learning_rate": 1.016481323098839e-10, + "loss": 0.1623, + "step": 18524 + }, + { + "epoch": 4.929483767961682, + "grad_norm": 0.30200478434562683, + "learning_rate": 1.0088827675399647e-10, + "loss": 0.1871, + "step": 18525 + }, + { + "epoch": 4.929749866950505, + "grad_norm": 0.2724837362766266, + "learning_rate": 1.0013127052377335e-10, + "loss": 0.1818, + "step": 18526 + }, + { + "epoch": 4.9300159659393294, + "grad_norm": 0.29630959033966064, + "learning_rate": 9.937711364080836e-11, + "loss": 0.1774, + "step": 18527 + }, + { + "epoch": 4.930282064928154, + "grad_norm": 0.2784840166568756, + "learning_rate": 9.862580612662875e-11, + "loss": 0.1687, + "step": 18528 + }, + { + "epoch": 4.930548163916977, + "grad_norm": 0.4053194224834442, + "learning_rate": 9.787734800263958e-11, + "loss": 0.1815, + "step": 18529 + }, + { + "epoch": 4.930814262905801, + "grad_norm": 0.4386313259601593, + "learning_rate": 9.713173929020158e-11, + "loss": 0.1677, + "step": 18530 + }, + { + "epoch": 4.931080361894625, + "grad_norm": 0.5091015100479126, + "learning_rate": 9.63889800105866e-11, + "loss": 0.1791, + "step": 18531 + }, + { + "epoch": 4.931346460883448, + "grad_norm": 0.4206509590148926, + "learning_rate": 9.56490701849666e-11, + "loss": 0.1973, + "step": 18532 + }, + { + "epoch": 4.931612559872272, + "grad_norm": 0.2863839864730835, + "learning_rate": 9.491200983444692e-11, + "loss": 0.1874, + "step": 18533 + }, + { + "epoch": 4.9318786588610966, + "grad_norm": 0.2636914849281311, + "learning_rate": 9.417779898005518e-11, + "loss": 0.1611, + "step": 18534 + }, + { + "epoch": 4.93214475784992, + "grad_norm": 0.2791191637516022, + "learning_rate": 9.34464376427302e-11, + "loss": 0.1843, + "step": 18535 + }, + { + "epoch": 4.932410856838744, + "grad_norm": 0.26553598046302795, + "learning_rate": 9.271792584333305e-11, + "loss": 0.1724, + "step": 18536 + }, + { + "epoch": 4.932676955827568, + "grad_norm": 0.4209882616996765, + "learning_rate": 9.199226360264712e-11, + "loss": 0.1832, + "step": 18537 + }, + { + "epoch": 4.932943054816391, + "grad_norm": 0.2977527379989624, + "learning_rate": 9.126945094135585e-11, + "loss": 0.1882, + "step": 18538 + }, + { + "epoch": 4.933209153805215, + "grad_norm": 0.2928573489189148, + "learning_rate": 9.054948788008721e-11, + "loss": 0.1923, + "step": 18539 + }, + { + "epoch": 4.9334752527940395, + "grad_norm": 0.2784872055053711, + "learning_rate": 8.98323744393803e-11, + "loss": 0.1851, + "step": 18540 + }, + { + "epoch": 4.933741351782864, + "grad_norm": 0.3575987219810486, + "learning_rate": 8.911811063967433e-11, + "loss": 0.1868, + "step": 18541 + }, + { + "epoch": 4.934007450771687, + "grad_norm": 0.41144099831581116, + "learning_rate": 8.840669650135302e-11, + "loss": 0.1862, + "step": 18542 + }, + { + "epoch": 4.934273549760511, + "grad_norm": 0.32067063450813293, + "learning_rate": 8.769813204470012e-11, + "loss": 0.1759, + "step": 18543 + }, + { + "epoch": 4.934539648749334, + "grad_norm": 0.27424779534339905, + "learning_rate": 8.699241728992168e-11, + "loss": 0.1729, + "step": 18544 + }, + { + "epoch": 4.934805747738158, + "grad_norm": 0.28786060214042664, + "learning_rate": 8.628955225715717e-11, + "loss": 0.165, + "step": 18545 + }, + { + "epoch": 4.9350718467269825, + "grad_norm": 0.41609182953834534, + "learning_rate": 8.558953696645721e-11, + "loss": 0.1705, + "step": 18546 + }, + { + "epoch": 4.935337945715807, + "grad_norm": 0.39428138732910156, + "learning_rate": 8.48923714377725e-11, + "loss": 0.1756, + "step": 18547 + }, + { + "epoch": 4.93560404470463, + "grad_norm": 0.2674691379070282, + "learning_rate": 8.419805569098715e-11, + "loss": 0.1556, + "step": 18548 + }, + { + "epoch": 4.935870143693454, + "grad_norm": 0.26995381712913513, + "learning_rate": 8.350658974591862e-11, + "loss": 0.1767, + "step": 18549 + }, + { + "epoch": 4.936136242682278, + "grad_norm": 0.28597065806388855, + "learning_rate": 8.281797362228449e-11, + "loss": 0.1987, + "step": 18550 + }, + { + "epoch": 4.936402341671101, + "grad_norm": 0.2556612491607666, + "learning_rate": 8.21322073397246e-11, + "loss": 0.1491, + "step": 18551 + }, + { + "epoch": 4.9366684406599255, + "grad_norm": 0.2942815124988556, + "learning_rate": 8.144929091778996e-11, + "loss": 0.1787, + "step": 18552 + }, + { + "epoch": 4.93693453964875, + "grad_norm": 0.43943771719932556, + "learning_rate": 8.07692243759761e-11, + "loss": 0.1761, + "step": 18553 + }, + { + "epoch": 4.937200638637573, + "grad_norm": 0.3743462860584259, + "learning_rate": 8.009200773365643e-11, + "loss": 0.1767, + "step": 18554 + }, + { + "epoch": 4.937466737626397, + "grad_norm": 0.3602737486362457, + "learning_rate": 7.941764101017101e-11, + "loss": 0.1708, + "step": 18555 + }, + { + "epoch": 4.937732836615221, + "grad_norm": 0.28699004650115967, + "learning_rate": 7.874612422473781e-11, + "loss": 0.1902, + "step": 18556 + }, + { + "epoch": 4.937998935604044, + "grad_norm": 0.41624000668525696, + "learning_rate": 7.807745739650818e-11, + "loss": 0.1909, + "step": 18557 + }, + { + "epoch": 4.9382650345928685, + "grad_norm": 0.3430819511413574, + "learning_rate": 7.741164054456683e-11, + "loss": 0.1646, + "step": 18558 + }, + { + "epoch": 4.938531133581693, + "grad_norm": 0.2826721966266632, + "learning_rate": 7.674867368789862e-11, + "loss": 0.1864, + "step": 18559 + }, + { + "epoch": 4.938797232570517, + "grad_norm": 0.271709144115448, + "learning_rate": 7.60885568454217e-11, + "loss": 0.1723, + "step": 18560 + }, + { + "epoch": 4.93906333155934, + "grad_norm": 0.3476875126361847, + "learning_rate": 7.543129003594329e-11, + "loss": 0.1921, + "step": 18561 + }, + { + "epoch": 4.939329430548164, + "grad_norm": 0.2631318271160126, + "learning_rate": 7.477687327821502e-11, + "loss": 0.153, + "step": 18562 + }, + { + "epoch": 4.939595529536987, + "grad_norm": 0.7177987098693848, + "learning_rate": 7.412530659091088e-11, + "loss": 0.1893, + "step": 18563 + }, + { + "epoch": 4.9398616285258115, + "grad_norm": 0.3550770878791809, + "learning_rate": 7.347658999262706e-11, + "loss": 0.1662, + "step": 18564 + }, + { + "epoch": 4.940127727514636, + "grad_norm": 0.285680890083313, + "learning_rate": 7.283072350183773e-11, + "loss": 0.1629, + "step": 18565 + }, + { + "epoch": 4.94039382650346, + "grad_norm": 0.35232532024383545, + "learning_rate": 7.218770713698363e-11, + "loss": 0.1765, + "step": 18566 + }, + { + "epoch": 4.940659925492283, + "grad_norm": 0.2755294144153595, + "learning_rate": 7.154754091639459e-11, + "loss": 0.1614, + "step": 18567 + }, + { + "epoch": 4.940926024481107, + "grad_norm": 0.33812496066093445, + "learning_rate": 7.091022485833375e-11, + "loss": 0.1849, + "step": 18568 + }, + { + "epoch": 4.941192123469931, + "grad_norm": 0.3459889590740204, + "learning_rate": 7.027575898097548e-11, + "loss": 0.1711, + "step": 18569 + }, + { + "epoch": 4.941458222458754, + "grad_norm": 0.2713136672973633, + "learning_rate": 6.964414330242752e-11, + "loss": 0.1631, + "step": 18570 + }, + { + "epoch": 4.941724321447579, + "grad_norm": 0.26693248748779297, + "learning_rate": 6.901537784069766e-11, + "loss": 0.158, + "step": 18571 + }, + { + "epoch": 4.941990420436403, + "grad_norm": 0.2874147295951843, + "learning_rate": 6.838946261371603e-11, + "loss": 0.1693, + "step": 18572 + }, + { + "epoch": 4.942256519425226, + "grad_norm": 0.3371695280075073, + "learning_rate": 6.7766397639335e-11, + "loss": 0.1908, + "step": 18573 + }, + { + "epoch": 4.94252261841405, + "grad_norm": 0.26242950558662415, + "learning_rate": 6.714618293532926e-11, + "loss": 0.1676, + "step": 18574 + }, + { + "epoch": 4.942788717402874, + "grad_norm": 0.28327077627182007, + "learning_rate": 6.652881851938463e-11, + "loss": 0.1747, + "step": 18575 + }, + { + "epoch": 4.943054816391697, + "grad_norm": 0.34591272473335266, + "learning_rate": 6.591430440912038e-11, + "loss": 0.1831, + "step": 18576 + }, + { + "epoch": 4.9433209153805215, + "grad_norm": 0.2552053928375244, + "learning_rate": 6.530264062205582e-11, + "loss": 0.1598, + "step": 18577 + }, + { + "epoch": 4.943587014369346, + "grad_norm": 0.4388851225376129, + "learning_rate": 6.469382717563254e-11, + "loss": 0.1744, + "step": 18578 + }, + { + "epoch": 4.943853113358169, + "grad_norm": 0.30396947264671326, + "learning_rate": 6.408786408721445e-11, + "loss": 0.1782, + "step": 18579 + }, + { + "epoch": 4.944119212346993, + "grad_norm": 0.33897292613983154, + "learning_rate": 6.348475137409881e-11, + "loss": 0.17, + "step": 18580 + }, + { + "epoch": 4.944385311335817, + "grad_norm": 0.3761938512325287, + "learning_rate": 6.288448905347188e-11, + "loss": 0.1835, + "step": 18581 + }, + { + "epoch": 4.94465141032464, + "grad_norm": 0.23860996961593628, + "learning_rate": 6.22870771424644e-11, + "loss": 0.1482, + "step": 18582 + }, + { + "epoch": 4.9449175093134645, + "grad_norm": 0.40245386958122253, + "learning_rate": 6.169251565811828e-11, + "loss": 0.1875, + "step": 18583 + }, + { + "epoch": 4.945183608302289, + "grad_norm": 0.2983627915382385, + "learning_rate": 6.110080461737554e-11, + "loss": 0.1732, + "step": 18584 + }, + { + "epoch": 4.945449707291113, + "grad_norm": 0.2880597710609436, + "learning_rate": 6.051194403713378e-11, + "loss": 0.1833, + "step": 18585 + }, + { + "epoch": 4.945715806279936, + "grad_norm": 0.2560053765773773, + "learning_rate": 5.992593393416845e-11, + "loss": 0.1774, + "step": 18586 + }, + { + "epoch": 4.94598190526876, + "grad_norm": 0.25465115904808044, + "learning_rate": 5.934277432521062e-11, + "loss": 0.1739, + "step": 18587 + }, + { + "epoch": 4.946248004257583, + "grad_norm": 0.35534077882766724, + "learning_rate": 5.876246522688033e-11, + "loss": 0.1831, + "step": 18588 + }, + { + "epoch": 4.9465141032464075, + "grad_norm": 0.30138057470321655, + "learning_rate": 5.8185006655742106e-11, + "loss": 0.1693, + "step": 18589 + }, + { + "epoch": 4.946780202235232, + "grad_norm": 0.37775564193725586, + "learning_rate": 5.761039862826056e-11, + "loss": 0.194, + "step": 18590 + }, + { + "epoch": 4.947046301224056, + "grad_norm": 0.2718472480773926, + "learning_rate": 5.7038641160811474e-11, + "loss": 0.1668, + "step": 18591 + }, + { + "epoch": 4.947312400212879, + "grad_norm": 0.28081247210502625, + "learning_rate": 5.646973426971513e-11, + "loss": 0.1836, + "step": 18592 + }, + { + "epoch": 4.947578499201703, + "grad_norm": 0.27363449335098267, + "learning_rate": 5.590367797120299e-11, + "loss": 0.1636, + "step": 18593 + }, + { + "epoch": 4.947844598190527, + "grad_norm": 0.3497486114501953, + "learning_rate": 5.5340472281417695e-11, + "loss": 0.1925, + "step": 18594 + }, + { + "epoch": 4.9481106971793505, + "grad_norm": 0.28743869066238403, + "learning_rate": 5.4780117216413067e-11, + "loss": 0.166, + "step": 18595 + }, + { + "epoch": 4.948376796168175, + "grad_norm": 0.2699221074581146, + "learning_rate": 5.4222612792187425e-11, + "loss": 0.1776, + "step": 18596 + }, + { + "epoch": 4.948642895156999, + "grad_norm": 0.30537256598472595, + "learning_rate": 5.366795902461696e-11, + "loss": 0.1887, + "step": 18597 + }, + { + "epoch": 4.948908994145822, + "grad_norm": 0.36888161301612854, + "learning_rate": 5.3116155929544546e-11, + "loss": 0.1844, + "step": 18598 + }, + { + "epoch": 4.949175093134646, + "grad_norm": 0.2770610749721527, + "learning_rate": 5.256720352270205e-11, + "loss": 0.1651, + "step": 18599 + }, + { + "epoch": 4.94944119212347, + "grad_norm": 0.2745817005634308, + "learning_rate": 5.202110181974362e-11, + "loss": 0.161, + "step": 18600 + }, + { + "epoch": 4.9497072911122935, + "grad_norm": 0.3383483290672302, + "learning_rate": 5.147785083624567e-11, + "loss": 0.1983, + "step": 18601 + }, + { + "epoch": 4.949973390101118, + "grad_norm": 0.28749939799308777, + "learning_rate": 5.093745058771803e-11, + "loss": 0.1544, + "step": 18602 + }, + { + "epoch": 4.950239489089942, + "grad_norm": 0.30589210987091064, + "learning_rate": 5.039990108954839e-11, + "loss": 0.1639, + "step": 18603 + }, + { + "epoch": 4.950505588078765, + "grad_norm": 0.48245182633399963, + "learning_rate": 4.986520235708003e-11, + "loss": 0.2037, + "step": 18604 + }, + { + "epoch": 4.950771687067589, + "grad_norm": 0.3796653747558594, + "learning_rate": 4.933335440556741e-11, + "loss": 0.2023, + "step": 18605 + }, + { + "epoch": 4.951037786056413, + "grad_norm": 0.2857265770435333, + "learning_rate": 4.880435725017618e-11, + "loss": 0.1797, + "step": 18606 + }, + { + "epoch": 4.951303885045236, + "grad_norm": 0.31703928112983704, + "learning_rate": 4.8278210905994266e-11, + "loss": 0.1789, + "step": 18607 + }, + { + "epoch": 4.951569984034061, + "grad_norm": 0.2877640426158905, + "learning_rate": 4.775491538803189e-11, + "loss": 0.1533, + "step": 18608 + }, + { + "epoch": 4.951836083022885, + "grad_norm": 0.31015530228614807, + "learning_rate": 4.723447071121045e-11, + "loss": 0.2018, + "step": 18609 + }, + { + "epoch": 4.952102182011709, + "grad_norm": 0.2760695815086365, + "learning_rate": 4.6716876890373626e-11, + "loss": 0.1668, + "step": 18610 + }, + { + "epoch": 4.952368281000532, + "grad_norm": 0.292053759098053, + "learning_rate": 4.620213394028738e-11, + "loss": 0.1877, + "step": 18611 + }, + { + "epoch": 4.952634379989356, + "grad_norm": 0.34635162353515625, + "learning_rate": 4.569024187563997e-11, + "loss": 0.1923, + "step": 18612 + }, + { + "epoch": 4.952900478978179, + "grad_norm": 0.3321636915206909, + "learning_rate": 4.518120071101972e-11, + "loss": 0.1868, + "step": 18613 + }, + { + "epoch": 4.9531665779670035, + "grad_norm": 0.3359609544277191, + "learning_rate": 4.467501046093725e-11, + "loss": 0.1819, + "step": 18614 + }, + { + "epoch": 4.953432676955828, + "grad_norm": 0.3860275447368622, + "learning_rate": 4.417167113985876e-11, + "loss": 0.1699, + "step": 18615 + }, + { + "epoch": 4.953698775944652, + "grad_norm": 0.4048386514186859, + "learning_rate": 4.367118276211723e-11, + "loss": 0.1902, + "step": 18616 + }, + { + "epoch": 4.953964874933475, + "grad_norm": 0.2796434760093689, + "learning_rate": 4.3173545341990136e-11, + "loss": 0.1995, + "step": 18617 + }, + { + "epoch": 4.954230973922299, + "grad_norm": 0.46522578597068787, + "learning_rate": 4.2678758893677224e-11, + "loss": 0.1868, + "step": 18618 + }, + { + "epoch": 4.954497072911123, + "grad_norm": 1.0748478174209595, + "learning_rate": 4.2186823431300534e-11, + "loss": 0.1769, + "step": 18619 + }, + { + "epoch": 4.9547631718999465, + "grad_norm": 0.3208196759223938, + "learning_rate": 4.1697738968871076e-11, + "loss": 0.1862, + "step": 18620 + }, + { + "epoch": 4.955029270888771, + "grad_norm": 0.30393004417419434, + "learning_rate": 4.121150552034436e-11, + "loss": 0.1893, + "step": 18621 + }, + { + "epoch": 4.955295369877595, + "grad_norm": 0.3379041850566864, + "learning_rate": 4.072812309959817e-11, + "loss": 0.1731, + "step": 18622 + }, + { + "epoch": 4.955561468866418, + "grad_norm": 0.3250897228717804, + "learning_rate": 4.024759172041037e-11, + "loss": 0.2059, + "step": 18623 + }, + { + "epoch": 4.955827567855242, + "grad_norm": 0.2758863866329193, + "learning_rate": 3.976991139648112e-11, + "loss": 0.1887, + "step": 18624 + }, + { + "epoch": 4.956093666844066, + "grad_norm": 0.3754667043685913, + "learning_rate": 3.929508214145505e-11, + "loss": 0.1756, + "step": 18625 + }, + { + "epoch": 4.9563597658328895, + "grad_norm": 0.27201274037361145, + "learning_rate": 3.882310396885469e-11, + "loss": 0.1723, + "step": 18626 + }, + { + "epoch": 4.956625864821714, + "grad_norm": 0.28785771131515503, + "learning_rate": 3.8353976892158134e-11, + "loss": 0.1544, + "step": 18627 + }, + { + "epoch": 4.956891963810538, + "grad_norm": 0.3234238624572754, + "learning_rate": 3.788770092473248e-11, + "loss": 0.179, + "step": 18628 + }, + { + "epoch": 4.957158062799361, + "grad_norm": 1.921303391456604, + "learning_rate": 3.7424276079878193e-11, + "loss": 0.1654, + "step": 18629 + }, + { + "epoch": 4.957424161788185, + "grad_norm": 0.28772813081741333, + "learning_rate": 3.696370237081803e-11, + "loss": 0.1787, + "step": 18630 + }, + { + "epoch": 4.957690260777009, + "grad_norm": 0.3077390193939209, + "learning_rate": 3.6505979810697026e-11, + "loss": 0.1652, + "step": 18631 + }, + { + "epoch": 4.9579563597658325, + "grad_norm": 0.27201101183891296, + "learning_rate": 3.6051108412549216e-11, + "loss": 0.1662, + "step": 18632 + }, + { + "epoch": 4.958222458754657, + "grad_norm": 0.3125036954879761, + "learning_rate": 3.5599088189375294e-11, + "loss": 0.1615, + "step": 18633 + }, + { + "epoch": 4.958488557743481, + "grad_norm": 0.29229500889778137, + "learning_rate": 3.514991915404275e-11, + "loss": 0.1892, + "step": 18634 + }, + { + "epoch": 4.958754656732305, + "grad_norm": 0.32356885075569153, + "learning_rate": 3.4703601319374665e-11, + "loss": 0.1636, + "step": 18635 + }, + { + "epoch": 4.959020755721128, + "grad_norm": 0.26944828033447266, + "learning_rate": 3.426013469809419e-11, + "loss": 0.1619, + "step": 18636 + }, + { + "epoch": 4.959286854709952, + "grad_norm": 0.34966418147087097, + "learning_rate": 3.3819519302857867e-11, + "loss": 0.1646, + "step": 18637 + }, + { + "epoch": 4.9595529536987755, + "grad_norm": 0.33312201499938965, + "learning_rate": 3.3381755146233426e-11, + "loss": 0.1742, + "step": 18638 + }, + { + "epoch": 4.9598190526876, + "grad_norm": 0.3226145803928375, + "learning_rate": 3.294684224069977e-11, + "loss": 0.1739, + "step": 18639 + }, + { + "epoch": 4.960085151676424, + "grad_norm": 0.36116236448287964, + "learning_rate": 3.251478059866919e-11, + "loss": 0.1837, + "step": 18640 + }, + { + "epoch": 4.960351250665248, + "grad_norm": 0.2547573149204254, + "learning_rate": 3.208557023245406e-11, + "loss": 0.1695, + "step": 18641 + }, + { + "epoch": 4.960617349654071, + "grad_norm": 0.27930012345314026, + "learning_rate": 3.165921115431125e-11, + "loss": 0.1763, + "step": 18642 + }, + { + "epoch": 4.960883448642895, + "grad_norm": 0.2957213222980499, + "learning_rate": 3.123570337638659e-11, + "loss": 0.1552, + "step": 18643 + }, + { + "epoch": 4.961149547631719, + "grad_norm": 0.3202536404132843, + "learning_rate": 3.081504691077041e-11, + "loss": 0.1699, + "step": 18644 + }, + { + "epoch": 4.961415646620543, + "grad_norm": 0.2589651942253113, + "learning_rate": 3.039724176945313e-11, + "loss": 0.1593, + "step": 18645 + }, + { + "epoch": 4.961681745609367, + "grad_norm": 0.4381062090396881, + "learning_rate": 2.998228796434743e-11, + "loss": 0.1835, + "step": 18646 + }, + { + "epoch": 4.961947844598191, + "grad_norm": 0.2859187126159668, + "learning_rate": 2.9570185507310494e-11, + "loss": 0.1837, + "step": 18647 + }, + { + "epoch": 4.962213943587014, + "grad_norm": 0.2842218577861786, + "learning_rate": 2.9160934410066286e-11, + "loss": 0.163, + "step": 18648 + }, + { + "epoch": 4.962480042575838, + "grad_norm": 0.2883981764316559, + "learning_rate": 2.8754534684316544e-11, + "loss": 0.1707, + "step": 18649 + }, + { + "epoch": 4.962746141564662, + "grad_norm": 0.26798832416534424, + "learning_rate": 2.83509863416298e-11, + "loss": 0.159, + "step": 18650 + }, + { + "epoch": 4.9630122405534856, + "grad_norm": 0.3567732274532318, + "learning_rate": 2.7950289393530168e-11, + "loss": 0.2005, + "step": 18651 + }, + { + "epoch": 4.96327833954231, + "grad_norm": 0.2929176092147827, + "learning_rate": 2.7552443851441842e-11, + "loss": 0.1788, + "step": 18652 + }, + { + "epoch": 4.963544438531134, + "grad_norm": 0.37393122911453247, + "learning_rate": 2.71574497267113e-11, + "loss": 0.1955, + "step": 18653 + }, + { + "epoch": 4.963810537519957, + "grad_norm": 0.2582618296146393, + "learning_rate": 2.6765307030596206e-11, + "loss": 0.1654, + "step": 18654 + }, + { + "epoch": 4.964076636508781, + "grad_norm": 0.2901797592639923, + "learning_rate": 2.637601577429871e-11, + "loss": 0.1829, + "step": 18655 + }, + { + "epoch": 4.964342735497605, + "grad_norm": 0.3289097249507904, + "learning_rate": 2.5989575968909938e-11, + "loss": 0.1876, + "step": 18656 + }, + { + "epoch": 4.9646088344864285, + "grad_norm": 0.37405675649642944, + "learning_rate": 2.5605987625454406e-11, + "loss": 0.1931, + "step": 18657 + }, + { + "epoch": 4.964874933475253, + "grad_norm": 0.25279247760772705, + "learning_rate": 2.522525075487891e-11, + "loss": 0.1624, + "step": 18658 + }, + { + "epoch": 4.965141032464077, + "grad_norm": 0.2702253758907318, + "learning_rate": 2.4847365368030336e-11, + "loss": 0.1643, + "step": 18659 + }, + { + "epoch": 4.965407131452901, + "grad_norm": 0.31509116291999817, + "learning_rate": 2.4472331475700047e-11, + "loss": 0.1908, + "step": 18660 + }, + { + "epoch": 4.965673230441724, + "grad_norm": 0.28254806995391846, + "learning_rate": 2.410014908857949e-11, + "loss": 0.1794, + "step": 18661 + }, + { + "epoch": 4.965939329430548, + "grad_norm": 0.3727607727050781, + "learning_rate": 2.37308182172824e-11, + "loss": 0.1794, + "step": 18662 + }, + { + "epoch": 4.9662054284193715, + "grad_norm": 0.35256415605545044, + "learning_rate": 2.336433887233369e-11, + "loss": 0.1924, + "step": 18663 + }, + { + "epoch": 4.966471527408196, + "grad_norm": 0.2717290222644806, + "learning_rate": 2.300071106420276e-11, + "loss": 0.1722, + "step": 18664 + }, + { + "epoch": 4.96673762639702, + "grad_norm": 0.2629311680793762, + "learning_rate": 2.2639934803259098e-11, + "loss": 0.1552, + "step": 18665 + }, + { + "epoch": 4.967003725385844, + "grad_norm": 0.2746181786060333, + "learning_rate": 2.2282010099794468e-11, + "loss": 0.1617, + "step": 18666 + }, + { + "epoch": 4.967269824374667, + "grad_norm": 0.5464704036712646, + "learning_rate": 2.192693696400072e-11, + "loss": 0.174, + "step": 18667 + }, + { + "epoch": 4.967535923363491, + "grad_norm": 0.34081265330314636, + "learning_rate": 2.157471540601419e-11, + "loss": 0.171, + "step": 18668 + }, + { + "epoch": 4.967802022352315, + "grad_norm": 0.3542974591255188, + "learning_rate": 2.1225345435882393e-11, + "loss": 0.1942, + "step": 18669 + }, + { + "epoch": 4.968068121341139, + "grad_norm": 0.2777623236179352, + "learning_rate": 2.0878827063575132e-11, + "loss": 0.159, + "step": 18670 + }, + { + "epoch": 4.968334220329963, + "grad_norm": 0.29530197381973267, + "learning_rate": 2.0535160298973397e-11, + "loss": 0.1848, + "step": 18671 + }, + { + "epoch": 4.968600319318787, + "grad_norm": 0.2807449400424957, + "learning_rate": 2.0194345151869353e-11, + "loss": 0.1813, + "step": 18672 + }, + { + "epoch": 4.96886641830761, + "grad_norm": 0.44237884879112244, + "learning_rate": 1.985638163199965e-11, + "loss": 0.1995, + "step": 18673 + }, + { + "epoch": 4.969132517296434, + "grad_norm": 0.3102456331253052, + "learning_rate": 1.9521269748989934e-11, + "loss": 0.1873, + "step": 18674 + }, + { + "epoch": 4.969398616285258, + "grad_norm": 0.3858606219291687, + "learning_rate": 1.9189009512410314e-11, + "loss": 0.1842, + "step": 18675 + }, + { + "epoch": 4.969664715274082, + "grad_norm": 0.2761248052120209, + "learning_rate": 1.88596009317199e-11, + "loss": 0.1719, + "step": 18676 + }, + { + "epoch": 4.969930814262906, + "grad_norm": 0.3379594385623932, + "learning_rate": 1.8533044016333378e-11, + "loss": 0.1616, + "step": 18677 + }, + { + "epoch": 4.97019691325173, + "grad_norm": 0.3958843946456909, + "learning_rate": 1.8209338775554416e-11, + "loss": 0.1803, + "step": 18678 + }, + { + "epoch": 4.970463012240554, + "grad_norm": 0.2612419128417969, + "learning_rate": 1.7888485218620075e-11, + "loss": 0.1709, + "step": 18679 + }, + { + "epoch": 4.970729111229377, + "grad_norm": 0.27320611476898193, + "learning_rate": 1.7570483354667486e-11, + "loss": 0.1738, + "step": 18680 + }, + { + "epoch": 4.970995210218201, + "grad_norm": 0.3979911506175995, + "learning_rate": 1.7255333192789377e-11, + "loss": 0.1703, + "step": 18681 + }, + { + "epoch": 4.971261309207025, + "grad_norm": 0.28522059321403503, + "learning_rate": 1.6943034741967453e-11, + "loss": 0.186, + "step": 18682 + }, + { + "epoch": 4.971527408195849, + "grad_norm": 0.31809934973716736, + "learning_rate": 1.66335880110946e-11, + "loss": 0.1852, + "step": 18683 + }, + { + "epoch": 4.971793507184673, + "grad_norm": 0.2659786641597748, + "learning_rate": 1.6326993009019296e-11, + "loss": 0.1719, + "step": 18684 + }, + { + "epoch": 4.972059606173497, + "grad_norm": 0.3144432008266449, + "learning_rate": 1.6023249744467894e-11, + "loss": 0.1753, + "step": 18685 + }, + { + "epoch": 4.97232570516232, + "grad_norm": 0.29913952946662903, + "learning_rate": 1.5722358226111232e-11, + "loss": 0.1924, + "step": 18686 + }, + { + "epoch": 4.972591804151144, + "grad_norm": 0.3161860704421997, + "learning_rate": 1.5424318462531337e-11, + "loss": 0.1559, + "step": 18687 + }, + { + "epoch": 4.9728579031399684, + "grad_norm": 0.25867733359336853, + "learning_rate": 1.5129130462221417e-11, + "loss": 0.1548, + "step": 18688 + }, + { + "epoch": 4.973124002128792, + "grad_norm": 0.3759869933128357, + "learning_rate": 1.4836794233619164e-11, + "loss": 0.181, + "step": 18689 + }, + { + "epoch": 4.973390101117616, + "grad_norm": 0.3214949071407318, + "learning_rate": 1.4547309785040151e-11, + "loss": 0.1724, + "step": 18690 + }, + { + "epoch": 4.97365620010644, + "grad_norm": 0.2893395721912384, + "learning_rate": 1.426067712476664e-11, + "loss": 0.1675, + "step": 18691 + }, + { + "epoch": 4.973922299095263, + "grad_norm": 0.4426876902580261, + "learning_rate": 1.3976896260947667e-11, + "loss": 0.1928, + "step": 18692 + }, + { + "epoch": 4.974188398084087, + "grad_norm": 0.28071337938308716, + "learning_rate": 1.3695967201687864e-11, + "loss": 0.1739, + "step": 18693 + }, + { + "epoch": 4.974454497072911, + "grad_norm": 0.39757540822029114, + "learning_rate": 1.3417889955014139e-11, + "loss": 0.179, + "step": 18694 + }, + { + "epoch": 4.974720596061735, + "grad_norm": 0.5117313265800476, + "learning_rate": 1.3142664528831283e-11, + "loss": 0.1857, + "step": 18695 + }, + { + "epoch": 4.974986695050559, + "grad_norm": 0.3175498843193054, + "learning_rate": 1.2870290931010775e-11, + "loss": 0.1823, + "step": 18696 + }, + { + "epoch": 4.975252794039383, + "grad_norm": 0.26949429512023926, + "learning_rate": 1.2600769169301972e-11, + "loss": 0.17, + "step": 18697 + }, + { + "epoch": 4.975518893028206, + "grad_norm": 0.37763288617134094, + "learning_rate": 1.2334099251409825e-11, + "loss": 0.1977, + "step": 18698 + }, + { + "epoch": 4.97578499201703, + "grad_norm": 0.3287433087825775, + "learning_rate": 1.2070281184939357e-11, + "loss": 0.1845, + "step": 18699 + }, + { + "epoch": 4.976051091005854, + "grad_norm": 0.260658860206604, + "learning_rate": 1.1809314977406781e-11, + "loss": 0.1753, + "step": 18700 + }, + { + "epoch": 4.976317189994678, + "grad_norm": 0.3720853626728058, + "learning_rate": 1.155120063625059e-11, + "loss": 0.1967, + "step": 18701 + }, + { + "epoch": 4.976583288983502, + "grad_norm": 0.3534468710422516, + "learning_rate": 1.129593816885377e-11, + "loss": 0.1921, + "step": 18702 + }, + { + "epoch": 4.976849387972326, + "grad_norm": 0.3740658760070801, + "learning_rate": 1.1043527582466072e-11, + "loss": 0.1779, + "step": 18703 + }, + { + "epoch": 4.97711548696115, + "grad_norm": 0.30281928181648254, + "learning_rate": 1.0793968884315052e-11, + "loss": 0.1794, + "step": 18704 + }, + { + "epoch": 4.977381585949973, + "grad_norm": 0.32333117723464966, + "learning_rate": 1.0547262081506136e-11, + "loss": 0.1733, + "step": 18705 + }, + { + "epoch": 4.977647684938797, + "grad_norm": 0.2744999825954437, + "learning_rate": 1.030340718108924e-11, + "loss": 0.176, + "step": 18706 + }, + { + "epoch": 4.977913783927621, + "grad_norm": 0.2907017767429352, + "learning_rate": 1.0062404189992157e-11, + "loss": 0.1891, + "step": 18707 + }, + { + "epoch": 4.978179882916445, + "grad_norm": 0.2772831916809082, + "learning_rate": 9.82425311510937e-12, + "loss": 0.1828, + "step": 18708 + }, + { + "epoch": 4.978445981905269, + "grad_norm": 0.2791103720664978, + "learning_rate": 9.588953963224345e-12, + "loss": 0.1629, + "step": 18709 + }, + { + "epoch": 4.978712080894093, + "grad_norm": 0.301422119140625, + "learning_rate": 9.35650674106503e-12, + "loss": 0.1646, + "step": 18710 + }, + { + "epoch": 4.978978179882916, + "grad_norm": 0.277567058801651, + "learning_rate": 9.126911455237251e-12, + "loss": 0.1786, + "step": 18711 + }, + { + "epoch": 4.97924427887174, + "grad_norm": 0.2900044322013855, + "learning_rate": 8.900168112302431e-12, + "loss": 0.1686, + "step": 18712 + }, + { + "epoch": 4.9795103778605645, + "grad_norm": 0.41271620988845825, + "learning_rate": 8.676276718733166e-12, + "loss": 0.1999, + "step": 18713 + }, + { + "epoch": 4.979776476849388, + "grad_norm": 0.3548508286476135, + "learning_rate": 8.455237280902139e-12, + "loss": 0.1652, + "step": 18714 + }, + { + "epoch": 4.980042575838212, + "grad_norm": 0.2698659300804138, + "learning_rate": 8.237049805126517e-12, + "loss": 0.1819, + "step": 18715 + }, + { + "epoch": 4.980308674827036, + "grad_norm": 0.2941203713417053, + "learning_rate": 8.021714297612448e-12, + "loss": 0.1886, + "step": 18716 + }, + { + "epoch": 4.980574773815859, + "grad_norm": 0.31089070439338684, + "learning_rate": 7.809230764521668e-12, + "loss": 0.1626, + "step": 18717 + }, + { + "epoch": 4.980840872804683, + "grad_norm": 0.4540148675441742, + "learning_rate": 7.599599211904894e-12, + "loss": 0.1664, + "step": 18718 + }, + { + "epoch": 4.9811069717935075, + "grad_norm": 0.28815820813179016, + "learning_rate": 7.392819645735127e-12, + "loss": 0.1797, + "step": 18719 + }, + { + "epoch": 4.981373070782331, + "grad_norm": 0.328022301197052, + "learning_rate": 7.188892071929853e-12, + "loss": 0.1806, + "step": 18720 + }, + { + "epoch": 4.981639169771155, + "grad_norm": 0.27734988927841187, + "learning_rate": 6.9878164962844376e-12, + "loss": 0.1677, + "step": 18721 + }, + { + "epoch": 4.981905268759979, + "grad_norm": 0.279407799243927, + "learning_rate": 6.789592924538734e-12, + "loss": 0.1795, + "step": 18722 + }, + { + "epoch": 4.982171367748802, + "grad_norm": 0.2787691354751587, + "learning_rate": 6.594221362354879e-12, + "loss": 0.1629, + "step": 18723 + }, + { + "epoch": 4.982437466737626, + "grad_norm": 0.4032391905784607, + "learning_rate": 6.401701815295091e-12, + "loss": 0.1946, + "step": 18724 + }, + { + "epoch": 4.9827035657264505, + "grad_norm": 0.4102476239204407, + "learning_rate": 6.2120342888549725e-12, + "loss": 0.175, + "step": 18725 + }, + { + "epoch": 4.982969664715274, + "grad_norm": 0.27656665444374084, + "learning_rate": 6.0252187884524134e-12, + "loss": 0.1698, + "step": 18726 + }, + { + "epoch": 4.983235763704098, + "grad_norm": 0.2732604146003723, + "learning_rate": 5.84125531940538e-12, + "loss": 0.1677, + "step": 18727 + }, + { + "epoch": 4.983501862692922, + "grad_norm": 0.4621378183364868, + "learning_rate": 5.660143886965229e-12, + "loss": 0.1987, + "step": 18728 + }, + { + "epoch": 4.983767961681746, + "grad_norm": 0.27874812483787537, + "learning_rate": 5.481884496283396e-12, + "loss": 0.1716, + "step": 18729 + }, + { + "epoch": 4.984034060670569, + "grad_norm": 0.34859994053840637, + "learning_rate": 5.306477152466904e-12, + "loss": 0.1928, + "step": 18730 + }, + { + "epoch": 4.984300159659393, + "grad_norm": 0.27074506878852844, + "learning_rate": 5.1339218605117586e-12, + "loss": 0.1723, + "step": 18731 + }, + { + "epoch": 4.984566258648217, + "grad_norm": 0.26701411604881287, + "learning_rate": 4.964218625336247e-12, + "loss": 0.1566, + "step": 18732 + }, + { + "epoch": 4.984832357637041, + "grad_norm": 0.2855263948440552, + "learning_rate": 4.7973674517698405e-12, + "loss": 0.1637, + "step": 18733 + }, + { + "epoch": 4.985098456625865, + "grad_norm": 0.32745176553726196, + "learning_rate": 4.633368344597599e-12, + "loss": 0.1771, + "step": 18734 + }, + { + "epoch": 4.985364555614689, + "grad_norm": 0.4309958219528198, + "learning_rate": 4.4722213084713574e-12, + "loss": 0.2065, + "step": 18735 + }, + { + "epoch": 4.985630654603512, + "grad_norm": 0.43677809834480286, + "learning_rate": 4.313926347998542e-12, + "loss": 0.1959, + "step": 18736 + }, + { + "epoch": 4.985896753592336, + "grad_norm": 0.26307785511016846, + "learning_rate": 4.15848346769776e-12, + "loss": 0.1717, + "step": 18737 + }, + { + "epoch": 4.9861628525811605, + "grad_norm": 0.4317682087421417, + "learning_rate": 4.0058926719988005e-12, + "loss": 0.1991, + "step": 18738 + }, + { + "epoch": 4.986428951569984, + "grad_norm": 0.27487698197364807, + "learning_rate": 3.8561539652537385e-12, + "loss": 0.1629, + "step": 18739 + }, + { + "epoch": 4.986695050558808, + "grad_norm": 0.28562793135643005, + "learning_rate": 3.709267351725831e-12, + "loss": 0.1678, + "step": 18740 + }, + { + "epoch": 4.986961149547632, + "grad_norm": 0.3685055375099182, + "learning_rate": 3.56523283561172e-12, + "loss": 0.1739, + "step": 18741 + }, + { + "epoch": 4.987227248536455, + "grad_norm": 0.35900282859802246, + "learning_rate": 3.4240504210303336e-12, + "loss": 0.189, + "step": 18742 + }, + { + "epoch": 4.987493347525279, + "grad_norm": 0.3852148652076721, + "learning_rate": 3.285720111989576e-12, + "loss": 0.1902, + "step": 18743 + }, + { + "epoch": 4.9877594465141035, + "grad_norm": 0.2839105427265167, + "learning_rate": 3.150241912441842e-12, + "loss": 0.152, + "step": 18744 + }, + { + "epoch": 4.988025545502927, + "grad_norm": 0.3073789179325104, + "learning_rate": 3.01761582626181e-12, + "loss": 0.1659, + "step": 18745 + }, + { + "epoch": 4.988291644491751, + "grad_norm": 0.2883364260196686, + "learning_rate": 2.887841857213136e-12, + "loss": 0.1678, + "step": 18746 + }, + { + "epoch": 4.988557743480575, + "grad_norm": 0.38662582635879517, + "learning_rate": 2.760920009015066e-12, + "loss": 0.1864, + "step": 18747 + }, + { + "epoch": 4.988823842469398, + "grad_norm": 0.2829833924770355, + "learning_rate": 2.6368502852758266e-12, + "loss": 0.1801, + "step": 18748 + }, + { + "epoch": 4.989089941458222, + "grad_norm": 0.2903778851032257, + "learning_rate": 2.515632689537028e-12, + "loss": 0.1725, + "step": 18749 + }, + { + "epoch": 4.9893560404470465, + "grad_norm": 0.27370133996009827, + "learning_rate": 2.3972672252625668e-12, + "loss": 0.1688, + "step": 18750 + }, + { + "epoch": 4.98962213943587, + "grad_norm": 0.2798982560634613, + "learning_rate": 2.2817538958164184e-12, + "loss": 0.165, + "step": 18751 + }, + { + "epoch": 4.989888238424694, + "grad_norm": 0.2746127247810364, + "learning_rate": 2.169092704495945e-12, + "loss": 0.1589, + "step": 18752 + }, + { + "epoch": 4.990154337413518, + "grad_norm": 0.36891743540763855, + "learning_rate": 2.0592836545318958e-12, + "loss": 0.1681, + "step": 18753 + }, + { + "epoch": 4.990420436402342, + "grad_norm": 0.27368730306625366, + "learning_rate": 1.9523267490328956e-12, + "loss": 0.1914, + "step": 18754 + }, + { + "epoch": 4.990686535391165, + "grad_norm": 0.30387938022613525, + "learning_rate": 1.848221991052057e-12, + "loss": 0.1801, + "step": 18755 + }, + { + "epoch": 4.9909526343799895, + "grad_norm": 0.26175764203071594, + "learning_rate": 1.7469693835758802e-12, + "loss": 0.1558, + "step": 18756 + }, + { + "epoch": 4.991218733368813, + "grad_norm": 0.2755529284477234, + "learning_rate": 1.6485689294909456e-12, + "loss": 0.1745, + "step": 18757 + }, + { + "epoch": 4.991484832357637, + "grad_norm": 2.504051446914673, + "learning_rate": 1.5530206315839123e-12, + "loss": 0.1904, + "step": 18758 + }, + { + "epoch": 4.991750931346461, + "grad_norm": 0.2833792567253113, + "learning_rate": 1.4603244925859292e-12, + "loss": 0.1517, + "step": 18759 + }, + { + "epoch": 4.992017030335285, + "grad_norm": 0.5066165924072266, + "learning_rate": 1.3704805151615317e-12, + "loss": 0.1898, + "step": 18760 + }, + { + "epoch": 4.992283129324108, + "grad_norm": 0.2801041603088379, + "learning_rate": 1.2834887018420281e-12, + "loss": 0.158, + "step": 18761 + }, + { + "epoch": 4.9925492283129325, + "grad_norm": 0.45320555567741394, + "learning_rate": 1.1993490551365226e-12, + "loss": 0.1975, + "step": 18762 + }, + { + "epoch": 4.992815327301757, + "grad_norm": 0.2783893048763275, + "learning_rate": 1.1180615774208924e-12, + "loss": 0.1747, + "step": 18763 + }, + { + "epoch": 4.99308142629058, + "grad_norm": 0.26777762174606323, + "learning_rate": 1.0396262710377079e-12, + "loss": 0.1701, + "step": 18764 + }, + { + "epoch": 4.993347525279404, + "grad_norm": 0.30732110142707825, + "learning_rate": 9.640431382074155e-13, + "loss": 0.2082, + "step": 18765 + }, + { + "epoch": 4.993613624268228, + "grad_norm": 0.28367841243743896, + "learning_rate": 8.913121810949498e-13, + "loss": 0.1782, + "step": 18766 + }, + { + "epoch": 4.993879723257051, + "grad_norm": 0.26548102498054504, + "learning_rate": 8.214334017653257e-13, + "loss": 0.1753, + "step": 18767 + }, + { + "epoch": 4.994145822245875, + "grad_norm": 0.3661574423313141, + "learning_rate": 7.544068022169448e-13, + "loss": 0.1723, + "step": 18768 + }, + { + "epoch": 4.9944119212347, + "grad_norm": 0.2796405851840973, + "learning_rate": 6.902323843593904e-13, + "loss": 0.1725, + "step": 18769 + }, + { + "epoch": 4.994678020223523, + "grad_norm": 0.34130504727363586, + "learning_rate": 6.289101500245309e-13, + "loss": 0.178, + "step": 18770 + }, + { + "epoch": 4.994944119212347, + "grad_norm": 0.333170086145401, + "learning_rate": 5.704401009665183e-13, + "loss": 0.1769, + "step": 18771 + }, + { + "epoch": 4.995210218201171, + "grad_norm": 0.3059298098087311, + "learning_rate": 5.148222388395851e-13, + "loss": 0.182, + "step": 18772 + }, + { + "epoch": 4.995476317189994, + "grad_norm": 0.2758291959762573, + "learning_rate": 4.620565652424524e-13, + "loss": 0.1632, + "step": 18773 + }, + { + "epoch": 4.995742416178818, + "grad_norm": 0.4011588990688324, + "learning_rate": 4.1214308168502355e-13, + "loss": 0.1821, + "step": 18774 + }, + { + "epoch": 4.9960085151676425, + "grad_norm": 0.30734893679618835, + "learning_rate": 3.650817895661795e-13, + "loss": 0.1621, + "step": 18775 + }, + { + "epoch": 4.996274614156466, + "grad_norm": 0.3268824815750122, + "learning_rate": 3.208726902625969e-13, + "loss": 0.1599, + "step": 18776 + }, + { + "epoch": 4.99654071314529, + "grad_norm": 0.47943565249443054, + "learning_rate": 2.7951578500662323e-13, + "loss": 0.2241, + "step": 18777 + }, + { + "epoch": 4.996806812134114, + "grad_norm": 0.3246813118457794, + "learning_rate": 2.410110749861971e-13, + "loss": 0.1867, + "step": 18778 + }, + { + "epoch": 4.997072911122938, + "grad_norm": 0.337116539478302, + "learning_rate": 2.0535856130043938e-13, + "loss": 0.1671, + "step": 18779 + }, + { + "epoch": 4.997339010111761, + "grad_norm": 0.4220016300678253, + "learning_rate": 1.7255824497075523e-13, + "loss": 0.1905, + "step": 18780 + }, + { + "epoch": 4.9976051091005855, + "grad_norm": 0.34873804450035095, + "learning_rate": 1.4261012692973195e-13, + "loss": 0.1763, + "step": 18781 + }, + { + "epoch": 4.997871208089409, + "grad_norm": 0.26954108476638794, + "learning_rate": 1.1551420803224132e-13, + "loss": 0.1874, + "step": 18782 + }, + { + "epoch": 4.998137307078233, + "grad_norm": 0.31235015392303467, + "learning_rate": 9.127048904433721e-14, + "loss": 0.2009, + "step": 18783 + }, + { + "epoch": 4.998403406067057, + "grad_norm": 0.2771170139312744, + "learning_rate": 6.987897066546011e-14, + "loss": 0.1645, + "step": 18784 + }, + { + "epoch": 4.998669505055881, + "grad_norm": 0.3589935600757599, + "learning_rate": 5.13396535062327e-14, + "loss": 0.1774, + "step": 18785 + }, + { + "epoch": 4.998935604044704, + "grad_norm": 0.34380587935447693, + "learning_rate": 3.5652538088459803e-14, + "loss": 0.1832, + "step": 18786 + }, + { + "epoch": 4.9992017030335285, + "grad_norm": 0.37392520904541016, + "learning_rate": 2.2817624867332853e-14, + "loss": 0.211, + "step": 18787 + }, + { + "epoch": 4.999467802022353, + "grad_norm": 0.5946944952011108, + "learning_rate": 1.2834914198123214e-14, + "loss": 0.1983, + "step": 18788 + }, + { + "epoch": 4.999733901011176, + "grad_norm": 0.2751145660877228, + "learning_rate": 5.704406380591109e-15, + "loss": 0.1803, + "step": 18789 + }, + { + "epoch": 5.0, + "grad_norm": 0.30985572934150696, + "learning_rate": 1.42610160347445e-15, + "loss": 0.1597, + "step": 18790 + }, + { + "epoch": 5.0, + "step": 18790, + "total_flos": 1.507039161664785e+21, + "train_loss": 0.10727324645972239, + "train_runtime": 209460.8021, + "train_samples_per_second": 5.741, + "train_steps_per_second": 0.09 + } + ], + "logging_steps": 1.0, + "max_steps": 18790, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.507039161664785e+21, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}