{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9979661016949153, "eval_steps": 500, "global_step": 276, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003615819209039548, "grad_norm": 14.63072256054265, "learning_rate": 0.0, "loss": 1.4293, "step": 1 }, { "epoch": 0.007231638418079096, "grad_norm": 13.703421994615557, "learning_rate": 1.7857142857142858e-07, "loss": 1.4193, "step": 2 }, { "epoch": 0.010847457627118645, "grad_norm": 13.716616968172858, "learning_rate": 3.5714285714285716e-07, "loss": 1.3691, "step": 3 }, { "epoch": 0.014463276836158192, "grad_norm": 13.81492601698403, "learning_rate": 5.357142857142857e-07, "loss": 1.4337, "step": 4 }, { "epoch": 0.01807909604519774, "grad_norm": 13.013807610008277, "learning_rate": 7.142857142857143e-07, "loss": 1.335, "step": 5 }, { "epoch": 0.02169491525423729, "grad_norm": 13.316675508543259, "learning_rate": 8.928571428571429e-07, "loss": 1.4104, "step": 6 }, { "epoch": 0.025310734463276835, "grad_norm": 12.54293399835329, "learning_rate": 1.0714285714285714e-06, "loss": 1.3364, "step": 7 }, { "epoch": 0.028926553672316384, "grad_norm": 11.706383970770567, "learning_rate": 1.25e-06, "loss": 1.3289, "step": 8 }, { "epoch": 0.03254237288135593, "grad_norm": 9.135561727877839, "learning_rate": 1.4285714285714286e-06, "loss": 1.2395, "step": 9 }, { "epoch": 0.03615819209039548, "grad_norm": 8.24371141549115, "learning_rate": 1.6071428571428574e-06, "loss": 1.2597, "step": 10 }, { "epoch": 0.03977401129943503, "grad_norm": 5.641164800490142, "learning_rate": 1.7857142857142859e-06, "loss": 1.1323, "step": 11 }, { "epoch": 0.04338983050847458, "grad_norm": 5.154352679322036, "learning_rate": 1.9642857142857144e-06, "loss": 1.1555, "step": 12 }, { "epoch": 0.04700564971751412, "grad_norm": 4.902918571287102, "learning_rate": 2.1428571428571427e-06, "loss": 1.1208, "step": 13 }, { "epoch": 0.05062146892655367, "grad_norm": 4.651473817511105, "learning_rate": 2.321428571428572e-06, "loss": 1.1211, "step": 14 }, { "epoch": 0.05423728813559322, "grad_norm": 7.855915558603105, "learning_rate": 2.5e-06, "loss": 1.0921, "step": 15 }, { "epoch": 0.05785310734463277, "grad_norm": 9.766906290977035, "learning_rate": 2.6785714285714285e-06, "loss": 1.0519, "step": 16 }, { "epoch": 0.061468926553672316, "grad_norm": 7.745681824751476, "learning_rate": 2.8571428571428573e-06, "loss": 1.0559, "step": 17 }, { "epoch": 0.06508474576271187, "grad_norm": 4.81594013217655, "learning_rate": 3.0357142857142856e-06, "loss": 1.0629, "step": 18 }, { "epoch": 0.06870056497175141, "grad_norm": 4.091417378072179, "learning_rate": 3.2142857142857147e-06, "loss": 1.0244, "step": 19 }, { "epoch": 0.07231638418079096, "grad_norm": 2.8101858455670237, "learning_rate": 3.3928571428571435e-06, "loss": 0.9697, "step": 20 }, { "epoch": 0.0759322033898305, "grad_norm": 3.180728091735543, "learning_rate": 3.5714285714285718e-06, "loss": 0.9601, "step": 21 }, { "epoch": 0.07954802259887006, "grad_norm": 3.6493251911677658, "learning_rate": 3.7500000000000005e-06, "loss": 0.9409, "step": 22 }, { "epoch": 0.0831638418079096, "grad_norm": 3.415956781592975, "learning_rate": 3.928571428571429e-06, "loss": 0.9486, "step": 23 }, { "epoch": 0.08677966101694916, "grad_norm": 3.0472462438826, "learning_rate": 4.107142857142857e-06, "loss": 0.8874, "step": 24 }, { "epoch": 0.0903954802259887, "grad_norm": 2.264144754905512, "learning_rate": 4.2857142857142855e-06, "loss": 0.8994, "step": 25 }, { "epoch": 0.09401129943502824, "grad_norm": 2.191754581281806, "learning_rate": 4.464285714285715e-06, "loss": 0.8975, "step": 26 }, { "epoch": 0.0976271186440678, "grad_norm": 2.3668184420746385, "learning_rate": 4.642857142857144e-06, "loss": 0.8581, "step": 27 }, { "epoch": 0.10124293785310734, "grad_norm": 2.7515269578252193, "learning_rate": 4.821428571428572e-06, "loss": 0.8423, "step": 28 }, { "epoch": 0.1048587570621469, "grad_norm": 2.3735525312849837, "learning_rate": 5e-06, "loss": 0.8205, "step": 29 }, { "epoch": 0.10847457627118644, "grad_norm": 2.032163014842967, "learning_rate": 4.999799414013322e-06, "loss": 0.8754, "step": 30 }, { "epoch": 0.112090395480226, "grad_norm": 2.0590504400576997, "learning_rate": 4.999197688241076e-06, "loss": 0.8267, "step": 31 }, { "epoch": 0.11570621468926554, "grad_norm": 2.092420106610846, "learning_rate": 4.998194919241471e-06, "loss": 0.8312, "step": 32 }, { "epoch": 0.11932203389830509, "grad_norm": 1.9381311525751297, "learning_rate": 4.996791267927632e-06, "loss": 0.7878, "step": 33 }, { "epoch": 0.12293785310734463, "grad_norm": 1.7374506458276127, "learning_rate": 4.994986959541788e-06, "loss": 0.7816, "step": 34 }, { "epoch": 0.12655367231638417, "grad_norm": 2.0807346358096166, "learning_rate": 4.9927822836191185e-06, "loss": 0.7818, "step": 35 }, { "epoch": 0.13016949152542373, "grad_norm": 1.73060241181151, "learning_rate": 4.990177593941303e-06, "loss": 0.7674, "step": 36 }, { "epoch": 0.1337853107344633, "grad_norm": 1.7060389302256593, "learning_rate": 4.987173308479738e-06, "loss": 0.7652, "step": 37 }, { "epoch": 0.13740112994350281, "grad_norm": 1.6025020216781503, "learning_rate": 4.9837699093284765e-06, "loss": 0.7457, "step": 38 }, { "epoch": 0.14101694915254237, "grad_norm": 1.7291972266390814, "learning_rate": 4.9799679426268575e-06, "loss": 0.7943, "step": 39 }, { "epoch": 0.14463276836158193, "grad_norm": 1.6572877077102874, "learning_rate": 4.975768018471877e-06, "loss": 0.7815, "step": 40 }, { "epoch": 0.14824858757062148, "grad_norm": 1.5680833894820443, "learning_rate": 4.971170810820279e-06, "loss": 0.7557, "step": 41 }, { "epoch": 0.151864406779661, "grad_norm": 1.4822606907798626, "learning_rate": 4.966177057380409e-06, "loss": 0.7561, "step": 42 }, { "epoch": 0.15548022598870057, "grad_norm": 1.6178903613745546, "learning_rate": 4.960787559493836e-06, "loss": 0.7474, "step": 43 }, { "epoch": 0.15909604519774012, "grad_norm": 1.5096967013865987, "learning_rate": 4.955003182006761e-06, "loss": 0.716, "step": 44 }, { "epoch": 0.16271186440677965, "grad_norm": 1.4671027619246142, "learning_rate": 4.948824853131237e-06, "loss": 0.7353, "step": 45 }, { "epoch": 0.1663276836158192, "grad_norm": 1.4778933913112275, "learning_rate": 4.942253564296217e-06, "loss": 0.7358, "step": 46 }, { "epoch": 0.16994350282485876, "grad_norm": 1.4672254672280391, "learning_rate": 4.935290369988468e-06, "loss": 0.7419, "step": 47 }, { "epoch": 0.17355932203389832, "grad_norm": 1.4415543659264245, "learning_rate": 4.927936387583348e-06, "loss": 0.748, "step": 48 }, { "epoch": 0.17717514124293784, "grad_norm": 1.3940953807201077, "learning_rate": 4.920192797165511e-06, "loss": 0.7571, "step": 49 }, { "epoch": 0.1807909604519774, "grad_norm": 1.4519222988783331, "learning_rate": 4.912060841339536e-06, "loss": 0.7127, "step": 50 }, { "epoch": 0.18440677966101696, "grad_norm": 1.4865547512367276, "learning_rate": 4.9035418250305314e-06, "loss": 0.7272, "step": 51 }, { "epoch": 0.18802259887005648, "grad_norm": 1.5008090646455723, "learning_rate": 4.894637115274728e-06, "loss": 0.7258, "step": 52 }, { "epoch": 0.19163841807909604, "grad_norm": 1.7037112279423843, "learning_rate": 4.8853481410001225e-06, "loss": 0.7316, "step": 53 }, { "epoch": 0.1952542372881356, "grad_norm": 1.440963552161302, "learning_rate": 4.875676392797169e-06, "loss": 0.7551, "step": 54 }, { "epoch": 0.19887005649717515, "grad_norm": 1.440064113753515, "learning_rate": 4.865623422679593e-06, "loss": 0.7446, "step": 55 }, { "epoch": 0.20248587570621468, "grad_norm": 1.6503815942075044, "learning_rate": 4.855190843835338e-06, "loss": 0.6955, "step": 56 }, { "epoch": 0.20610169491525424, "grad_norm": 1.3995241998208656, "learning_rate": 4.844380330367701e-06, "loss": 0.7214, "step": 57 }, { "epoch": 0.2097175141242938, "grad_norm": 1.461439825228087, "learning_rate": 4.833193617026692e-06, "loss": 0.7386, "step": 58 }, { "epoch": 0.21333333333333335, "grad_norm": 1.5840510779057535, "learning_rate": 4.821632498930656e-06, "loss": 0.7156, "step": 59 }, { "epoch": 0.21694915254237288, "grad_norm": 1.5720479925852917, "learning_rate": 4.809698831278217e-06, "loss": 0.6961, "step": 60 }, { "epoch": 0.22056497175141243, "grad_norm": 1.6408543211624975, "learning_rate": 4.797394529050577e-06, "loss": 0.7223, "step": 61 }, { "epoch": 0.224180790960452, "grad_norm": 1.4636770967664614, "learning_rate": 4.784721566704217e-06, "loss": 0.7157, "step": 62 }, { "epoch": 0.22779661016949151, "grad_norm": 1.5059556334177817, "learning_rate": 4.771681977854062e-06, "loss": 0.7091, "step": 63 }, { "epoch": 0.23141242937853107, "grad_norm": 1.3659765286811016, "learning_rate": 4.75827785494715e-06, "loss": 0.7012, "step": 64 }, { "epoch": 0.23502824858757063, "grad_norm": 1.418860606361076, "learning_rate": 4.744511348926855e-06, "loss": 0.7124, "step": 65 }, { "epoch": 0.23864406779661018, "grad_norm": 1.4260454819088881, "learning_rate": 4.730384668887731e-06, "loss": 0.7215, "step": 66 }, { "epoch": 0.2422598870056497, "grad_norm": 1.5391993842680236, "learning_rate": 4.715900081721021e-06, "loss": 0.6946, "step": 67 }, { "epoch": 0.24587570621468927, "grad_norm": 1.552630241762705, "learning_rate": 4.7010599117508936e-06, "loss": 0.7109, "step": 68 }, { "epoch": 0.24949152542372882, "grad_norm": 1.5270739656846004, "learning_rate": 4.685866540361456e-06, "loss": 0.7092, "step": 69 }, { "epoch": 0.25310734463276835, "grad_norm": 1.5114873681601728, "learning_rate": 4.670322405614621e-06, "loss": 0.7055, "step": 70 }, { "epoch": 0.25672316384180793, "grad_norm": 1.5299853438436042, "learning_rate": 4.654430001858874e-06, "loss": 0.6878, "step": 71 }, { "epoch": 0.26033898305084746, "grad_norm": 1.445451157672883, "learning_rate": 4.638191879329005e-06, "loss": 0.7222, "step": 72 }, { "epoch": 0.263954802259887, "grad_norm": 1.4221665529233392, "learning_rate": 4.621610643736878e-06, "loss": 0.7237, "step": 73 }, { "epoch": 0.2675706214689266, "grad_norm": 1.469857395185097, "learning_rate": 4.6046889558532925e-06, "loss": 0.6966, "step": 74 }, { "epoch": 0.2711864406779661, "grad_norm": 1.5144096217089977, "learning_rate": 4.587429531081019e-06, "loss": 0.7018, "step": 75 }, { "epoch": 0.27480225988700563, "grad_norm": 1.3933861614255332, "learning_rate": 4.569835139019054e-06, "loss": 0.6548, "step": 76 }, { "epoch": 0.2784180790960452, "grad_norm": 1.7309884671892724, "learning_rate": 4.551908603018191e-06, "loss": 0.6976, "step": 77 }, { "epoch": 0.28203389830508474, "grad_norm": 1.5328106326816477, "learning_rate": 4.53365279972796e-06, "loss": 0.693, "step": 78 }, { "epoch": 0.28564971751412427, "grad_norm": 1.5098349501089556, "learning_rate": 4.515070658635013e-06, "loss": 0.697, "step": 79 }, { "epoch": 0.28926553672316385, "grad_norm": 1.7412535656694954, "learning_rate": 4.4961651615930344e-06, "loss": 0.7115, "step": 80 }, { "epoch": 0.2928813559322034, "grad_norm": 1.5076941472793814, "learning_rate": 4.476939342344246e-06, "loss": 0.7163, "step": 81 }, { "epoch": 0.29649717514124296, "grad_norm": 1.4730053806555845, "learning_rate": 4.457396286032589e-06, "loss": 0.6886, "step": 82 }, { "epoch": 0.3001129943502825, "grad_norm": 1.4855157413144469, "learning_rate": 4.437539128708647e-06, "loss": 0.7339, "step": 83 }, { "epoch": 0.303728813559322, "grad_norm": 1.5405150766486688, "learning_rate": 4.417371056826417e-06, "loss": 0.7149, "step": 84 }, { "epoch": 0.3073446327683616, "grad_norm": 1.5508769466195937, "learning_rate": 4.396895306731978e-06, "loss": 0.6822, "step": 85 }, { "epoch": 0.31096045197740113, "grad_norm": 1.4588960420622106, "learning_rate": 4.376115164144157e-06, "loss": 0.6836, "step": 86 }, { "epoch": 0.31457627118644066, "grad_norm": 1.6627413334009173, "learning_rate": 4.355033963627277e-06, "loss": 0.7131, "step": 87 }, { "epoch": 0.31819209039548024, "grad_norm": 1.5553517341256553, "learning_rate": 4.333655088056065e-06, "loss": 0.6854, "step": 88 }, { "epoch": 0.32180790960451977, "grad_norm": 1.6684230900983013, "learning_rate": 4.3119819680728e-06, "loss": 0.7094, "step": 89 }, { "epoch": 0.3254237288135593, "grad_norm": 1.3726285594917855, "learning_rate": 4.290018081536807e-06, "loss": 0.6872, "step": 90 }, { "epoch": 0.3290395480225989, "grad_norm": 1.380724539985608, "learning_rate": 4.267766952966369e-06, "loss": 0.7139, "step": 91 }, { "epoch": 0.3326553672316384, "grad_norm": 1.5348446771711333, "learning_rate": 4.245232152973148e-06, "loss": 0.6778, "step": 92 }, { "epoch": 0.336271186440678, "grad_norm": 1.5229418737475282, "learning_rate": 4.222417297689217e-06, "loss": 0.6689, "step": 93 }, { "epoch": 0.3398870056497175, "grad_norm": 1.5070456041355724, "learning_rate": 4.199326048186783e-06, "loss": 0.6894, "step": 94 }, { "epoch": 0.34350282485875705, "grad_norm": 1.3616666744687627, "learning_rate": 4.175962109890697e-06, "loss": 0.6554, "step": 95 }, { "epoch": 0.34711864406779663, "grad_norm": 1.5391506984706091, "learning_rate": 4.152329231983852e-06, "loss": 0.7023, "step": 96 }, { "epoch": 0.35073446327683616, "grad_norm": 1.4241158917993244, "learning_rate": 4.128431206805556e-06, "loss": 0.7107, "step": 97 }, { "epoch": 0.3543502824858757, "grad_norm": 1.4631603034598988, "learning_rate": 4.104271869242975e-06, "loss": 0.6894, "step": 98 }, { "epoch": 0.3579661016949153, "grad_norm": 1.4094826388778878, "learning_rate": 4.07985509611576e-06, "loss": 0.677, "step": 99 }, { "epoch": 0.3615819209039548, "grad_norm": 1.4740189681153695, "learning_rate": 4.0551848055539345e-06, "loss": 0.6699, "step": 100 }, { "epoch": 0.36519774011299433, "grad_norm": 1.4525217649150708, "learning_rate": 4.030264956369158e-06, "loss": 0.7368, "step": 101 }, { "epoch": 0.3688135593220339, "grad_norm": 1.451460518174865, "learning_rate": 4.005099547419458e-06, "loss": 0.7034, "step": 102 }, { "epoch": 0.37242937853107344, "grad_norm": 1.5038951264951017, "learning_rate": 3.979692616967543e-06, "loss": 0.6837, "step": 103 }, { "epoch": 0.37604519774011297, "grad_norm": 1.4157220265249935, "learning_rate": 3.9540482420327845e-06, "loss": 0.6875, "step": 104 }, { "epoch": 0.37966101694915255, "grad_norm": 1.4737148597287204, "learning_rate": 3.9281705377369814e-06, "loss": 0.6901, "step": 105 }, { "epoch": 0.3832768361581921, "grad_norm": 1.4677149428122727, "learning_rate": 3.902063656644012e-06, "loss": 0.6792, "step": 106 }, { "epoch": 0.38689265536723166, "grad_norm": 1.3434999781262245, "learning_rate": 3.875731788093478e-06, "loss": 0.6876, "step": 107 }, { "epoch": 0.3905084745762712, "grad_norm": 1.4987089091978376, "learning_rate": 3.84917915752845e-06, "loss": 0.6761, "step": 108 }, { "epoch": 0.3941242937853107, "grad_norm": 1.508794866255814, "learning_rate": 3.8224100258174066e-06, "loss": 0.6838, "step": 109 }, { "epoch": 0.3977401129943503, "grad_norm": 1.4688146101109116, "learning_rate": 3.795428688570505e-06, "loss": 0.684, "step": 110 }, { "epoch": 0.40135593220338983, "grad_norm": 1.4606514616212791, "learning_rate": 3.7682394754502687e-06, "loss": 0.6824, "step": 111 }, { "epoch": 0.40497175141242936, "grad_norm": 1.4096709625800348, "learning_rate": 3.7408467494768104e-06, "loss": 0.6969, "step": 112 }, { "epoch": 0.40858757062146894, "grad_norm": 1.418524365345472, "learning_rate": 3.7132549063277033e-06, "loss": 0.7097, "step": 113 }, { "epoch": 0.41220338983050847, "grad_norm": 1.5410281672091906, "learning_rate": 3.685468373632613e-06, "loss": 0.6746, "step": 114 }, { "epoch": 0.415819209039548, "grad_norm": 1.3641313015018903, "learning_rate": 3.657491610262802e-06, "loss": 0.6448, "step": 115 }, { "epoch": 0.4194350282485876, "grad_norm": 1.3041765820835833, "learning_rate": 3.6293291056156178e-06, "loss": 0.6819, "step": 116 }, { "epoch": 0.4230508474576271, "grad_norm": 1.506905844856063, "learning_rate": 3.600985378894086e-06, "loss": 0.6876, "step": 117 }, { "epoch": 0.4266666666666667, "grad_norm": 1.376689012221553, "learning_rate": 3.572464978381719e-06, "loss": 0.684, "step": 118 }, { "epoch": 0.4302824858757062, "grad_norm": 1.340240336011346, "learning_rate": 3.5437724807126583e-06, "loss": 0.6505, "step": 119 }, { "epoch": 0.43389830508474575, "grad_norm": 1.325670474835959, "learning_rate": 3.514912490137268e-06, "loss": 0.6357, "step": 120 }, { "epoch": 0.43751412429378533, "grad_norm": 1.2967281789427187, "learning_rate": 3.4858896377832966e-06, "loss": 0.6716, "step": 121 }, { "epoch": 0.44112994350282486, "grad_norm": 1.4289334076188702, "learning_rate": 3.4567085809127247e-06, "loss": 0.6749, "step": 122 }, { "epoch": 0.4447457627118644, "grad_norm": 1.4006278853440177, "learning_rate": 3.42737400217442e-06, "loss": 0.6675, "step": 123 }, { "epoch": 0.448361581920904, "grad_norm": 1.4903303746050145, "learning_rate": 3.397890608852718e-06, "loss": 0.6795, "step": 124 }, { "epoch": 0.4519774011299435, "grad_norm": 1.5237327914091412, "learning_rate": 3.3682631321120507e-06, "loss": 0.6834, "step": 125 }, { "epoch": 0.45559322033898303, "grad_norm": 1.375637272974929, "learning_rate": 3.3384963262377434e-06, "loss": 0.6546, "step": 126 }, { "epoch": 0.4592090395480226, "grad_norm": 1.4010303116849099, "learning_rate": 3.3085949678730953e-06, "loss": 0.6687, "step": 127 }, { "epoch": 0.46282485875706214, "grad_norm": 1.3723912301345371, "learning_rate": 3.278563855252885e-06, "loss": 0.6927, "step": 128 }, { "epoch": 0.46644067796610167, "grad_norm": 1.4580226139492987, "learning_rate": 3.248407807433396e-06, "loss": 0.6843, "step": 129 }, { "epoch": 0.47005649717514125, "grad_norm": 1.5657029406507326, "learning_rate": 3.2181316635191125e-06, "loss": 0.6639, "step": 130 }, { "epoch": 0.4736723163841808, "grad_norm": 1.5943610613148829, "learning_rate": 3.1877402818861954e-06, "loss": 0.6655, "step": 131 }, { "epoch": 0.47728813559322036, "grad_norm": 1.398551732301804, "learning_rate": 3.157238539402862e-06, "loss": 0.6648, "step": 132 }, { "epoch": 0.4809039548022599, "grad_norm": 1.3527863119261647, "learning_rate": 3.1266313306468018e-06, "loss": 0.6793, "step": 133 }, { "epoch": 0.4845197740112994, "grad_norm": 1.4133737666494006, "learning_rate": 3.095923567119748e-06, "loss": 0.6808, "step": 134 }, { "epoch": 0.488135593220339, "grad_norm": 1.3489274410441074, "learning_rate": 3.0651201764593375e-06, "loss": 0.669, "step": 135 }, { "epoch": 0.49175141242937853, "grad_norm": 1.4710077216567483, "learning_rate": 3.034226101648377e-06, "loss": 0.6685, "step": 136 }, { "epoch": 0.49536723163841806, "grad_norm": 1.4143201023235143, "learning_rate": 3.0032463002216504e-06, "loss": 0.6803, "step": 137 }, { "epoch": 0.49898305084745764, "grad_norm": 1.350434140774409, "learning_rate": 2.972185743470386e-06, "loss": 0.6293, "step": 138 }, { "epoch": 0.5025988700564972, "grad_norm": 1.4061918089975518, "learning_rate": 2.941049415644522e-06, "loss": 0.6981, "step": 139 }, { "epoch": 0.5062146892655367, "grad_norm": 1.4466820101061297, "learning_rate": 2.909842313152888e-06, "loss": 0.6738, "step": 140 }, { "epoch": 0.5098305084745762, "grad_norm": 1.5124850873525673, "learning_rate": 2.878569443761442e-06, "loss": 0.7131, "step": 141 }, { "epoch": 0.5134463276836159, "grad_norm": 1.4743009883750753, "learning_rate": 2.847235825789673e-06, "loss": 0.7016, "step": 142 }, { "epoch": 0.5170621468926554, "grad_norm": 1.3586041525935764, "learning_rate": 2.8158464873053236e-06, "loss": 0.6724, "step": 143 }, { "epoch": 0.5206779661016949, "grad_norm": 1.4996529158631906, "learning_rate": 2.784406465317538e-06, "loss": 0.6662, "step": 144 }, { "epoch": 0.5242937853107345, "grad_norm": 1.4671966292049852, "learning_rate": 2.752920804968581e-06, "loss": 0.6631, "step": 145 }, { "epoch": 0.527909604519774, "grad_norm": 1.444812582502839, "learning_rate": 2.7213945587242507e-06, "loss": 0.6513, "step": 146 }, { "epoch": 0.5315254237288135, "grad_norm": 1.3053540444757412, "learning_rate": 2.689832785563116e-06, "loss": 0.6555, "step": 147 }, { "epoch": 0.5351412429378531, "grad_norm": 1.314006962084444, "learning_rate": 2.658240550164704e-06, "loss": 0.6661, "step": 148 }, { "epoch": 0.5387570621468927, "grad_norm": 1.4304673510029906, "learning_rate": 2.626622922096782e-06, "loss": 0.6621, "step": 149 }, { "epoch": 0.5423728813559322, "grad_norm": 1.4876534124839516, "learning_rate": 2.5949849750018486e-06, "loss": 0.6758, "step": 150 }, { "epoch": 0.5459887005649717, "grad_norm": 1.3200607589115334, "learning_rate": 2.56333178578297e-06, "loss": 0.6559, "step": 151 }, { "epoch": 0.5496045197740113, "grad_norm": 1.3240990359086642, "learning_rate": 2.5316684337891005e-06, "loss": 0.6232, "step": 152 }, { "epoch": 0.5532203389830509, "grad_norm": 1.3124913285203368, "learning_rate": 2.5e-06, "loss": 0.6373, "step": 153 }, { "epoch": 0.5568361581920904, "grad_norm": 1.3981996964149215, "learning_rate": 2.4683315662109003e-06, "loss": 0.6779, "step": 154 }, { "epoch": 0.56045197740113, "grad_norm": 1.3816310911971024, "learning_rate": 2.436668214217031e-06, "loss": 0.654, "step": 155 }, { "epoch": 0.5640677966101695, "grad_norm": 1.263049809743652, "learning_rate": 2.4050150249981522e-06, "loss": 0.6625, "step": 156 }, { "epoch": 0.567683615819209, "grad_norm": 1.3247706606665524, "learning_rate": 2.3733770779032185e-06, "loss": 0.6862, "step": 157 }, { "epoch": 0.5712994350282485, "grad_norm": 1.3384592393063528, "learning_rate": 2.341759449835297e-06, "loss": 0.669, "step": 158 }, { "epoch": 0.5749152542372882, "grad_norm": 1.3084651079974374, "learning_rate": 2.310167214436885e-06, "loss": 0.6389, "step": 159 }, { "epoch": 0.5785310734463277, "grad_norm": 1.3374680724124108, "learning_rate": 2.27860544127575e-06, "loss": 0.6472, "step": 160 }, { "epoch": 0.5821468926553672, "grad_norm": 1.3122640789722633, "learning_rate": 2.24707919503142e-06, "loss": 0.6579, "step": 161 }, { "epoch": 0.5857627118644068, "grad_norm": 1.4412496554625216, "learning_rate": 2.2155935346824634e-06, "loss": 0.6481, "step": 162 }, { "epoch": 0.5893785310734463, "grad_norm": 1.3863338838498946, "learning_rate": 2.1841535126946777e-06, "loss": 0.6535, "step": 163 }, { "epoch": 0.5929943502824859, "grad_norm": 1.345502076046215, "learning_rate": 2.1527641742103282e-06, "loss": 0.6707, "step": 164 }, { "epoch": 0.5966101694915255, "grad_norm": 1.3847319032734033, "learning_rate": 2.1214305562385592e-06, "loss": 0.6663, "step": 165 }, { "epoch": 0.600225988700565, "grad_norm": 1.430665603476504, "learning_rate": 2.0901576868471125e-06, "loss": 0.6747, "step": 166 }, { "epoch": 0.6038418079096045, "grad_norm": 1.353261054674096, "learning_rate": 2.05895058435548e-06, "loss": 0.6512, "step": 167 }, { "epoch": 0.607457627118644, "grad_norm": 1.2859249411363938, "learning_rate": 2.0278142565296153e-06, "loss": 0.6324, "step": 168 }, { "epoch": 0.6110734463276836, "grad_norm": 1.4013926433738113, "learning_rate": 1.9967536997783495e-06, "loss": 0.6679, "step": 169 }, { "epoch": 0.6146892655367232, "grad_norm": 1.3862043238014647, "learning_rate": 1.9657738983516227e-06, "loss": 0.6729, "step": 170 }, { "epoch": 0.6183050847457627, "grad_norm": 1.3586507239524463, "learning_rate": 1.934879823540663e-06, "loss": 0.6493, "step": 171 }, { "epoch": 0.6219209039548023, "grad_norm": 1.3297630157555045, "learning_rate": 1.9040764328802523e-06, "loss": 0.6398, "step": 172 }, { "epoch": 0.6255367231638418, "grad_norm": 1.3817085459115725, "learning_rate": 1.8733686693531986e-06, "loss": 0.6582, "step": 173 }, { "epoch": 0.6291525423728813, "grad_norm": 1.3202548993972594, "learning_rate": 1.842761460597138e-06, "loss": 0.6532, "step": 174 }, { "epoch": 0.632768361581921, "grad_norm": 1.3288961390749972, "learning_rate": 1.812259718113805e-06, "loss": 0.6603, "step": 175 }, { "epoch": 0.6363841807909605, "grad_norm": 1.329943461477084, "learning_rate": 1.7818683364808883e-06, "loss": 0.658, "step": 176 }, { "epoch": 0.64, "grad_norm": 1.3692273444745175, "learning_rate": 1.7515921925666053e-06, "loss": 0.6317, "step": 177 }, { "epoch": 0.6436158192090395, "grad_norm": 1.421706203526152, "learning_rate": 1.7214361447471156e-06, "loss": 0.677, "step": 178 }, { "epoch": 0.6472316384180791, "grad_norm": 1.3083895534561967, "learning_rate": 1.6914050321269049e-06, "loss": 0.6736, "step": 179 }, { "epoch": 0.6508474576271186, "grad_norm": 1.3408157323699283, "learning_rate": 1.6615036737622574e-06, "loss": 0.6802, "step": 180 }, { "epoch": 0.6544632768361582, "grad_norm": 1.2866436449644132, "learning_rate": 1.6317368678879497e-06, "loss": 0.646, "step": 181 }, { "epoch": 0.6580790960451978, "grad_norm": 1.4469309073814418, "learning_rate": 1.6021093911472825e-06, "loss": 0.6502, "step": 182 }, { "epoch": 0.6616949152542373, "grad_norm": 1.3285438490415578, "learning_rate": 1.572625997825581e-06, "loss": 0.6392, "step": 183 }, { "epoch": 0.6653107344632768, "grad_norm": 1.3767595914556963, "learning_rate": 1.5432914190872757e-06, "loss": 0.6478, "step": 184 }, { "epoch": 0.6689265536723163, "grad_norm": 1.3913230479527472, "learning_rate": 1.5141103622167042e-06, "loss": 0.6624, "step": 185 }, { "epoch": 0.672542372881356, "grad_norm": 1.3717018869154762, "learning_rate": 1.4850875098627326e-06, "loss": 0.6519, "step": 186 }, { "epoch": 0.6761581920903955, "grad_norm": 1.3411020703523342, "learning_rate": 1.456227519287343e-06, "loss": 0.6382, "step": 187 }, { "epoch": 0.679774011299435, "grad_norm": 1.238291654237968, "learning_rate": 1.4275350216182824e-06, "loss": 0.6391, "step": 188 }, { "epoch": 0.6833898305084746, "grad_norm": 1.374517850534095, "learning_rate": 1.3990146211059141e-06, "loss": 0.6456, "step": 189 }, { "epoch": 0.6870056497175141, "grad_norm": 1.306148052181935, "learning_rate": 1.3706708943843822e-06, "loss": 0.6441, "step": 190 }, { "epoch": 0.6906214689265536, "grad_norm": 1.3876372946236282, "learning_rate": 1.3425083897371983e-06, "loss": 0.6603, "step": 191 }, { "epoch": 0.6942372881355933, "grad_norm": 1.3904204488306329, "learning_rate": 1.3145316263673874e-06, "loss": 0.6721, "step": 192 }, { "epoch": 0.6978531073446328, "grad_norm": 1.510808405530817, "learning_rate": 1.286745093672298e-06, "loss": 0.649, "step": 193 }, { "epoch": 0.7014689265536723, "grad_norm": 1.4312145193748698, "learning_rate": 1.2591532505231906e-06, "loss": 0.6573, "step": 194 }, { "epoch": 0.7050847457627119, "grad_norm": 1.4024105046257231, "learning_rate": 1.2317605245497324e-06, "loss": 0.6727, "step": 195 }, { "epoch": 0.7087005649717514, "grad_norm": 1.3505688502581619, "learning_rate": 1.204571311429496e-06, "loss": 0.6131, "step": 196 }, { "epoch": 0.7123163841807909, "grad_norm": 1.319472329102849, "learning_rate": 1.1775899741825947e-06, "loss": 0.6434, "step": 197 }, { "epoch": 0.7159322033898305, "grad_norm": 1.5152192403656248, "learning_rate": 1.1508208424715511e-06, "loss": 0.656, "step": 198 }, { "epoch": 0.7195480225988701, "grad_norm": 1.6136474853206006, "learning_rate": 1.1242682119065217e-06, "loss": 0.6613, "step": 199 }, { "epoch": 0.7231638418079096, "grad_norm": 1.312425015581362, "learning_rate": 1.0979363433559892e-06, "loss": 0.6577, "step": 200 }, { "epoch": 0.7267796610169491, "grad_norm": 1.3953598075891687, "learning_rate": 1.0718294622630188e-06, "loss": 0.6905, "step": 201 }, { "epoch": 0.7303954802259887, "grad_norm": 1.372903007290825, "learning_rate": 1.045951757967215e-06, "loss": 0.6448, "step": 202 }, { "epoch": 0.7340112994350283, "grad_norm": 1.4652703276389691, "learning_rate": 1.0203073830324566e-06, "loss": 0.6395, "step": 203 }, { "epoch": 0.7376271186440678, "grad_norm": 1.366422271732463, "learning_rate": 9.949004525805423e-07, "loss": 0.6148, "step": 204 }, { "epoch": 0.7412429378531074, "grad_norm": 1.2886663538886012, "learning_rate": 9.697350436308428e-07, "loss": 0.6322, "step": 205 }, { "epoch": 0.7448587570621469, "grad_norm": 1.3574509449302405, "learning_rate": 9.448151944460657e-07, "loss": 0.6835, "step": 206 }, { "epoch": 0.7484745762711864, "grad_norm": 1.393654049362733, "learning_rate": 9.201449038842403e-07, "loss": 0.6713, "step": 207 }, { "epoch": 0.7520903954802259, "grad_norm": 1.3962653237548226, "learning_rate": 8.957281307570254e-07, "loss": 0.6349, "step": 208 }, { "epoch": 0.7557062146892656, "grad_norm": 1.2716349996486196, "learning_rate": 8.71568793194445e-07, "loss": 0.6395, "step": 209 }, { "epoch": 0.7593220338983051, "grad_norm": 1.3461355380403557, "learning_rate": 8.476707680161486e-07, "loss": 0.6566, "step": 210 }, { "epoch": 0.7629378531073446, "grad_norm": 1.3005123962225364, "learning_rate": 8.240378901093035e-07, "loss": 0.6498, "step": 211 }, { "epoch": 0.7665536723163842, "grad_norm": 1.2709444859864987, "learning_rate": 8.006739518132179e-07, "loss": 0.6702, "step": 212 }, { "epoch": 0.7701694915254237, "grad_norm": 1.3327493213768535, "learning_rate": 7.775827023107835e-07, "loss": 0.6403, "step": 213 }, { "epoch": 0.7737853107344633, "grad_norm": 1.4047092975707265, "learning_rate": 7.547678470268526e-07, "loss": 0.6492, "step": 214 }, { "epoch": 0.7774011299435029, "grad_norm": 1.383631976222337, "learning_rate": 7.322330470336314e-07, "loss": 0.6289, "step": 215 }, { "epoch": 0.7810169491525424, "grad_norm": 1.2567546465681303, "learning_rate": 7.099819184631929e-07, "loss": 0.6393, "step": 216 }, { "epoch": 0.7846327683615819, "grad_norm": 1.2354895441269147, "learning_rate": 6.880180319272006e-07, "loss": 0.6429, "step": 217 }, { "epoch": 0.7882485875706214, "grad_norm": 1.3478465932161185, "learning_rate": 6.663449119439358e-07, "loss": 0.6652, "step": 218 }, { "epoch": 0.791864406779661, "grad_norm": 1.3094969549541247, "learning_rate": 6.449660363727236e-07, "loss": 0.6424, "step": 219 }, { "epoch": 0.7954802259887006, "grad_norm": 1.385053278123551, "learning_rate": 6.238848358558439e-07, "loss": 0.6409, "step": 220 }, { "epoch": 0.7990960451977401, "grad_norm": 1.3349840769877994, "learning_rate": 6.031046932680229e-07, "loss": 0.6815, "step": 221 }, { "epoch": 0.8027118644067797, "grad_norm": 1.4435164246937722, "learning_rate": 5.826289431735832e-07, "loss": 0.6489, "step": 222 }, { "epoch": 0.8063276836158192, "grad_norm": 1.3356172924249725, "learning_rate": 5.624608712913531e-07, "loss": 0.6298, "step": 223 }, { "epoch": 0.8099435028248587, "grad_norm": 1.2977048398488058, "learning_rate": 5.426037139674117e-07, "loss": 0.6509, "step": 224 }, { "epoch": 0.8135593220338984, "grad_norm": 1.3774919529523362, "learning_rate": 5.23060657655754e-07, "loss": 0.6573, "step": 225 }, { "epoch": 0.8171751412429379, "grad_norm": 1.3410009004205188, "learning_rate": 5.038348384069663e-07, "loss": 0.633, "step": 226 }, { "epoch": 0.8207909604519774, "grad_norm": 1.2928727452886148, "learning_rate": 4.84929341364988e-07, "loss": 0.6754, "step": 227 }, { "epoch": 0.8244067796610169, "grad_norm": 1.358332904803305, "learning_rate": 4.6634720027204093e-07, "loss": 0.6614, "step": 228 }, { "epoch": 0.8280225988700565, "grad_norm": 1.4907573288328078, "learning_rate": 4.480913969818099e-07, "loss": 0.6281, "step": 229 }, { "epoch": 0.831638418079096, "grad_norm": 1.3392158203262607, "learning_rate": 4.3016486098094667e-07, "loss": 0.6161, "step": 230 }, { "epoch": 0.8352542372881356, "grad_norm": 1.2444012140753318, "learning_rate": 4.125704689189819e-07, "loss": 0.6247, "step": 231 }, { "epoch": 0.8388700564971752, "grad_norm": 1.4784447467052926, "learning_rate": 3.953110441467073e-07, "loss": 0.6586, "step": 232 }, { "epoch": 0.8424858757062147, "grad_norm": 1.366997421968513, "learning_rate": 3.7838935626312246e-07, "loss": 0.6463, "step": 233 }, { "epoch": 0.8461016949152542, "grad_norm": 1.4541952821449111, "learning_rate": 3.6180812067099477e-07, "loss": 0.6726, "step": 234 }, { "epoch": 0.8497175141242937, "grad_norm": 1.3494835181921292, "learning_rate": 3.455699981411259e-07, "loss": 0.6378, "step": 235 }, { "epoch": 0.8533333333333334, "grad_norm": 1.3091535029497192, "learning_rate": 3.296775943853789e-07, "loss": 0.6381, "step": 236 }, { "epoch": 0.8569491525423729, "grad_norm": 1.292004503291086, "learning_rate": 3.141334596385448e-07, "loss": 0.6361, "step": 237 }, { "epoch": 0.8605649717514124, "grad_norm": 1.3072359435785652, "learning_rate": 2.9894008824910726e-07, "loss": 0.6311, "step": 238 }, { "epoch": 0.864180790960452, "grad_norm": 1.3418443898914727, "learning_rate": 2.840999182789797e-07, "loss": 0.6584, "step": 239 }, { "epoch": 0.8677966101694915, "grad_norm": 1.3790407833555425, "learning_rate": 2.696153311122704e-07, "loss": 0.6275, "step": 240 }, { "epoch": 0.871412429378531, "grad_norm": 1.3668643411134043, "learning_rate": 2.5548865107314606e-07, "loss": 0.6574, "step": 241 }, { "epoch": 0.8750282485875707, "grad_norm": 1.267748779732102, "learning_rate": 2.4172214505285006e-07, "loss": 0.6394, "step": 242 }, { "epoch": 0.8786440677966102, "grad_norm": 1.2912726516960031, "learning_rate": 2.2831802214593774e-07, "loss": 0.6352, "step": 243 }, { "epoch": 0.8822598870056497, "grad_norm": 1.3408535402209096, "learning_rate": 2.1527843329578328e-07, "loss": 0.6332, "step": 244 }, { "epoch": 0.8858757062146893, "grad_norm": 1.369964120309017, "learning_rate": 2.026054709494235e-07, "loss": 0.6488, "step": 245 }, { "epoch": 0.8894915254237288, "grad_norm": 1.2985394908560597, "learning_rate": 1.9030116872178317e-07, "loss": 0.6268, "step": 246 }, { "epoch": 0.8931073446327683, "grad_norm": 1.2695253299944342, "learning_rate": 1.7836750106934475e-07, "loss": 0.6098, "step": 247 }, { "epoch": 0.896723163841808, "grad_norm": 1.2614680998276686, "learning_rate": 1.6680638297330854e-07, "loss": 0.6328, "step": 248 }, { "epoch": 0.9003389830508475, "grad_norm": 1.3022416023187458, "learning_rate": 1.5561966963229925e-07, "loss": 0.6353, "step": 249 }, { "epoch": 0.903954802259887, "grad_norm": 1.3192179782937912, "learning_rate": 1.448091561646628e-07, "loss": 0.6212, "step": 250 }, { "epoch": 0.9075706214689265, "grad_norm": 1.391821357807215, "learning_rate": 1.3437657732040783e-07, "loss": 0.6581, "step": 251 }, { "epoch": 0.9111864406779661, "grad_norm": 1.348097897319632, "learning_rate": 1.243236072028317e-07, "loss": 0.6483, "step": 252 }, { "epoch": 0.9148022598870057, "grad_norm": 1.3327973766461398, "learning_rate": 1.1465185899987797e-07, "loss": 0.6688, "step": 253 }, { "epoch": 0.9184180790960452, "grad_norm": 1.3050936769777537, "learning_rate": 1.0536288472527162e-07, "loss": 0.6702, "step": 254 }, { "epoch": 0.9220338983050848, "grad_norm": 1.3816378424721178, "learning_rate": 9.645817496946902e-08, "loss": 0.663, "step": 255 }, { "epoch": 0.9256497175141243, "grad_norm": 1.3585454766173655, "learning_rate": 8.79391586604636e-08, "loss": 0.64, "step": 256 }, { "epoch": 0.9292655367231638, "grad_norm": 1.3292531553316056, "learning_rate": 7.980720283448957e-08, "loss": 0.6335, "step": 257 }, { "epoch": 0.9328813559322033, "grad_norm": 1.2599441985317357, "learning_rate": 7.206361241665266e-08, "loss": 0.6403, "step": 258 }, { "epoch": 0.936497175141243, "grad_norm": 1.3958906990596642, "learning_rate": 6.470963001153268e-08, "loss": 0.6383, "step": 259 }, { "epoch": 0.9401129943502825, "grad_norm": 1.295975911169058, "learning_rate": 5.774643570378296e-08, "loss": 0.6485, "step": 260 }, { "epoch": 0.943728813559322, "grad_norm": 1.247115424184442, "learning_rate": 5.117514686876379e-08, "loss": 0.6479, "step": 261 }, { "epoch": 0.9473446327683616, "grad_norm": 1.2578135030925344, "learning_rate": 4.4996817993239464e-08, "loss": 0.6363, "step": 262 }, { "epoch": 0.9509604519774011, "grad_norm": 1.2567862444318008, "learning_rate": 3.9212440506164465e-08, "loss": 0.6417, "step": 263 }, { "epoch": 0.9545762711864407, "grad_norm": 1.2972287074987505, "learning_rate": 3.382294261959157e-08, "loss": 0.676, "step": 264 }, { "epoch": 0.9581920903954803, "grad_norm": 1.3028064182497767, "learning_rate": 2.8829189179721552e-08, "loss": 0.6502, "step": 265 }, { "epoch": 0.9618079096045198, "grad_norm": 1.3401154575841365, "learning_rate": 2.423198152812306e-08, "loss": 0.6271, "step": 266 }, { "epoch": 0.9654237288135593, "grad_norm": 1.332097876741386, "learning_rate": 2.0032057373142453e-08, "loss": 0.6312, "step": 267 }, { "epoch": 0.9690395480225988, "grad_norm": 1.2752118722459802, "learning_rate": 1.6230090671524312e-08, "loss": 0.6313, "step": 268 }, { "epoch": 0.9726553672316384, "grad_norm": 1.315897104629199, "learning_rate": 1.2826691520262114e-08, "loss": 0.6328, "step": 269 }, { "epoch": 0.976271186440678, "grad_norm": 1.341142823473401, "learning_rate": 9.822406058697665e-09, "loss": 0.6247, "step": 270 }, { "epoch": 0.9798870056497175, "grad_norm": 1.3751891651375663, "learning_rate": 7.217716380881479e-09, "loss": 0.6185, "step": 271 }, { "epoch": 0.9835028248587571, "grad_norm": 1.2993917620171413, "learning_rate": 5.0130404582127144e-09, "loss": 0.6308, "step": 272 }, { "epoch": 0.9871186440677966, "grad_norm": 1.306024529686539, "learning_rate": 3.208732072368104e-09, "loss": 0.6383, "step": 273 }, { "epoch": 0.9907344632768361, "grad_norm": 1.3213672912672132, "learning_rate": 1.8050807585293095e-09, "loss": 0.6559, "step": 274 }, { "epoch": 0.9943502824858758, "grad_norm": 1.3174616601564904, "learning_rate": 8.023117589237017e-10, "loss": 0.6701, "step": 275 }, { "epoch": 0.9979661016949153, "grad_norm": 1.3624163578590245, "learning_rate": 2.0058598667854755e-10, "loss": 0.6445, "step": 276 }, { "epoch": 0.9979661016949153, "step": 276, "total_flos": 100810460790784.0, "train_loss": 0.7222011056931122, "train_runtime": 4200.972, "train_samples_per_second": 8.426, "train_steps_per_second": 0.066 } ], "logging_steps": 1, "max_steps": 276, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 100810460790784.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }