diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14218 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 2025, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014831294030404152, + "grad_norm": 18.786383723402018, + "learning_rate": 0.0, + "loss": 4.6908, + "step": 1 + }, + { + "epoch": 0.0029662588060808304, + "grad_norm": 18.80287261108234, + "learning_rate": 4.926108374384237e-08, + "loss": 4.6562, + "step": 2 + }, + { + "epoch": 0.004449388209121246, + "grad_norm": 17.846997166005348, + "learning_rate": 9.852216748768474e-08, + "loss": 4.7314, + "step": 3 + }, + { + "epoch": 0.005932517612161661, + "grad_norm": 19.667970858990294, + "learning_rate": 1.477832512315271e-07, + "loss": 4.5315, + "step": 4 + }, + { + "epoch": 0.007415647015202077, + "grad_norm": 19.808635340918112, + "learning_rate": 1.9704433497536947e-07, + "loss": 4.8485, + "step": 5 + }, + { + "epoch": 0.008898776418242492, + "grad_norm": 19.05065660730444, + "learning_rate": 2.4630541871921185e-07, + "loss": 4.6327, + "step": 6 + }, + { + "epoch": 0.010381905821282907, + "grad_norm": 18.17261113256485, + "learning_rate": 2.955665024630542e-07, + "loss": 4.7037, + "step": 7 + }, + { + "epoch": 0.011865035224323322, + "grad_norm": 17.346264497951665, + "learning_rate": 3.4482758620689656e-07, + "loss": 4.4886, + "step": 8 + }, + { + "epoch": 0.013348164627363738, + "grad_norm": 18.01544997898461, + "learning_rate": 3.9408866995073894e-07, + "loss": 4.6506, + "step": 9 + }, + { + "epoch": 0.014831294030404153, + "grad_norm": 18.809974847435655, + "learning_rate": 4.433497536945813e-07, + "loss": 4.6855, + "step": 10 + }, + { + "epoch": 0.016314423433444566, + "grad_norm": 21.125018447741958, + "learning_rate": 4.926108374384237e-07, + "loss": 4.8559, + "step": 11 + }, + { + "epoch": 0.017797552836484983, + "grad_norm": 21.24812099913605, + "learning_rate": 5.418719211822661e-07, + "loss": 4.8051, + "step": 12 + }, + { + "epoch": 0.0192806822395254, + "grad_norm": 18.269722335171284, + "learning_rate": 5.911330049261084e-07, + "loss": 4.6777, + "step": 13 + }, + { + "epoch": 0.020763811642565813, + "grad_norm": 21.25784301154598, + "learning_rate": 6.403940886699508e-07, + "loss": 4.8485, + "step": 14 + }, + { + "epoch": 0.02224694104560623, + "grad_norm": 17.629902567675607, + "learning_rate": 6.896551724137931e-07, + "loss": 4.6368, + "step": 15 + }, + { + "epoch": 0.023730070448646643, + "grad_norm": 18.366305046082662, + "learning_rate": 7.389162561576356e-07, + "loss": 4.6649, + "step": 16 + }, + { + "epoch": 0.02521319985168706, + "grad_norm": 16.786960185742494, + "learning_rate": 7.881773399014779e-07, + "loss": 4.6049, + "step": 17 + }, + { + "epoch": 0.026696329254727477, + "grad_norm": 20.317236021918475, + "learning_rate": 8.374384236453203e-07, + "loss": 4.7412, + "step": 18 + }, + { + "epoch": 0.02817945865776789, + "grad_norm": 18.690348833697435, + "learning_rate": 8.866995073891626e-07, + "loss": 4.5878, + "step": 19 + }, + { + "epoch": 0.029662588060808306, + "grad_norm": 17.87643849279544, + "learning_rate": 9.359605911330049e-07, + "loss": 4.5248, + "step": 20 + }, + { + "epoch": 0.03114571746384872, + "grad_norm": 17.989129356251624, + "learning_rate": 9.852216748768474e-07, + "loss": 4.4659, + "step": 21 + }, + { + "epoch": 0.03262884686688913, + "grad_norm": 17.063723622513514, + "learning_rate": 1.0344827586206898e-06, + "loss": 4.6065, + "step": 22 + }, + { + "epoch": 0.03411197626992955, + "grad_norm": 16.867915448175605, + "learning_rate": 1.0837438423645322e-06, + "loss": 4.5269, + "step": 23 + }, + { + "epoch": 0.035595105672969966, + "grad_norm": 43.24954614629097, + "learning_rate": 1.1330049261083746e-06, + "loss": 4.3817, + "step": 24 + }, + { + "epoch": 0.03707823507601038, + "grad_norm": 17.132700075926664, + "learning_rate": 1.1822660098522167e-06, + "loss": 4.6425, + "step": 25 + }, + { + "epoch": 0.0385613644790508, + "grad_norm": 16.532718825301234, + "learning_rate": 1.2315270935960593e-06, + "loss": 4.3896, + "step": 26 + }, + { + "epoch": 0.04004449388209121, + "grad_norm": 18.037051380471006, + "learning_rate": 1.2807881773399017e-06, + "loss": 4.5292, + "step": 27 + }, + { + "epoch": 0.041527623285131626, + "grad_norm": 15.927860345264993, + "learning_rate": 1.330049261083744e-06, + "loss": 4.3615, + "step": 28 + }, + { + "epoch": 0.043010752688172046, + "grad_norm": 15.441860207655512, + "learning_rate": 1.3793103448275862e-06, + "loss": 4.2654, + "step": 29 + }, + { + "epoch": 0.04449388209121246, + "grad_norm": 16.327141918141244, + "learning_rate": 1.4285714285714286e-06, + "loss": 4.4775, + "step": 30 + }, + { + "epoch": 0.04597701149425287, + "grad_norm": 13.594766214655781, + "learning_rate": 1.4778325123152712e-06, + "loss": 4.0426, + "step": 31 + }, + { + "epoch": 0.047460140897293286, + "grad_norm": 13.248840295004053, + "learning_rate": 1.5270935960591136e-06, + "loss": 3.987, + "step": 32 + }, + { + "epoch": 0.048943270300333706, + "grad_norm": 14.595899907343314, + "learning_rate": 1.5763546798029558e-06, + "loss": 3.7758, + "step": 33 + }, + { + "epoch": 0.05042639970337412, + "grad_norm": 13.241437876198615, + "learning_rate": 1.6256157635467982e-06, + "loss": 3.9996, + "step": 34 + }, + { + "epoch": 0.05190952910641453, + "grad_norm": 13.024690610866747, + "learning_rate": 1.6748768472906405e-06, + "loss": 3.8919, + "step": 35 + }, + { + "epoch": 0.05339265850945495, + "grad_norm": 11.3009498917436, + "learning_rate": 1.724137931034483e-06, + "loss": 3.7347, + "step": 36 + }, + { + "epoch": 0.054875787912495366, + "grad_norm": 12.49110867008895, + "learning_rate": 1.7733990147783253e-06, + "loss": 3.9375, + "step": 37 + }, + { + "epoch": 0.05635891731553578, + "grad_norm": 11.051788610345715, + "learning_rate": 1.8226600985221677e-06, + "loss": 3.9867, + "step": 38 + }, + { + "epoch": 0.05784204671857619, + "grad_norm": 10.6076787669455, + "learning_rate": 1.8719211822660098e-06, + "loss": 3.8673, + "step": 39 + }, + { + "epoch": 0.05932517612161661, + "grad_norm": 12.117081817388216, + "learning_rate": 1.9211822660098524e-06, + "loss": 4.1264, + "step": 40 + }, + { + "epoch": 0.060808305524657026, + "grad_norm": 10.930771289183832, + "learning_rate": 1.970443349753695e-06, + "loss": 3.9587, + "step": 41 + }, + { + "epoch": 0.06229143492769744, + "grad_norm": 12.576381736982768, + "learning_rate": 2.019704433497537e-06, + "loss": 3.9733, + "step": 42 + }, + { + "epoch": 0.06377456433073786, + "grad_norm": 11.16330239082774, + "learning_rate": 2.0689655172413796e-06, + "loss": 3.8086, + "step": 43 + }, + { + "epoch": 0.06525769373377827, + "grad_norm": 11.981188261168471, + "learning_rate": 2.118226600985222e-06, + "loss": 4.1407, + "step": 44 + }, + { + "epoch": 0.06674082313681869, + "grad_norm": 10.296831496973295, + "learning_rate": 2.1674876847290643e-06, + "loss": 3.7975, + "step": 45 + }, + { + "epoch": 0.0682239525398591, + "grad_norm": 12.55438842390697, + "learning_rate": 2.2167487684729067e-06, + "loss": 3.9727, + "step": 46 + }, + { + "epoch": 0.06970708194289951, + "grad_norm": 10.065237407503123, + "learning_rate": 2.266009852216749e-06, + "loss": 3.7744, + "step": 47 + }, + { + "epoch": 0.07119021134593993, + "grad_norm": 10.063124562788271, + "learning_rate": 2.315270935960591e-06, + "loss": 3.8153, + "step": 48 + }, + { + "epoch": 0.07267334074898035, + "grad_norm": 10.339553133552766, + "learning_rate": 2.3645320197044334e-06, + "loss": 3.7914, + "step": 49 + }, + { + "epoch": 0.07415647015202076, + "grad_norm": 42.46435753534301, + "learning_rate": 2.4137931034482762e-06, + "loss": 3.6383, + "step": 50 + }, + { + "epoch": 0.07563959955506118, + "grad_norm": 18.413902068387213, + "learning_rate": 2.4630541871921186e-06, + "loss": 3.8353, + "step": 51 + }, + { + "epoch": 0.0771227289581016, + "grad_norm": 10.748432076710364, + "learning_rate": 2.512315270935961e-06, + "loss": 3.8109, + "step": 52 + }, + { + "epoch": 0.078605858361142, + "grad_norm": 10.056233979597803, + "learning_rate": 2.5615763546798034e-06, + "loss": 3.7953, + "step": 53 + }, + { + "epoch": 0.08008898776418243, + "grad_norm": 10.243182810391009, + "learning_rate": 2.6108374384236458e-06, + "loss": 3.8993, + "step": 54 + }, + { + "epoch": 0.08157211716722285, + "grad_norm": 10.79427465966413, + "learning_rate": 2.660098522167488e-06, + "loss": 3.6984, + "step": 55 + }, + { + "epoch": 0.08305524657026325, + "grad_norm": 9.561717407005624, + "learning_rate": 2.70935960591133e-06, + "loss": 3.7998, + "step": 56 + }, + { + "epoch": 0.08453837597330367, + "grad_norm": 11.534631519533132, + "learning_rate": 2.7586206896551725e-06, + "loss": 3.732, + "step": 57 + }, + { + "epoch": 0.08602150537634409, + "grad_norm": 10.563451863537827, + "learning_rate": 2.807881773399015e-06, + "loss": 3.652, + "step": 58 + }, + { + "epoch": 0.0875046347793845, + "grad_norm": 10.25748249572424, + "learning_rate": 2.8571428571428573e-06, + "loss": 3.8213, + "step": 59 + }, + { + "epoch": 0.08898776418242492, + "grad_norm": 10.413807365282782, + "learning_rate": 2.9064039408866996e-06, + "loss": 3.7325, + "step": 60 + }, + { + "epoch": 0.09047089358546533, + "grad_norm": 9.178159078703844, + "learning_rate": 2.9556650246305424e-06, + "loss": 3.5833, + "step": 61 + }, + { + "epoch": 0.09195402298850575, + "grad_norm": 9.767210152688241, + "learning_rate": 3.004926108374385e-06, + "loss": 3.5544, + "step": 62 + }, + { + "epoch": 0.09343715239154617, + "grad_norm": 10.511622521159012, + "learning_rate": 3.054187192118227e-06, + "loss": 3.7755, + "step": 63 + }, + { + "epoch": 0.09492028179458657, + "grad_norm": 10.63704311244583, + "learning_rate": 3.103448275862069e-06, + "loss": 3.7053, + "step": 64 + }, + { + "epoch": 0.09640341119762699, + "grad_norm": 9.267229017725155, + "learning_rate": 3.1527093596059115e-06, + "loss": 3.616, + "step": 65 + }, + { + "epoch": 0.09788654060066741, + "grad_norm": 9.402808241012835, + "learning_rate": 3.201970443349754e-06, + "loss": 3.7965, + "step": 66 + }, + { + "epoch": 0.09936967000370782, + "grad_norm": 9.44635371685643, + "learning_rate": 3.2512315270935963e-06, + "loss": 3.5098, + "step": 67 + }, + { + "epoch": 0.10085279940674824, + "grad_norm": 8.506236974185102, + "learning_rate": 3.3004926108374387e-06, + "loss": 3.5503, + "step": 68 + }, + { + "epoch": 0.10233592880978866, + "grad_norm": 9.359702932483554, + "learning_rate": 3.349753694581281e-06, + "loss": 3.4986, + "step": 69 + }, + { + "epoch": 0.10381905821282907, + "grad_norm": 8.678890783170266, + "learning_rate": 3.399014778325123e-06, + "loss": 3.5729, + "step": 70 + }, + { + "epoch": 0.10530218761586949, + "grad_norm": 9.380159698189763, + "learning_rate": 3.448275862068966e-06, + "loss": 3.6107, + "step": 71 + }, + { + "epoch": 0.1067853170189099, + "grad_norm": 9.511301316274285, + "learning_rate": 3.497536945812808e-06, + "loss": 3.5754, + "step": 72 + }, + { + "epoch": 0.10826844642195031, + "grad_norm": 9.705386660707495, + "learning_rate": 3.5467980295566506e-06, + "loss": 3.5645, + "step": 73 + }, + { + "epoch": 0.10975157582499073, + "grad_norm": 11.002673507323347, + "learning_rate": 3.596059113300493e-06, + "loss": 3.7453, + "step": 74 + }, + { + "epoch": 0.11123470522803114, + "grad_norm": 9.37090772119277, + "learning_rate": 3.6453201970443354e-06, + "loss": 3.3921, + "step": 75 + }, + { + "epoch": 0.11271783463107156, + "grad_norm": 9.770124768269168, + "learning_rate": 3.6945812807881777e-06, + "loss": 3.6354, + "step": 76 + }, + { + "epoch": 0.11420096403411198, + "grad_norm": 9.497324615118266, + "learning_rate": 3.7438423645320197e-06, + "loss": 3.602, + "step": 77 + }, + { + "epoch": 0.11568409343715239, + "grad_norm": 8.773500689826756, + "learning_rate": 3.793103448275862e-06, + "loss": 3.4961, + "step": 78 + }, + { + "epoch": 0.1171672228401928, + "grad_norm": 8.96930003459756, + "learning_rate": 3.842364532019705e-06, + "loss": 3.538, + "step": 79 + }, + { + "epoch": 0.11865035224323323, + "grad_norm": 8.86922780561134, + "learning_rate": 3.891625615763547e-06, + "loss": 3.6722, + "step": 80 + }, + { + "epoch": 0.12013348164627363, + "grad_norm": 9.11283594667135, + "learning_rate": 3.94088669950739e-06, + "loss": 3.6318, + "step": 81 + }, + { + "epoch": 0.12161661104931405, + "grad_norm": 8.558070609569732, + "learning_rate": 3.990147783251232e-06, + "loss": 3.5067, + "step": 82 + }, + { + "epoch": 0.12309974045235447, + "grad_norm": 8.933509964939034, + "learning_rate": 4.039408866995074e-06, + "loss": 3.4603, + "step": 83 + }, + { + "epoch": 0.12458286985539488, + "grad_norm": 9.404412459599069, + "learning_rate": 4.088669950738917e-06, + "loss": 3.5714, + "step": 84 + }, + { + "epoch": 0.12606599925843529, + "grad_norm": 9.33155201589157, + "learning_rate": 4.137931034482759e-06, + "loss": 3.6407, + "step": 85 + }, + { + "epoch": 0.12754912866147572, + "grad_norm": 8.404889522419374, + "learning_rate": 4.1871921182266015e-06, + "loss": 3.5519, + "step": 86 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 9.762929585080268, + "learning_rate": 4.236453201970444e-06, + "loss": 3.5421, + "step": 87 + }, + { + "epoch": 0.13051538746755653, + "grad_norm": 8.198758001693763, + "learning_rate": 4.2857142857142855e-06, + "loss": 3.5361, + "step": 88 + }, + { + "epoch": 0.13199851687059697, + "grad_norm": 8.473498768946182, + "learning_rate": 4.334975369458129e-06, + "loss": 3.5182, + "step": 89 + }, + { + "epoch": 0.13348164627363737, + "grad_norm": 9.55274091418346, + "learning_rate": 4.384236453201971e-06, + "loss": 3.4824, + "step": 90 + }, + { + "epoch": 0.13496477567667778, + "grad_norm": 9.334403616383222, + "learning_rate": 4.4334975369458135e-06, + "loss": 3.7305, + "step": 91 + }, + { + "epoch": 0.1364479050797182, + "grad_norm": 8.994343479203629, + "learning_rate": 4.482758620689656e-06, + "loss": 3.5324, + "step": 92 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 9.281003029942129, + "learning_rate": 4.532019704433498e-06, + "loss": 3.5702, + "step": 93 + }, + { + "epoch": 0.13941416388579903, + "grad_norm": 8.962951773025738, + "learning_rate": 4.581280788177341e-06, + "loss": 3.6954, + "step": 94 + }, + { + "epoch": 0.14089729328883946, + "grad_norm": 8.931580789236714, + "learning_rate": 4.630541871921182e-06, + "loss": 3.5869, + "step": 95 + }, + { + "epoch": 0.14238042269187987, + "grad_norm": 10.07812495312495, + "learning_rate": 4.6798029556650245e-06, + "loss": 3.7184, + "step": 96 + }, + { + "epoch": 0.14386355209492027, + "grad_norm": 9.265109522694898, + "learning_rate": 4.729064039408867e-06, + "loss": 3.5581, + "step": 97 + }, + { + "epoch": 0.1453466814979607, + "grad_norm": 9.805821795818115, + "learning_rate": 4.77832512315271e-06, + "loss": 3.5113, + "step": 98 + }, + { + "epoch": 0.1468298109010011, + "grad_norm": 8.938342209604796, + "learning_rate": 4.8275862068965525e-06, + "loss": 3.6062, + "step": 99 + }, + { + "epoch": 0.14831294030404152, + "grad_norm": 9.446263673901287, + "learning_rate": 4.876847290640395e-06, + "loss": 3.4988, + "step": 100 + }, + { + "epoch": 0.14979606970708195, + "grad_norm": 9.244354090176463, + "learning_rate": 4.926108374384237e-06, + "loss": 3.6043, + "step": 101 + }, + { + "epoch": 0.15127919911012236, + "grad_norm": 8.502655039940205, + "learning_rate": 4.97536945812808e-06, + "loss": 3.4744, + "step": 102 + }, + { + "epoch": 0.15276232851316277, + "grad_norm": 8.983298330579707, + "learning_rate": 5.024630541871922e-06, + "loss": 3.5189, + "step": 103 + }, + { + "epoch": 0.1542454579162032, + "grad_norm": 8.838976555832478, + "learning_rate": 5.073891625615764e-06, + "loss": 3.426, + "step": 104 + }, + { + "epoch": 0.1557285873192436, + "grad_norm": 8.74703538883808, + "learning_rate": 5.123152709359607e-06, + "loss": 3.4485, + "step": 105 + }, + { + "epoch": 0.157211716722284, + "grad_norm": 8.354370621517777, + "learning_rate": 5.172413793103449e-06, + "loss": 3.2482, + "step": 106 + }, + { + "epoch": 0.15869484612532445, + "grad_norm": 9.282025175629979, + "learning_rate": 5.2216748768472915e-06, + "loss": 3.5482, + "step": 107 + }, + { + "epoch": 0.16017797552836485, + "grad_norm": 9.280469026468948, + "learning_rate": 5.270935960591134e-06, + "loss": 3.4907, + "step": 108 + }, + { + "epoch": 0.16166110493140526, + "grad_norm": 8.764832939374172, + "learning_rate": 5.320197044334976e-06, + "loss": 3.517, + "step": 109 + }, + { + "epoch": 0.1631442343344457, + "grad_norm": 8.563532245291059, + "learning_rate": 5.369458128078819e-06, + "loss": 3.4879, + "step": 110 + }, + { + "epoch": 0.1646273637374861, + "grad_norm": 8.659377892219055, + "learning_rate": 5.41871921182266e-06, + "loss": 3.5239, + "step": 111 + }, + { + "epoch": 0.1661104931405265, + "grad_norm": 8.826122106821474, + "learning_rate": 5.467980295566503e-06, + "loss": 3.4206, + "step": 112 + }, + { + "epoch": 0.16759362254356694, + "grad_norm": 9.16732344883001, + "learning_rate": 5.517241379310345e-06, + "loss": 3.6734, + "step": 113 + }, + { + "epoch": 0.16907675194660735, + "grad_norm": 8.544384389768751, + "learning_rate": 5.566502463054187e-06, + "loss": 3.6678, + "step": 114 + }, + { + "epoch": 0.17055988134964775, + "grad_norm": 8.965585711070169, + "learning_rate": 5.61576354679803e-06, + "loss": 3.4211, + "step": 115 + }, + { + "epoch": 0.17204301075268819, + "grad_norm": 8.339558268471254, + "learning_rate": 5.665024630541872e-06, + "loss": 3.3798, + "step": 116 + }, + { + "epoch": 0.1735261401557286, + "grad_norm": 9.129157145615734, + "learning_rate": 5.7142857142857145e-06, + "loss": 3.4474, + "step": 117 + }, + { + "epoch": 0.175009269558769, + "grad_norm": 8.303526785813736, + "learning_rate": 5.763546798029557e-06, + "loss": 3.5371, + "step": 118 + }, + { + "epoch": 0.1764923989618094, + "grad_norm": 8.127883761600303, + "learning_rate": 5.812807881773399e-06, + "loss": 3.4379, + "step": 119 + }, + { + "epoch": 0.17797552836484984, + "grad_norm": 9.022932102259254, + "learning_rate": 5.862068965517242e-06, + "loss": 3.5211, + "step": 120 + }, + { + "epoch": 0.17945865776789025, + "grad_norm": 9.402516532091614, + "learning_rate": 5.911330049261085e-06, + "loss": 3.6276, + "step": 121 + }, + { + "epoch": 0.18094178717093065, + "grad_norm": 8.34588951372605, + "learning_rate": 5.960591133004927e-06, + "loss": 3.5206, + "step": 122 + }, + { + "epoch": 0.18242491657397109, + "grad_norm": 9.7524693479868, + "learning_rate": 6.00985221674877e-06, + "loss": 3.5555, + "step": 123 + }, + { + "epoch": 0.1839080459770115, + "grad_norm": 8.806598417601116, + "learning_rate": 6.059113300492612e-06, + "loss": 3.5676, + "step": 124 + }, + { + "epoch": 0.1853911753800519, + "grad_norm": 8.864474483395536, + "learning_rate": 6.108374384236454e-06, + "loss": 3.6244, + "step": 125 + }, + { + "epoch": 0.18687430478309233, + "grad_norm": 9.387806382882287, + "learning_rate": 6.157635467980296e-06, + "loss": 3.4474, + "step": 126 + }, + { + "epoch": 0.18835743418613274, + "grad_norm": 8.767274701707183, + "learning_rate": 6.206896551724138e-06, + "loss": 3.4641, + "step": 127 + }, + { + "epoch": 0.18984056358917314, + "grad_norm": 167.67880314037592, + "learning_rate": 6.256157635467981e-06, + "loss": 3.4971, + "step": 128 + }, + { + "epoch": 0.19132369299221358, + "grad_norm": 8.865684429597623, + "learning_rate": 6.305418719211823e-06, + "loss": 3.6982, + "step": 129 + }, + { + "epoch": 0.19280682239525399, + "grad_norm": 8.82730837744521, + "learning_rate": 6.3546798029556655e-06, + "loss": 3.6028, + "step": 130 + }, + { + "epoch": 0.1942899517982944, + "grad_norm": 8.913313734218653, + "learning_rate": 6.403940886699508e-06, + "loss": 3.539, + "step": 131 + }, + { + "epoch": 0.19577308120133483, + "grad_norm": 9.727457870895694, + "learning_rate": 6.45320197044335e-06, + "loss": 3.4723, + "step": 132 + }, + { + "epoch": 0.19725621060437523, + "grad_norm": 9.10954775849272, + "learning_rate": 6.502463054187193e-06, + "loss": 3.3293, + "step": 133 + }, + { + "epoch": 0.19873934000741564, + "grad_norm": 8.980425743786485, + "learning_rate": 6.551724137931035e-06, + "loss": 3.3958, + "step": 134 + }, + { + "epoch": 0.20022246941045607, + "grad_norm": 9.35053590465394, + "learning_rate": 6.600985221674877e-06, + "loss": 3.6062, + "step": 135 + }, + { + "epoch": 0.20170559881349648, + "grad_norm": 8.78251480289063, + "learning_rate": 6.65024630541872e-06, + "loss": 3.4308, + "step": 136 + }, + { + "epoch": 0.20318872821653688, + "grad_norm": 9.622407837159372, + "learning_rate": 6.699507389162562e-06, + "loss": 3.555, + "step": 137 + }, + { + "epoch": 0.20467185761957732, + "grad_norm": 9.150041372807953, + "learning_rate": 6.748768472906404e-06, + "loss": 3.563, + "step": 138 + }, + { + "epoch": 0.20615498702261773, + "grad_norm": 9.117941548708622, + "learning_rate": 6.798029556650246e-06, + "loss": 3.4626, + "step": 139 + }, + { + "epoch": 0.20763811642565813, + "grad_norm": 8.640357832111434, + "learning_rate": 6.84729064039409e-06, + "loss": 3.3431, + "step": 140 + }, + { + "epoch": 0.20912124582869857, + "grad_norm": 8.566466852133203, + "learning_rate": 6.896551724137932e-06, + "loss": 3.4697, + "step": 141 + }, + { + "epoch": 0.21060437523173897, + "grad_norm": 9.366148789281219, + "learning_rate": 6.945812807881774e-06, + "loss": 3.5536, + "step": 142 + }, + { + "epoch": 0.21208750463477938, + "grad_norm": 9.98403658213693, + "learning_rate": 6.995073891625616e-06, + "loss": 3.5977, + "step": 143 + }, + { + "epoch": 0.2135706340378198, + "grad_norm": 8.103157288404917, + "learning_rate": 7.044334975369459e-06, + "loss": 3.484, + "step": 144 + }, + { + "epoch": 0.21505376344086022, + "grad_norm": 11.725336192550499, + "learning_rate": 7.093596059113301e-06, + "loss": 3.532, + "step": 145 + }, + { + "epoch": 0.21653689284390062, + "grad_norm": 8.95234543153316, + "learning_rate": 7.1428571428571436e-06, + "loss": 3.6035, + "step": 146 + }, + { + "epoch": 0.21802002224694106, + "grad_norm": 10.105572733645126, + "learning_rate": 7.192118226600986e-06, + "loss": 3.5081, + "step": 147 + }, + { + "epoch": 0.21950315164998146, + "grad_norm": 18.304313476305378, + "learning_rate": 7.241379310344828e-06, + "loss": 3.6477, + "step": 148 + }, + { + "epoch": 0.22098628105302187, + "grad_norm": 9.521125784351714, + "learning_rate": 7.290640394088671e-06, + "loss": 3.5722, + "step": 149 + }, + { + "epoch": 0.22246941045606228, + "grad_norm": 9.42181833191413, + "learning_rate": 7.339901477832513e-06, + "loss": 3.5488, + "step": 150 + }, + { + "epoch": 0.2239525398591027, + "grad_norm": 37.13761762782279, + "learning_rate": 7.3891625615763555e-06, + "loss": 3.4834, + "step": 151 + }, + { + "epoch": 0.22543566926214312, + "grad_norm": 9.906415760301511, + "learning_rate": 7.438423645320198e-06, + "loss": 3.6363, + "step": 152 + }, + { + "epoch": 0.22691879866518352, + "grad_norm": 9.192106688121855, + "learning_rate": 7.487684729064039e-06, + "loss": 3.4503, + "step": 153 + }, + { + "epoch": 0.22840192806822396, + "grad_norm": 10.74340910545077, + "learning_rate": 7.536945812807882e-06, + "loss": 3.5492, + "step": 154 + }, + { + "epoch": 0.22988505747126436, + "grad_norm": 8.24816374291012, + "learning_rate": 7.586206896551724e-06, + "loss": 3.4559, + "step": 155 + }, + { + "epoch": 0.23136818687430477, + "grad_norm": 8.81105080977679, + "learning_rate": 7.635467980295567e-06, + "loss": 3.6603, + "step": 156 + }, + { + "epoch": 0.2328513162773452, + "grad_norm": 8.88278061685679, + "learning_rate": 7.68472906403941e-06, + "loss": 3.3937, + "step": 157 + }, + { + "epoch": 0.2343344456803856, + "grad_norm": 8.476313062328968, + "learning_rate": 7.733990147783253e-06, + "loss": 3.5536, + "step": 158 + }, + { + "epoch": 0.23581757508342602, + "grad_norm": 8.515427897551369, + "learning_rate": 7.783251231527095e-06, + "loss": 3.3907, + "step": 159 + }, + { + "epoch": 0.23730070448646645, + "grad_norm": 9.492208795041105, + "learning_rate": 7.832512315270938e-06, + "loss": 3.476, + "step": 160 + }, + { + "epoch": 0.23878383388950686, + "grad_norm": 8.287049816886917, + "learning_rate": 7.88177339901478e-06, + "loss": 3.4293, + "step": 161 + }, + { + "epoch": 0.24026696329254726, + "grad_norm": 8.993420645065115, + "learning_rate": 7.93103448275862e-06, + "loss": 3.5285, + "step": 162 + }, + { + "epoch": 0.2417500926955877, + "grad_norm": 9.203569011279667, + "learning_rate": 7.980295566502464e-06, + "loss": 3.5561, + "step": 163 + }, + { + "epoch": 0.2432332220986281, + "grad_norm": 9.70139138108913, + "learning_rate": 8.029556650246306e-06, + "loss": 3.4426, + "step": 164 + }, + { + "epoch": 0.2447163515016685, + "grad_norm": 8.730702158447457, + "learning_rate": 8.078817733990149e-06, + "loss": 3.4983, + "step": 165 + }, + { + "epoch": 0.24619948090470894, + "grad_norm": 7.918048392189836, + "learning_rate": 8.12807881773399e-06, + "loss": 3.4098, + "step": 166 + }, + { + "epoch": 0.24768261030774935, + "grad_norm": 9.349340895818644, + "learning_rate": 8.177339901477834e-06, + "loss": 3.3562, + "step": 167 + }, + { + "epoch": 0.24916573971078976, + "grad_norm": 8.290485494638787, + "learning_rate": 8.226600985221675e-06, + "loss": 3.3523, + "step": 168 + }, + { + "epoch": 0.25064886911383016, + "grad_norm": 9.72578187146945, + "learning_rate": 8.275862068965518e-06, + "loss": 3.5561, + "step": 169 + }, + { + "epoch": 0.25213199851687057, + "grad_norm": 11.881317350088342, + "learning_rate": 8.32512315270936e-06, + "loss": 3.5852, + "step": 170 + }, + { + "epoch": 0.25361512791991103, + "grad_norm": 9.477031833313044, + "learning_rate": 8.374384236453203e-06, + "loss": 3.3086, + "step": 171 + }, + { + "epoch": 0.25509825732295144, + "grad_norm": 9.178188428968317, + "learning_rate": 8.423645320197045e-06, + "loss": 3.4426, + "step": 172 + }, + { + "epoch": 0.25658138672599184, + "grad_norm": 9.084571314493976, + "learning_rate": 8.472906403940888e-06, + "loss": 3.5641, + "step": 173 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 8.856712920842773, + "learning_rate": 8.52216748768473e-06, + "loss": 3.3968, + "step": 174 + }, + { + "epoch": 0.25954764553207266, + "grad_norm": 9.243049172774835, + "learning_rate": 8.571428571428571e-06, + "loss": 3.4305, + "step": 175 + }, + { + "epoch": 0.26103077493511306, + "grad_norm": 10.6328077384977, + "learning_rate": 8.620689655172414e-06, + "loss": 3.6363, + "step": 176 + }, + { + "epoch": 0.2625139043381535, + "grad_norm": 8.425524573317162, + "learning_rate": 8.669950738916257e-06, + "loss": 3.4179, + "step": 177 + }, + { + "epoch": 0.26399703374119393, + "grad_norm": 9.652386370475945, + "learning_rate": 8.719211822660099e-06, + "loss": 3.4429, + "step": 178 + }, + { + "epoch": 0.26548016314423434, + "grad_norm": 9.826231077446234, + "learning_rate": 8.768472906403942e-06, + "loss": 3.5028, + "step": 179 + }, + { + "epoch": 0.26696329254727474, + "grad_norm": 8.954709261358241, + "learning_rate": 8.817733990147784e-06, + "loss": 3.5224, + "step": 180 + }, + { + "epoch": 0.26844642195031515, + "grad_norm": 8.93782383894103, + "learning_rate": 8.866995073891627e-06, + "loss": 3.4307, + "step": 181 + }, + { + "epoch": 0.26992955135335556, + "grad_norm": 8.942186817814973, + "learning_rate": 8.916256157635468e-06, + "loss": 3.302, + "step": 182 + }, + { + "epoch": 0.271412680756396, + "grad_norm": 8.966104627670525, + "learning_rate": 8.965517241379312e-06, + "loss": 3.4418, + "step": 183 + }, + { + "epoch": 0.2728958101594364, + "grad_norm": 8.599562123924498, + "learning_rate": 9.014778325123153e-06, + "loss": 3.4689, + "step": 184 + }, + { + "epoch": 0.27437893956247683, + "grad_norm": 8.480254655924414, + "learning_rate": 9.064039408866996e-06, + "loss": 3.4207, + "step": 185 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 9.830286791744449, + "learning_rate": 9.113300492610838e-06, + "loss": 3.6131, + "step": 186 + }, + { + "epoch": 0.27734519836855764, + "grad_norm": 9.149031693244932, + "learning_rate": 9.162561576354681e-06, + "loss": 3.5471, + "step": 187 + }, + { + "epoch": 0.27882832777159805, + "grad_norm": 9.7818436468955, + "learning_rate": 9.211822660098523e-06, + "loss": 3.6551, + "step": 188 + }, + { + "epoch": 0.2803114571746385, + "grad_norm": 9.51355611110903, + "learning_rate": 9.261083743842364e-06, + "loss": 3.4622, + "step": 189 + }, + { + "epoch": 0.2817945865776789, + "grad_norm": 9.075972307665362, + "learning_rate": 9.310344827586207e-06, + "loss": 3.4964, + "step": 190 + }, + { + "epoch": 0.2832777159807193, + "grad_norm": 8.177852104963424, + "learning_rate": 9.359605911330049e-06, + "loss": 3.4281, + "step": 191 + }, + { + "epoch": 0.28476084538375973, + "grad_norm": 9.93511793753745, + "learning_rate": 9.408866995073892e-06, + "loss": 3.4385, + "step": 192 + }, + { + "epoch": 0.28624397478680014, + "grad_norm": 8.395570199701872, + "learning_rate": 9.458128078817734e-06, + "loss": 3.4584, + "step": 193 + }, + { + "epoch": 0.28772710418984054, + "grad_norm": 8.378569262101896, + "learning_rate": 9.507389162561577e-06, + "loss": 3.4149, + "step": 194 + }, + { + "epoch": 0.289210233592881, + "grad_norm": 8.672975021568345, + "learning_rate": 9.55665024630542e-06, + "loss": 3.3879, + "step": 195 + }, + { + "epoch": 0.2906933629959214, + "grad_norm": 8.111395108170894, + "learning_rate": 9.605911330049262e-06, + "loss": 3.4286, + "step": 196 + }, + { + "epoch": 0.2921764923989618, + "grad_norm": 8.805267993822593, + "learning_rate": 9.655172413793105e-06, + "loss": 3.4448, + "step": 197 + }, + { + "epoch": 0.2936596218020022, + "grad_norm": 8.485677229483002, + "learning_rate": 9.704433497536947e-06, + "loss": 3.4163, + "step": 198 + }, + { + "epoch": 0.29514275120504263, + "grad_norm": 9.662438212410397, + "learning_rate": 9.75369458128079e-06, + "loss": 3.4624, + "step": 199 + }, + { + "epoch": 0.29662588060808304, + "grad_norm": 8.410423810533182, + "learning_rate": 9.802955665024631e-06, + "loss": 3.5336, + "step": 200 + }, + { + "epoch": 0.29810901001112344, + "grad_norm": 8.146164889204526, + "learning_rate": 9.852216748768475e-06, + "loss": 3.275, + "step": 201 + }, + { + "epoch": 0.2995921394141639, + "grad_norm": 8.718153920100978, + "learning_rate": 9.901477832512316e-06, + "loss": 3.5585, + "step": 202 + }, + { + "epoch": 0.3010752688172043, + "grad_norm": 9.049940223993339, + "learning_rate": 9.95073891625616e-06, + "loss": 3.4347, + "step": 203 + }, + { + "epoch": 0.3025583982202447, + "grad_norm": 9.818435034024272, + "learning_rate": 1e-05, + "loss": 3.5345, + "step": 204 + }, + { + "epoch": 0.3040415276232851, + "grad_norm": 8.701826208571221, + "learning_rate": 9.999992567363374e-06, + "loss": 3.5581, + "step": 205 + }, + { + "epoch": 0.30552465702632553, + "grad_norm": 8.562041681524851, + "learning_rate": 9.999970269475589e-06, + "loss": 3.4539, + "step": 206 + }, + { + "epoch": 0.30700778642936594, + "grad_norm": 8.265657512806557, + "learning_rate": 9.99993310640294e-06, + "loss": 3.2849, + "step": 207 + }, + { + "epoch": 0.3084909158324064, + "grad_norm": 9.290774600011824, + "learning_rate": 9.999881078255916e-06, + "loss": 3.5155, + "step": 208 + }, + { + "epoch": 0.3099740452354468, + "grad_norm": 9.343079945518097, + "learning_rate": 9.999814185189195e-06, + "loss": 3.599, + "step": 209 + }, + { + "epoch": 0.3114571746384872, + "grad_norm": 8.890612074495255, + "learning_rate": 9.99973242740166e-06, + "loss": 3.3872, + "step": 210 + }, + { + "epoch": 0.3129403040415276, + "grad_norm": 9.368128076726636, + "learning_rate": 9.99963580513638e-06, + "loss": 3.3795, + "step": 211 + }, + { + "epoch": 0.314423433444568, + "grad_norm": 9.42123737324935, + "learning_rate": 9.99952431868061e-06, + "loss": 3.4233, + "step": 212 + }, + { + "epoch": 0.31590656284760843, + "grad_norm": 8.629402980615994, + "learning_rate": 9.999397968365817e-06, + "loss": 3.4649, + "step": 213 + }, + { + "epoch": 0.3173896922506489, + "grad_norm": 9.263680332467354, + "learning_rate": 9.999256754567641e-06, + "loss": 3.5783, + "step": 214 + }, + { + "epoch": 0.3188728216536893, + "grad_norm": 8.461217623532413, + "learning_rate": 9.99910067770592e-06, + "loss": 3.2666, + "step": 215 + }, + { + "epoch": 0.3203559510567297, + "grad_norm": 8.467824793318055, + "learning_rate": 9.998929738244678e-06, + "loss": 3.4434, + "step": 216 + }, + { + "epoch": 0.3218390804597701, + "grad_norm": 8.246750640165386, + "learning_rate": 9.998743936692127e-06, + "loss": 3.4513, + "step": 217 + }, + { + "epoch": 0.3233222098628105, + "grad_norm": 9.408072522814093, + "learning_rate": 9.998543273600668e-06, + "loss": 3.3993, + "step": 218 + }, + { + "epoch": 0.3248053392658509, + "grad_norm": 9.234920794421292, + "learning_rate": 9.998327749566881e-06, + "loss": 3.5168, + "step": 219 + }, + { + "epoch": 0.3262884686688914, + "grad_norm": 8.844523891049052, + "learning_rate": 9.998097365231532e-06, + "loss": 3.5234, + "step": 220 + }, + { + "epoch": 0.3277715980719318, + "grad_norm": 8.896403311141372, + "learning_rate": 9.997852121279563e-06, + "loss": 3.4836, + "step": 221 + }, + { + "epoch": 0.3292547274749722, + "grad_norm": 8.699430600527332, + "learning_rate": 9.997592018440102e-06, + "loss": 3.3362, + "step": 222 + }, + { + "epoch": 0.3307378568780126, + "grad_norm": 8.621433873408437, + "learning_rate": 9.997317057486447e-06, + "loss": 3.3661, + "step": 223 + }, + { + "epoch": 0.332220986281053, + "grad_norm": 8.426435028991154, + "learning_rate": 9.997027239236072e-06, + "loss": 3.535, + "step": 224 + }, + { + "epoch": 0.3337041156840934, + "grad_norm": 8.655119689700523, + "learning_rate": 9.996722564550623e-06, + "loss": 3.3772, + "step": 225 + }, + { + "epoch": 0.3351872450871339, + "grad_norm": 8.817945756049772, + "learning_rate": 9.996403034335912e-06, + "loss": 3.456, + "step": 226 + }, + { + "epoch": 0.3366703744901743, + "grad_norm": 8.296859299746801, + "learning_rate": 9.996068649541923e-06, + "loss": 3.5761, + "step": 227 + }, + { + "epoch": 0.3381535038932147, + "grad_norm": 8.756854039049074, + "learning_rate": 9.995719411162798e-06, + "loss": 3.4473, + "step": 228 + }, + { + "epoch": 0.3396366332962551, + "grad_norm": 8.664579742701289, + "learning_rate": 9.995355320236846e-06, + "loss": 3.4817, + "step": 229 + }, + { + "epoch": 0.3411197626992955, + "grad_norm": 9.79098365989231, + "learning_rate": 9.994976377846523e-06, + "loss": 3.429, + "step": 230 + }, + { + "epoch": 0.3426028921023359, + "grad_norm": 8.894301927471451, + "learning_rate": 9.994582585118449e-06, + "loss": 3.4215, + "step": 231 + }, + { + "epoch": 0.34408602150537637, + "grad_norm": 8.355394404940597, + "learning_rate": 9.994173943223392e-06, + "loss": 3.5014, + "step": 232 + }, + { + "epoch": 0.3455691509084168, + "grad_norm": 11.228795632371831, + "learning_rate": 9.993750453376266e-06, + "loss": 3.4935, + "step": 233 + }, + { + "epoch": 0.3470522803114572, + "grad_norm": 8.384444193724555, + "learning_rate": 9.993312116836127e-06, + "loss": 3.4376, + "step": 234 + }, + { + "epoch": 0.3485354097144976, + "grad_norm": 8.305981724364415, + "learning_rate": 9.992858934906175e-06, + "loss": 3.512, + "step": 235 + }, + { + "epoch": 0.350018539117538, + "grad_norm": 8.669593928888522, + "learning_rate": 9.992390908933746e-06, + "loss": 3.323, + "step": 236 + }, + { + "epoch": 0.3515016685205784, + "grad_norm": 8.32645155538179, + "learning_rate": 9.991908040310307e-06, + "loss": 3.3083, + "step": 237 + }, + { + "epoch": 0.3529847979236188, + "grad_norm": 9.795673018688456, + "learning_rate": 9.991410330471452e-06, + "loss": 3.4816, + "step": 238 + }, + { + "epoch": 0.35446792732665927, + "grad_norm": 8.745428886265744, + "learning_rate": 9.990897780896897e-06, + "loss": 3.5892, + "step": 239 + }, + { + "epoch": 0.3559510567296997, + "grad_norm": 8.940980366492123, + "learning_rate": 9.990370393110484e-06, + "loss": 3.5564, + "step": 240 + }, + { + "epoch": 0.3574341861327401, + "grad_norm": 8.287350421262646, + "learning_rate": 9.989828168680164e-06, + "loss": 3.489, + "step": 241 + }, + { + "epoch": 0.3589173155357805, + "grad_norm": 8.645952742677238, + "learning_rate": 9.989271109217999e-06, + "loss": 3.4653, + "step": 242 + }, + { + "epoch": 0.3604004449388209, + "grad_norm": 8.37388022299681, + "learning_rate": 9.988699216380157e-06, + "loss": 3.4577, + "step": 243 + }, + { + "epoch": 0.3618835743418613, + "grad_norm": 8.493105878695916, + "learning_rate": 9.988112491866908e-06, + "loss": 3.2855, + "step": 244 + }, + { + "epoch": 0.36336670374490176, + "grad_norm": 7.973140017770347, + "learning_rate": 9.987510937422616e-06, + "loss": 3.5351, + "step": 245 + }, + { + "epoch": 0.36484983314794217, + "grad_norm": 8.346256759054805, + "learning_rate": 9.986894554835735e-06, + "loss": 3.4156, + "step": 246 + }, + { + "epoch": 0.3663329625509826, + "grad_norm": 8.483756239194143, + "learning_rate": 9.986263345938805e-06, + "loss": 3.4504, + "step": 247 + }, + { + "epoch": 0.367816091954023, + "grad_norm": 8.246553781213507, + "learning_rate": 9.985617312608442e-06, + "loss": 3.5284, + "step": 248 + }, + { + "epoch": 0.3692992213570634, + "grad_norm": 8.721207129306897, + "learning_rate": 9.98495645676534e-06, + "loss": 3.3601, + "step": 249 + }, + { + "epoch": 0.3707823507601038, + "grad_norm": 8.492290748752273, + "learning_rate": 9.98428078037426e-06, + "loss": 3.3333, + "step": 250 + }, + { + "epoch": 0.37226548016314426, + "grad_norm": 8.340700003443796, + "learning_rate": 9.983590285444025e-06, + "loss": 3.42, + "step": 251 + }, + { + "epoch": 0.37374860956618466, + "grad_norm": 8.621171547507993, + "learning_rate": 9.982884974027513e-06, + "loss": 3.5175, + "step": 252 + }, + { + "epoch": 0.37523173896922507, + "grad_norm": 8.868005097799156, + "learning_rate": 9.982164848221652e-06, + "loss": 3.4758, + "step": 253 + }, + { + "epoch": 0.3767148683722655, + "grad_norm": 8.644589249708106, + "learning_rate": 9.981429910167419e-06, + "loss": 3.3553, + "step": 254 + }, + { + "epoch": 0.3781979977753059, + "grad_norm": 8.734162968226386, + "learning_rate": 9.980680162049823e-06, + "loss": 3.5571, + "step": 255 + }, + { + "epoch": 0.3796811271783463, + "grad_norm": 8.374640111655523, + "learning_rate": 9.979915606097907e-06, + "loss": 3.5655, + "step": 256 + }, + { + "epoch": 0.38116425658138675, + "grad_norm": 8.355547168107048, + "learning_rate": 9.979136244584737e-06, + "loss": 3.5089, + "step": 257 + }, + { + "epoch": 0.38264738598442716, + "grad_norm": 8.519982171120624, + "learning_rate": 9.978342079827396e-06, + "loss": 3.4022, + "step": 258 + }, + { + "epoch": 0.38413051538746756, + "grad_norm": 8.06875124193747, + "learning_rate": 9.977533114186981e-06, + "loss": 3.3242, + "step": 259 + }, + { + "epoch": 0.38561364479050797, + "grad_norm": 8.839260449700719, + "learning_rate": 9.976709350068592e-06, + "loss": 3.4626, + "step": 260 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 7.9184363551785975, + "learning_rate": 9.975870789921322e-06, + "loss": 3.4238, + "step": 261 + }, + { + "epoch": 0.3885799035965888, + "grad_norm": 8.424751782929864, + "learning_rate": 9.97501743623826e-06, + "loss": 3.3841, + "step": 262 + }, + { + "epoch": 0.39006303299962924, + "grad_norm": 9.140355384115743, + "learning_rate": 9.974149291556468e-06, + "loss": 3.4903, + "step": 263 + }, + { + "epoch": 0.39154616240266965, + "grad_norm": 8.312371279686516, + "learning_rate": 9.97326635845699e-06, + "loss": 3.3761, + "step": 264 + }, + { + "epoch": 0.39302929180571006, + "grad_norm": 8.863309289515094, + "learning_rate": 9.972368639564836e-06, + "loss": 3.3967, + "step": 265 + }, + { + "epoch": 0.39451242120875046, + "grad_norm": 8.25527672370233, + "learning_rate": 9.971456137548971e-06, + "loss": 3.5106, + "step": 266 + }, + { + "epoch": 0.39599555061179087, + "grad_norm": 8.035829163943328, + "learning_rate": 9.970528855122316e-06, + "loss": 3.4748, + "step": 267 + }, + { + "epoch": 0.3974786800148313, + "grad_norm": 8.873554826943744, + "learning_rate": 9.96958679504173e-06, + "loss": 3.4018, + "step": 268 + }, + { + "epoch": 0.3989618094178717, + "grad_norm": 8.091316482817643, + "learning_rate": 9.96862996010801e-06, + "loss": 3.5028, + "step": 269 + }, + { + "epoch": 0.40044493882091214, + "grad_norm": 8.032802753693392, + "learning_rate": 9.967658353165877e-06, + "loss": 3.5306, + "step": 270 + }, + { + "epoch": 0.40192806822395255, + "grad_norm": 9.112492898109428, + "learning_rate": 9.966671977103972e-06, + "loss": 3.5796, + "step": 271 + }, + { + "epoch": 0.40341119762699296, + "grad_norm": 8.451224203410149, + "learning_rate": 9.965670834854848e-06, + "loss": 3.4231, + "step": 272 + }, + { + "epoch": 0.40489432703003336, + "grad_norm": 7.759161304411397, + "learning_rate": 9.964654929394952e-06, + "loss": 3.6709, + "step": 273 + }, + { + "epoch": 0.40637745643307377, + "grad_norm": 8.300614109893665, + "learning_rate": 9.96362426374463e-06, + "loss": 3.3799, + "step": 274 + }, + { + "epoch": 0.4078605858361142, + "grad_norm": 8.739366720220394, + "learning_rate": 9.962578840968102e-06, + "loss": 3.6521, + "step": 275 + }, + { + "epoch": 0.40934371523915464, + "grad_norm": 8.689977881194311, + "learning_rate": 9.961518664173473e-06, + "loss": 3.5219, + "step": 276 + }, + { + "epoch": 0.41082684464219504, + "grad_norm": 8.241147714067571, + "learning_rate": 9.960443736512701e-06, + "loss": 3.3605, + "step": 277 + }, + { + "epoch": 0.41230997404523545, + "grad_norm": 8.308786564207265, + "learning_rate": 9.959354061181609e-06, + "loss": 3.3815, + "step": 278 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 8.846412898552165, + "learning_rate": 9.958249641419861e-06, + "loss": 3.4968, + "step": 279 + }, + { + "epoch": 0.41527623285131626, + "grad_norm": 8.649234869560674, + "learning_rate": 9.957130480510954e-06, + "loss": 3.5672, + "step": 280 + }, + { + "epoch": 0.41675936225435667, + "grad_norm": 8.534043950334443, + "learning_rate": 9.955996581782218e-06, + "loss": 3.4295, + "step": 281 + }, + { + "epoch": 0.41824249165739713, + "grad_norm": 8.269135014882256, + "learning_rate": 9.954847948604795e-06, + "loss": 3.4125, + "step": 282 + }, + { + "epoch": 0.41972562106043754, + "grad_norm": 8.283669169010219, + "learning_rate": 9.953684584393633e-06, + "loss": 3.3098, + "step": 283 + }, + { + "epoch": 0.42120875046347794, + "grad_norm": 9.010605302042705, + "learning_rate": 9.952506492607477e-06, + "loss": 3.4354, + "step": 284 + }, + { + "epoch": 0.42269187986651835, + "grad_norm": 8.01228893327879, + "learning_rate": 9.95131367674886e-06, + "loss": 3.3221, + "step": 285 + }, + { + "epoch": 0.42417500926955876, + "grad_norm": 8.466937711338495, + "learning_rate": 9.950106140364089e-06, + "loss": 3.3937, + "step": 286 + }, + { + "epoch": 0.42565813867259916, + "grad_norm": 7.955147688530683, + "learning_rate": 9.948883887043232e-06, + "loss": 3.5524, + "step": 287 + }, + { + "epoch": 0.4271412680756396, + "grad_norm": 8.593707435902274, + "learning_rate": 9.947646920420121e-06, + "loss": 3.4388, + "step": 288 + }, + { + "epoch": 0.42862439747868003, + "grad_norm": 9.01564719320856, + "learning_rate": 9.94639524417232e-06, + "loss": 3.5393, + "step": 289 + }, + { + "epoch": 0.43010752688172044, + "grad_norm": 8.52187793535969, + "learning_rate": 9.945128862021133e-06, + "loss": 3.4738, + "step": 290 + }, + { + "epoch": 0.43159065628476084, + "grad_norm": 8.453560463528552, + "learning_rate": 9.943847777731584e-06, + "loss": 3.5047, + "step": 291 + }, + { + "epoch": 0.43307378568780125, + "grad_norm": 8.435890541167167, + "learning_rate": 9.942551995112405e-06, + "loss": 3.4093, + "step": 292 + }, + { + "epoch": 0.43455691509084166, + "grad_norm": 8.574698536345085, + "learning_rate": 9.94124151801603e-06, + "loss": 3.4316, + "step": 293 + }, + { + "epoch": 0.4360400444938821, + "grad_norm": 8.485958380662401, + "learning_rate": 9.939916350338582e-06, + "loss": 3.5205, + "step": 294 + }, + { + "epoch": 0.4375231738969225, + "grad_norm": 7.580180150242579, + "learning_rate": 9.938576496019849e-06, + "loss": 3.353, + "step": 295 + }, + { + "epoch": 0.43900630329996293, + "grad_norm": 7.776764294920078, + "learning_rate": 9.937221959043294e-06, + "loss": 3.336, + "step": 296 + }, + { + "epoch": 0.44048943270300334, + "grad_norm": 8.78008116526295, + "learning_rate": 9.935852743436033e-06, + "loss": 3.4319, + "step": 297 + }, + { + "epoch": 0.44197256210604374, + "grad_norm": 8.462090681165849, + "learning_rate": 9.93446885326882e-06, + "loss": 3.5214, + "step": 298 + }, + { + "epoch": 0.44345569150908415, + "grad_norm": 7.946348663045593, + "learning_rate": 9.933070292656029e-06, + "loss": 3.3598, + "step": 299 + }, + { + "epoch": 0.44493882091212456, + "grad_norm": 8.523161284843894, + "learning_rate": 9.931657065755662e-06, + "loss": 3.5443, + "step": 300 + }, + { + "epoch": 0.446421950315165, + "grad_norm": 8.092378076141015, + "learning_rate": 9.93022917676932e-06, + "loss": 3.3873, + "step": 301 + }, + { + "epoch": 0.4479050797182054, + "grad_norm": 8.580183643419353, + "learning_rate": 9.928786629942191e-06, + "loss": 3.4485, + "step": 302 + }, + { + "epoch": 0.44938820912124583, + "grad_norm": 7.742189427134481, + "learning_rate": 9.92732942956305e-06, + "loss": 3.4644, + "step": 303 + }, + { + "epoch": 0.45087133852428624, + "grad_norm": 7.503056865444109, + "learning_rate": 9.925857579964233e-06, + "loss": 3.3942, + "step": 304 + }, + { + "epoch": 0.45235446792732664, + "grad_norm": 7.594247793557803, + "learning_rate": 9.924371085521627e-06, + "loss": 3.4672, + "step": 305 + }, + { + "epoch": 0.45383759733036705, + "grad_norm": 8.116919534438788, + "learning_rate": 9.922869950654662e-06, + "loss": 3.3969, + "step": 306 + }, + { + "epoch": 0.4553207267334075, + "grad_norm": 8.107871178579584, + "learning_rate": 9.921354179826293e-06, + "loss": 3.4902, + "step": 307 + }, + { + "epoch": 0.4568038561364479, + "grad_norm": 8.307744150866975, + "learning_rate": 9.919823777542992e-06, + "loss": 3.5707, + "step": 308 + }, + { + "epoch": 0.4582869855394883, + "grad_norm": 8.16455565165069, + "learning_rate": 9.918278748354728e-06, + "loss": 3.5387, + "step": 309 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 7.7623065473203186, + "learning_rate": 9.916719096854956e-06, + "loss": 3.4917, + "step": 310 + }, + { + "epoch": 0.46125324434556914, + "grad_norm": 8.413511895650384, + "learning_rate": 9.915144827680606e-06, + "loss": 3.3764, + "step": 311 + }, + { + "epoch": 0.46273637374860954, + "grad_norm": 7.278379261722878, + "learning_rate": 9.913555945512065e-06, + "loss": 3.343, + "step": 312 + }, + { + "epoch": 0.46421950315165, + "grad_norm": 7.805259251836406, + "learning_rate": 9.911952455073169e-06, + "loss": 3.3505, + "step": 313 + }, + { + "epoch": 0.4657026325546904, + "grad_norm": 8.21716288722386, + "learning_rate": 9.91033436113118e-06, + "loss": 3.4779, + "step": 314 + }, + { + "epoch": 0.4671857619577308, + "grad_norm": 9.364575921718144, + "learning_rate": 9.90870166849678e-06, + "loss": 3.3545, + "step": 315 + }, + { + "epoch": 0.4686688913607712, + "grad_norm": 8.030265519933037, + "learning_rate": 9.907054382024058e-06, + "loss": 3.4585, + "step": 316 + }, + { + "epoch": 0.47015202076381163, + "grad_norm": 8.429069269166096, + "learning_rate": 9.90539250661048e-06, + "loss": 3.4022, + "step": 317 + }, + { + "epoch": 0.47163515016685204, + "grad_norm": 7.925433810902394, + "learning_rate": 9.903716047196896e-06, + "loss": 3.4057, + "step": 318 + }, + { + "epoch": 0.4731182795698925, + "grad_norm": 8.617337078740144, + "learning_rate": 9.902025008767512e-06, + "loss": 3.4615, + "step": 319 + }, + { + "epoch": 0.4746014089729329, + "grad_norm": 8.842655664603502, + "learning_rate": 9.900319396349875e-06, + "loss": 3.6309, + "step": 320 + }, + { + "epoch": 0.4760845383759733, + "grad_norm": 8.531411132869124, + "learning_rate": 9.898599215014868e-06, + "loss": 3.3667, + "step": 321 + }, + { + "epoch": 0.4775676677790137, + "grad_norm": 8.7488098325281, + "learning_rate": 9.89686446987668e-06, + "loss": 3.5036, + "step": 322 + }, + { + "epoch": 0.4790507971820541, + "grad_norm": 7.640798041503711, + "learning_rate": 9.895115166092806e-06, + "loss": 3.4971, + "step": 323 + }, + { + "epoch": 0.48053392658509453, + "grad_norm": 7.8725580443232115, + "learning_rate": 9.89335130886402e-06, + "loss": 3.4655, + "step": 324 + }, + { + "epoch": 0.482017055988135, + "grad_norm": 8.567891685259253, + "learning_rate": 9.891572903434366e-06, + "loss": 3.4187, + "step": 325 + }, + { + "epoch": 0.4835001853911754, + "grad_norm": 7.969698292549675, + "learning_rate": 9.889779955091142e-06, + "loss": 3.5007, + "step": 326 + }, + { + "epoch": 0.4849833147942158, + "grad_norm": 8.382877401226757, + "learning_rate": 9.88797246916488e-06, + "loss": 3.3867, + "step": 327 + }, + { + "epoch": 0.4864664441972562, + "grad_norm": 7.931222866987373, + "learning_rate": 9.886150451029334e-06, + "loss": 3.5332, + "step": 328 + }, + { + "epoch": 0.4879495736002966, + "grad_norm": 8.538454313531911, + "learning_rate": 9.884313906101466e-06, + "loss": 3.3486, + "step": 329 + }, + { + "epoch": 0.489432703003337, + "grad_norm": 7.89025043769276, + "learning_rate": 9.88246283984142e-06, + "loss": 3.5396, + "step": 330 + }, + { + "epoch": 0.4909158324063775, + "grad_norm": 8.971310350037932, + "learning_rate": 9.880597257752522e-06, + "loss": 3.5245, + "step": 331 + }, + { + "epoch": 0.4923989618094179, + "grad_norm": 8.128345238202142, + "learning_rate": 9.878717165381249e-06, + "loss": 3.4772, + "step": 332 + }, + { + "epoch": 0.4938820912124583, + "grad_norm": 8.748256760119299, + "learning_rate": 9.876822568317215e-06, + "loss": 3.4015, + "step": 333 + }, + { + "epoch": 0.4953652206154987, + "grad_norm": 9.279799612542275, + "learning_rate": 9.874913472193161e-06, + "loss": 3.3677, + "step": 334 + }, + { + "epoch": 0.4968483500185391, + "grad_norm": 7.883997439949753, + "learning_rate": 9.872989882684938e-06, + "loss": 3.434, + "step": 335 + }, + { + "epoch": 0.4983314794215795, + "grad_norm": 7.793306037449773, + "learning_rate": 9.87105180551148e-06, + "loss": 3.4662, + "step": 336 + }, + { + "epoch": 0.4998146088246199, + "grad_norm": 8.054489365487536, + "learning_rate": 9.869099246434797e-06, + "loss": 3.4571, + "step": 337 + }, + { + "epoch": 0.5012977382276603, + "grad_norm": 7.451703425557449, + "learning_rate": 9.867132211259951e-06, + "loss": 3.2381, + "step": 338 + }, + { + "epoch": 0.5027808676307007, + "grad_norm": 7.757238768619054, + "learning_rate": 9.865150705835049e-06, + "loss": 3.4535, + "step": 339 + }, + { + "epoch": 0.5042639970337411, + "grad_norm": 8.853950069434052, + "learning_rate": 9.863154736051214e-06, + "loss": 3.498, + "step": 340 + }, + { + "epoch": 0.5057471264367817, + "grad_norm": 8.653161722961306, + "learning_rate": 9.861144307842574e-06, + "loss": 3.4587, + "step": 341 + }, + { + "epoch": 0.5072302558398221, + "grad_norm": 9.482791385270335, + "learning_rate": 9.85911942718624e-06, + "loss": 3.254, + "step": 342 + }, + { + "epoch": 0.5087133852428625, + "grad_norm": 8.268373424990012, + "learning_rate": 9.857080100102292e-06, + "loss": 3.4351, + "step": 343 + }, + { + "epoch": 0.5101965146459029, + "grad_norm": 8.335268420162597, + "learning_rate": 9.855026332653763e-06, + "loss": 3.4396, + "step": 344 + }, + { + "epoch": 0.5116796440489433, + "grad_norm": 7.601897566290932, + "learning_rate": 9.852958130946615e-06, + "loss": 3.4393, + "step": 345 + }, + { + "epoch": 0.5131627734519837, + "grad_norm": 8.2171208028703, + "learning_rate": 9.850875501129726e-06, + "loss": 3.302, + "step": 346 + }, + { + "epoch": 0.5146459028550241, + "grad_norm": 8.917275105068944, + "learning_rate": 9.848778449394866e-06, + "loss": 3.4127, + "step": 347 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 8.246533644429597, + "learning_rate": 9.846666981976685e-06, + "loss": 3.3456, + "step": 348 + }, + { + "epoch": 0.5176121616611049, + "grad_norm": 8.042849767122414, + "learning_rate": 9.844541105152694e-06, + "loss": 3.4648, + "step": 349 + }, + { + "epoch": 0.5190952910641453, + "grad_norm": 7.729143290047147, + "learning_rate": 9.842400825243237e-06, + "loss": 3.5788, + "step": 350 + }, + { + "epoch": 0.5205784204671857, + "grad_norm": 8.466392468509476, + "learning_rate": 9.840246148611485e-06, + "loss": 3.5578, + "step": 351 + }, + { + "epoch": 0.5220615498702261, + "grad_norm": 7.1238391183099345, + "learning_rate": 9.83807708166341e-06, + "loss": 3.4015, + "step": 352 + }, + { + "epoch": 0.5235446792732666, + "grad_norm": 8.782199070597844, + "learning_rate": 9.835893630847767e-06, + "loss": 3.491, + "step": 353 + }, + { + "epoch": 0.525027808676307, + "grad_norm": 7.654178028596125, + "learning_rate": 9.83369580265607e-06, + "loss": 3.2503, + "step": 354 + }, + { + "epoch": 0.5265109380793475, + "grad_norm": 7.838652671372468, + "learning_rate": 9.831483603622588e-06, + "loss": 3.3802, + "step": 355 + }, + { + "epoch": 0.5279940674823879, + "grad_norm": 9.229506459874063, + "learning_rate": 9.829257040324308e-06, + "loss": 3.3922, + "step": 356 + }, + { + "epoch": 0.5294771968854283, + "grad_norm": 7.860590093133137, + "learning_rate": 9.827016119380922e-06, + "loss": 3.4598, + "step": 357 + }, + { + "epoch": 0.5309603262884687, + "grad_norm": 7.585198941272079, + "learning_rate": 9.82476084745481e-06, + "loss": 3.3837, + "step": 358 + }, + { + "epoch": 0.5324434556915091, + "grad_norm": 7.724006507248464, + "learning_rate": 9.822491231251025e-06, + "loss": 3.4061, + "step": 359 + }, + { + "epoch": 0.5339265850945495, + "grad_norm": 8.149725291795075, + "learning_rate": 9.820207277517254e-06, + "loss": 3.5287, + "step": 360 + }, + { + "epoch": 0.5354097144975899, + "grad_norm": 7.244546215016548, + "learning_rate": 9.817908993043819e-06, + "loss": 3.2369, + "step": 361 + }, + { + "epoch": 0.5368928439006303, + "grad_norm": 7.366187764856813, + "learning_rate": 9.815596384663642e-06, + "loss": 3.4699, + "step": 362 + }, + { + "epoch": 0.5383759733036707, + "grad_norm": 7.884255932790885, + "learning_rate": 9.81326945925224e-06, + "loss": 3.5758, + "step": 363 + }, + { + "epoch": 0.5398591027067111, + "grad_norm": 7.086392044025169, + "learning_rate": 9.810928223727683e-06, + "loss": 3.3003, + "step": 364 + }, + { + "epoch": 0.5413422321097516, + "grad_norm": 8.052992606776115, + "learning_rate": 9.808572685050596e-06, + "loss": 3.5449, + "step": 365 + }, + { + "epoch": 0.542825361512792, + "grad_norm": 8.300683465753666, + "learning_rate": 9.806202850224123e-06, + "loss": 3.3646, + "step": 366 + }, + { + "epoch": 0.5443084909158324, + "grad_norm": 9.02296311484335, + "learning_rate": 9.803818726293915e-06, + "loss": 3.6639, + "step": 367 + }, + { + "epoch": 0.5457916203188728, + "grad_norm": 8.535786729401627, + "learning_rate": 9.801420320348097e-06, + "loss": 3.5143, + "step": 368 + }, + { + "epoch": 0.5472747497219133, + "grad_norm": 8.510555573916749, + "learning_rate": 9.799007639517266e-06, + "loss": 3.5329, + "step": 369 + }, + { + "epoch": 0.5487578791249537, + "grad_norm": 8.035855835972747, + "learning_rate": 9.796580690974453e-06, + "loss": 3.3323, + "step": 370 + }, + { + "epoch": 0.5502410085279941, + "grad_norm": 8.23933174320661, + "learning_rate": 9.794139481935108e-06, + "loss": 3.5739, + "step": 371 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 7.735133201704136, + "learning_rate": 9.79168401965708e-06, + "loss": 3.3031, + "step": 372 + }, + { + "epoch": 0.5532072673340749, + "grad_norm": 8.03078038491643, + "learning_rate": 9.789214311440589e-06, + "loss": 3.377, + "step": 373 + }, + { + "epoch": 0.5546903967371153, + "grad_norm": 7.982851635511034, + "learning_rate": 9.786730364628217e-06, + "loss": 3.4887, + "step": 374 + }, + { + "epoch": 0.5561735261401557, + "grad_norm": 8.980939241183837, + "learning_rate": 9.784232186604871e-06, + "loss": 3.3964, + "step": 375 + }, + { + "epoch": 0.5576566555431961, + "grad_norm": 7.955482292445725, + "learning_rate": 9.781719784797773e-06, + "loss": 3.5805, + "step": 376 + }, + { + "epoch": 0.5591397849462365, + "grad_norm": 8.416446455198674, + "learning_rate": 9.779193166676426e-06, + "loss": 3.3957, + "step": 377 + }, + { + "epoch": 0.560622914349277, + "grad_norm": 8.02303446931419, + "learning_rate": 9.776652339752609e-06, + "loss": 3.2827, + "step": 378 + }, + { + "epoch": 0.5621060437523174, + "grad_norm": 7.872617789837837, + "learning_rate": 9.774097311580338e-06, + "loss": 3.4152, + "step": 379 + }, + { + "epoch": 0.5635891731553578, + "grad_norm": 7.635923540522209, + "learning_rate": 9.771528089755849e-06, + "loss": 3.3771, + "step": 380 + }, + { + "epoch": 0.5650723025583982, + "grad_norm": 7.833614977872596, + "learning_rate": 9.768944681917582e-06, + "loss": 3.4162, + "step": 381 + }, + { + "epoch": 0.5665554319614386, + "grad_norm": 7.821359192768638, + "learning_rate": 9.766347095746146e-06, + "loss": 3.5016, + "step": 382 + }, + { + "epoch": 0.568038561364479, + "grad_norm": 7.720159513145474, + "learning_rate": 9.763735338964313e-06, + "loss": 3.486, + "step": 383 + }, + { + "epoch": 0.5695216907675195, + "grad_norm": 7.526023546198657, + "learning_rate": 9.761109419336973e-06, + "loss": 3.4449, + "step": 384 + }, + { + "epoch": 0.5710048201705599, + "grad_norm": 7.748548985994988, + "learning_rate": 9.758469344671128e-06, + "loss": 3.3679, + "step": 385 + }, + { + "epoch": 0.5724879495736003, + "grad_norm": 8.103748459848648, + "learning_rate": 9.755815122815871e-06, + "loss": 3.5199, + "step": 386 + }, + { + "epoch": 0.5739710789766407, + "grad_norm": 8.078248388552929, + "learning_rate": 9.75314676166234e-06, + "loss": 3.6169, + "step": 387 + }, + { + "epoch": 0.5754542083796811, + "grad_norm": 7.806177888685872, + "learning_rate": 9.750464269143725e-06, + "loss": 3.4853, + "step": 388 + }, + { + "epoch": 0.5769373377827215, + "grad_norm": 8.623044228628247, + "learning_rate": 9.747767653235219e-06, + "loss": 3.5647, + "step": 389 + }, + { + "epoch": 0.578420467185762, + "grad_norm": 7.697831518271142, + "learning_rate": 9.74505692195401e-06, + "loss": 3.4643, + "step": 390 + }, + { + "epoch": 0.5799035965888024, + "grad_norm": 8.461320239485332, + "learning_rate": 9.742332083359252e-06, + "loss": 3.452, + "step": 391 + }, + { + "epoch": 0.5813867259918428, + "grad_norm": 7.824601476640886, + "learning_rate": 9.739593145552036e-06, + "loss": 3.4227, + "step": 392 + }, + { + "epoch": 0.5828698553948832, + "grad_norm": 7.118984418254054, + "learning_rate": 9.736840116675374e-06, + "loss": 3.3056, + "step": 393 + }, + { + "epoch": 0.5843529847979236, + "grad_norm": 7.80363933622348, + "learning_rate": 9.734073004914173e-06, + "loss": 3.3471, + "step": 394 + }, + { + "epoch": 0.585836114200964, + "grad_norm": 8.186938360791698, + "learning_rate": 9.731291818495208e-06, + "loss": 3.453, + "step": 395 + }, + { + "epoch": 0.5873192436040044, + "grad_norm": 8.082451271153689, + "learning_rate": 9.728496565687096e-06, + "loss": 3.5296, + "step": 396 + }, + { + "epoch": 0.5888023730070449, + "grad_norm": 8.125133651529662, + "learning_rate": 9.725687254800277e-06, + "loss": 3.5807, + "step": 397 + }, + { + "epoch": 0.5902855024100853, + "grad_norm": 8.033973887545017, + "learning_rate": 9.722863894186989e-06, + "loss": 3.3939, + "step": 398 + }, + { + "epoch": 0.5917686318131257, + "grad_norm": 7.426724930951009, + "learning_rate": 9.720026492241232e-06, + "loss": 3.4416, + "step": 399 + }, + { + "epoch": 0.5932517612161661, + "grad_norm": 7.633539724910027, + "learning_rate": 9.71717505739876e-06, + "loss": 3.5481, + "step": 400 + }, + { + "epoch": 0.5947348906192065, + "grad_norm": 7.1620428058339005, + "learning_rate": 9.714309598137045e-06, + "loss": 3.4817, + "step": 401 + }, + { + "epoch": 0.5962180200222469, + "grad_norm": 8.520558570245985, + "learning_rate": 9.711430122975252e-06, + "loss": 3.5499, + "step": 402 + }, + { + "epoch": 0.5977011494252874, + "grad_norm": 7.912960405513235, + "learning_rate": 9.708536640474221e-06, + "loss": 3.5047, + "step": 403 + }, + { + "epoch": 0.5991842788283278, + "grad_norm": 7.587769793245008, + "learning_rate": 9.705629159236431e-06, + "loss": 3.453, + "step": 404 + }, + { + "epoch": 0.6006674082313682, + "grad_norm": 7.414612852829815, + "learning_rate": 9.702707687905984e-06, + "loss": 3.324, + "step": 405 + }, + { + "epoch": 0.6021505376344086, + "grad_norm": 8.717774234309388, + "learning_rate": 9.699772235168572e-06, + "loss": 3.3207, + "step": 406 + }, + { + "epoch": 0.603633667037449, + "grad_norm": 7.5620873872441114, + "learning_rate": 9.69682280975146e-06, + "loss": 3.1616, + "step": 407 + }, + { + "epoch": 0.6051167964404894, + "grad_norm": 7.8012046079077, + "learning_rate": 9.693859420423448e-06, + "loss": 3.3233, + "step": 408 + }, + { + "epoch": 0.6065999258435298, + "grad_norm": 8.244673472396116, + "learning_rate": 9.690882075994856e-06, + "loss": 3.3156, + "step": 409 + }, + { + "epoch": 0.6080830552465702, + "grad_norm": 7.601435502724566, + "learning_rate": 9.687890785317488e-06, + "loss": 3.5038, + "step": 410 + }, + { + "epoch": 0.6095661846496107, + "grad_norm": 8.10906921365731, + "learning_rate": 9.68488555728462e-06, + "loss": 3.3569, + "step": 411 + }, + { + "epoch": 0.6110493140526511, + "grad_norm": 7.4122599193857654, + "learning_rate": 9.681866400830958e-06, + "loss": 3.6072, + "step": 412 + }, + { + "epoch": 0.6125324434556915, + "grad_norm": 7.751627245095446, + "learning_rate": 9.678833324932614e-06, + "loss": 3.4231, + "step": 413 + }, + { + "epoch": 0.6140155728587319, + "grad_norm": 8.022090980079312, + "learning_rate": 9.675786338607096e-06, + "loss": 3.5337, + "step": 414 + }, + { + "epoch": 0.6154987022617724, + "grad_norm": 7.610989944940764, + "learning_rate": 9.672725450913256e-06, + "loss": 3.3264, + "step": 415 + }, + { + "epoch": 0.6169818316648128, + "grad_norm": 7.50329982026846, + "learning_rate": 9.669650670951282e-06, + "loss": 3.305, + "step": 416 + }, + { + "epoch": 0.6184649610678532, + "grad_norm": 7.024368496862084, + "learning_rate": 9.666562007862662e-06, + "loss": 3.4698, + "step": 417 + }, + { + "epoch": 0.6199480904708936, + "grad_norm": 7.085815923852789, + "learning_rate": 9.663459470830161e-06, + "loss": 3.4796, + "step": 418 + }, + { + "epoch": 0.621431219873934, + "grad_norm": 7.962508010888954, + "learning_rate": 9.66034306907779e-06, + "loss": 3.4966, + "step": 419 + }, + { + "epoch": 0.6229143492769744, + "grad_norm": 7.032250790931582, + "learning_rate": 9.657212811870783e-06, + "loss": 3.4255, + "step": 420 + }, + { + "epoch": 0.6243974786800148, + "grad_norm": 8.677172335134745, + "learning_rate": 9.654068708515564e-06, + "loss": 3.5318, + "step": 421 + }, + { + "epoch": 0.6258806080830552, + "grad_norm": 7.3482974381082276, + "learning_rate": 9.650910768359728e-06, + "loss": 3.4748, + "step": 422 + }, + { + "epoch": 0.6273637374860956, + "grad_norm": 7.641610903316742, + "learning_rate": 9.647739000791999e-06, + "loss": 3.4653, + "step": 423 + }, + { + "epoch": 0.628846866889136, + "grad_norm": 7.968215250049587, + "learning_rate": 9.644553415242218e-06, + "loss": 3.3936, + "step": 424 + }, + { + "epoch": 0.6303299962921765, + "grad_norm": 7.545486542586793, + "learning_rate": 9.641354021181304e-06, + "loss": 3.3646, + "step": 425 + }, + { + "epoch": 0.6318131256952169, + "grad_norm": 9.018481188250304, + "learning_rate": 9.638140828121232e-06, + "loss": 3.5188, + "step": 426 + }, + { + "epoch": 0.6332962550982574, + "grad_norm": 8.017759863311062, + "learning_rate": 9.634913845614998e-06, + "loss": 3.3066, + "step": 427 + }, + { + "epoch": 0.6347793845012978, + "grad_norm": 8.122565520987592, + "learning_rate": 9.631673083256599e-06, + "loss": 3.3927, + "step": 428 + }, + { + "epoch": 0.6362625139043382, + "grad_norm": 7.642294354760838, + "learning_rate": 9.628418550680996e-06, + "loss": 3.3247, + "step": 429 + }, + { + "epoch": 0.6377456433073786, + "grad_norm": 8.530502745328448, + "learning_rate": 9.625150257564097e-06, + "loss": 3.4227, + "step": 430 + }, + { + "epoch": 0.639228772710419, + "grad_norm": 7.522634802295953, + "learning_rate": 9.621868213622713e-06, + "loss": 3.4275, + "step": 431 + }, + { + "epoch": 0.6407119021134594, + "grad_norm": 8.266029831846513, + "learning_rate": 9.618572428614541e-06, + "loss": 3.4749, + "step": 432 + }, + { + "epoch": 0.6421950315164998, + "grad_norm": 7.496981361452346, + "learning_rate": 9.615262912338127e-06, + "loss": 3.4421, + "step": 433 + }, + { + "epoch": 0.6436781609195402, + "grad_norm": 7.816342123559298, + "learning_rate": 9.611939674632849e-06, + "loss": 3.3843, + "step": 434 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 7.5579177196495095, + "learning_rate": 9.60860272537887e-06, + "loss": 3.4877, + "step": 435 + }, + { + "epoch": 0.646644419725621, + "grad_norm": 7.319283058709796, + "learning_rate": 9.605252074497125e-06, + "loss": 3.4351, + "step": 436 + }, + { + "epoch": 0.6481275491286614, + "grad_norm": 7.931941185194092, + "learning_rate": 9.601887731949279e-06, + "loss": 3.389, + "step": 437 + }, + { + "epoch": 0.6496106785317018, + "grad_norm": 8.285109342231518, + "learning_rate": 9.598509707737711e-06, + "loss": 3.3513, + "step": 438 + }, + { + "epoch": 0.6510938079347423, + "grad_norm": 7.9469366558473356, + "learning_rate": 9.595118011905466e-06, + "loss": 3.5895, + "step": 439 + }, + { + "epoch": 0.6525769373377828, + "grad_norm": 7.189701722970986, + "learning_rate": 9.591712654536248e-06, + "loss": 3.3365, + "step": 440 + }, + { + "epoch": 0.6540600667408232, + "grad_norm": 7.82688208675346, + "learning_rate": 9.588293645754363e-06, + "loss": 3.5579, + "step": 441 + }, + { + "epoch": 0.6555431961438636, + "grad_norm": 8.190857610191657, + "learning_rate": 9.584860995724717e-06, + "loss": 3.5312, + "step": 442 + }, + { + "epoch": 0.657026325546904, + "grad_norm": 7.488634289187936, + "learning_rate": 9.581414714652763e-06, + "loss": 3.4418, + "step": 443 + }, + { + "epoch": 0.6585094549499444, + "grad_norm": 8.927575502138588, + "learning_rate": 9.577954812784484e-06, + "loss": 3.5487, + "step": 444 + }, + { + "epoch": 0.6599925843529848, + "grad_norm": 7.593482495218223, + "learning_rate": 9.574481300406356e-06, + "loss": 3.6178, + "step": 445 + }, + { + "epoch": 0.6614757137560252, + "grad_norm": 8.405319231154309, + "learning_rate": 9.570994187845323e-06, + "loss": 3.6035, + "step": 446 + }, + { + "epoch": 0.6629588431590656, + "grad_norm": 7.792655492753614, + "learning_rate": 9.56749348546876e-06, + "loss": 3.4876, + "step": 447 + }, + { + "epoch": 0.664441972562106, + "grad_norm": 7.48658146761914, + "learning_rate": 9.563979203684449e-06, + "loss": 3.3199, + "step": 448 + }, + { + "epoch": 0.6659251019651464, + "grad_norm": 7.764063495922079, + "learning_rate": 9.560451352940537e-06, + "loss": 3.5107, + "step": 449 + }, + { + "epoch": 0.6674082313681868, + "grad_norm": 7.014706745541734, + "learning_rate": 9.55690994372552e-06, + "loss": 3.4152, + "step": 450 + }, + { + "epoch": 0.6688913607712272, + "grad_norm": 8.222785248678518, + "learning_rate": 9.553354986568201e-06, + "loss": 3.5016, + "step": 451 + }, + { + "epoch": 0.6703744901742678, + "grad_norm": 7.108771836664713, + "learning_rate": 9.549786492037663e-06, + "loss": 3.1671, + "step": 452 + }, + { + "epoch": 0.6718576195773082, + "grad_norm": 7.793990299159232, + "learning_rate": 9.546204470743234e-06, + "loss": 3.3256, + "step": 453 + }, + { + "epoch": 0.6733407489803486, + "grad_norm": 7.8473920850404015, + "learning_rate": 9.542608933334459e-06, + "loss": 3.4585, + "step": 454 + }, + { + "epoch": 0.674823878383389, + "grad_norm": 8.3737582727186, + "learning_rate": 9.538999890501066e-06, + "loss": 3.5876, + "step": 455 + }, + { + "epoch": 0.6763070077864294, + "grad_norm": 8.526091292359727, + "learning_rate": 9.53537735297294e-06, + "loss": 3.5021, + "step": 456 + }, + { + "epoch": 0.6777901371894698, + "grad_norm": 8.072265424301845, + "learning_rate": 9.531741331520079e-06, + "loss": 3.3035, + "step": 457 + }, + { + "epoch": 0.6792732665925102, + "grad_norm": 7.399541330794975, + "learning_rate": 9.528091836952576e-06, + "loss": 3.3289, + "step": 458 + }, + { + "epoch": 0.6807563959955506, + "grad_norm": 7.52988521917795, + "learning_rate": 9.524428880120578e-06, + "loss": 3.3969, + "step": 459 + }, + { + "epoch": 0.682239525398591, + "grad_norm": 7.13194681381952, + "learning_rate": 9.520752471914255e-06, + "loss": 3.2851, + "step": 460 + }, + { + "epoch": 0.6837226548016314, + "grad_norm": 7.426669799215283, + "learning_rate": 9.517062623263768e-06, + "loss": 3.2956, + "step": 461 + }, + { + "epoch": 0.6852057842046718, + "grad_norm": 7.34757955830357, + "learning_rate": 9.513359345139242e-06, + "loss": 3.1261, + "step": 462 + }, + { + "epoch": 0.6866889136077122, + "grad_norm": 7.34375021223431, + "learning_rate": 9.509642648550723e-06, + "loss": 3.3791, + "step": 463 + }, + { + "epoch": 0.6881720430107527, + "grad_norm": 8.610575280374086, + "learning_rate": 9.505912544548155e-06, + "loss": 3.4845, + "step": 464 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 7.812634363013557, + "learning_rate": 9.502169044221338e-06, + "loss": 3.509, + "step": 465 + }, + { + "epoch": 0.6911383018168336, + "grad_norm": 6.875976729140504, + "learning_rate": 9.498412158699905e-06, + "loss": 3.2266, + "step": 466 + }, + { + "epoch": 0.692621431219874, + "grad_norm": 7.475574840639962, + "learning_rate": 9.494641899153283e-06, + "loss": 3.4525, + "step": 467 + }, + { + "epoch": 0.6941045606229144, + "grad_norm": 8.37850982202805, + "learning_rate": 9.490858276790658e-06, + "loss": 3.5523, + "step": 468 + }, + { + "epoch": 0.6955876900259548, + "grad_norm": 7.2765660941933525, + "learning_rate": 9.487061302860944e-06, + "loss": 3.478, + "step": 469 + }, + { + "epoch": 0.6970708194289952, + "grad_norm": 8.117490416515825, + "learning_rate": 9.483250988652757e-06, + "loss": 3.3949, + "step": 470 + }, + { + "epoch": 0.6985539488320356, + "grad_norm": 7.704021217395086, + "learning_rate": 9.479427345494366e-06, + "loss": 3.4251, + "step": 471 + }, + { + "epoch": 0.700037078235076, + "grad_norm": 7.090887181664769, + "learning_rate": 9.475590384753672e-06, + "loss": 3.4077, + "step": 472 + }, + { + "epoch": 0.7015202076381164, + "grad_norm": 7.346387845964113, + "learning_rate": 9.471740117838169e-06, + "loss": 3.1678, + "step": 473 + }, + { + "epoch": 0.7030033370411568, + "grad_norm": 7.709521608956179, + "learning_rate": 9.467876556194912e-06, + "loss": 3.4311, + "step": 474 + }, + { + "epoch": 0.7044864664441972, + "grad_norm": 7.44489900114914, + "learning_rate": 9.463999711310478e-06, + "loss": 3.3935, + "step": 475 + }, + { + "epoch": 0.7059695958472376, + "grad_norm": 8.127732643874536, + "learning_rate": 9.460109594710942e-06, + "loss": 3.4098, + "step": 476 + }, + { + "epoch": 0.7074527252502781, + "grad_norm": 7.234407759707704, + "learning_rate": 9.45620621796183e-06, + "loss": 3.455, + "step": 477 + }, + { + "epoch": 0.7089358546533185, + "grad_norm": 8.166241256464549, + "learning_rate": 9.452289592668099e-06, + "loss": 3.4385, + "step": 478 + }, + { + "epoch": 0.710418984056359, + "grad_norm": 7.401631004006855, + "learning_rate": 9.448359730474084e-06, + "loss": 3.6178, + "step": 479 + }, + { + "epoch": 0.7119021134593994, + "grad_norm": 7.594799495243321, + "learning_rate": 9.444416643063486e-06, + "loss": 3.5021, + "step": 480 + }, + { + "epoch": 0.7133852428624398, + "grad_norm": 6.952437198395618, + "learning_rate": 9.440460342159314e-06, + "loss": 3.4061, + "step": 481 + }, + { + "epoch": 0.7148683722654802, + "grad_norm": 7.743779388061549, + "learning_rate": 9.436490839523871e-06, + "loss": 3.4395, + "step": 482 + }, + { + "epoch": 0.7163515016685206, + "grad_norm": 7.23295847028892, + "learning_rate": 9.432508146958704e-06, + "loss": 3.1892, + "step": 483 + }, + { + "epoch": 0.717834631071561, + "grad_norm": 7.848076148641567, + "learning_rate": 9.428512276304574e-06, + "loss": 3.4579, + "step": 484 + }, + { + "epoch": 0.7193177604746014, + "grad_norm": 7.702440914459754, + "learning_rate": 9.424503239441424e-06, + "loss": 3.4289, + "step": 485 + }, + { + "epoch": 0.7208008898776418, + "grad_norm": 8.41259997885203, + "learning_rate": 9.42048104828834e-06, + "loss": 3.2975, + "step": 486 + }, + { + "epoch": 0.7222840192806822, + "grad_norm": 8.060720773230296, + "learning_rate": 9.41644571480352e-06, + "loss": 3.4615, + "step": 487 + }, + { + "epoch": 0.7237671486837226, + "grad_norm": 6.829490321332256, + "learning_rate": 9.412397250984222e-06, + "loss": 3.3812, + "step": 488 + }, + { + "epoch": 0.7252502780867631, + "grad_norm": 7.961936689136887, + "learning_rate": 9.408335668866757e-06, + "loss": 3.4636, + "step": 489 + }, + { + "epoch": 0.7267334074898035, + "grad_norm": 7.742538359482097, + "learning_rate": 9.40426098052643e-06, + "loss": 3.3811, + "step": 490 + }, + { + "epoch": 0.7282165368928439, + "grad_norm": 7.3122320154726, + "learning_rate": 9.40017319807751e-06, + "loss": 3.4401, + "step": 491 + }, + { + "epoch": 0.7296996662958843, + "grad_norm": 8.332695978454813, + "learning_rate": 9.396072333673196e-06, + "loss": 3.557, + "step": 492 + }, + { + "epoch": 0.7311827956989247, + "grad_norm": 7.1634312436712975, + "learning_rate": 9.391958399505588e-06, + "loss": 3.3755, + "step": 493 + }, + { + "epoch": 0.7326659251019652, + "grad_norm": 7.596114764641967, + "learning_rate": 9.387831407805632e-06, + "loss": 3.429, + "step": 494 + }, + { + "epoch": 0.7341490545050056, + "grad_norm": 7.861775677754605, + "learning_rate": 9.383691370843101e-06, + "loss": 3.419, + "step": 495 + }, + { + "epoch": 0.735632183908046, + "grad_norm": 7.293987873216931, + "learning_rate": 9.379538300926553e-06, + "loss": 3.4604, + "step": 496 + }, + { + "epoch": 0.7371153133110864, + "grad_norm": 8.059803671755619, + "learning_rate": 9.37537221040329e-06, + "loss": 3.5168, + "step": 497 + }, + { + "epoch": 0.7385984427141268, + "grad_norm": 7.658861574344317, + "learning_rate": 9.371193111659327e-06, + "loss": 3.4312, + "step": 498 + }, + { + "epoch": 0.7400815721171672, + "grad_norm": 7.921072440849309, + "learning_rate": 9.367001017119352e-06, + "loss": 3.4259, + "step": 499 + }, + { + "epoch": 0.7415647015202076, + "grad_norm": 8.077512844902563, + "learning_rate": 9.362795939246694e-06, + "loss": 3.546, + "step": 500 + }, + { + "epoch": 0.743047830923248, + "grad_norm": 7.849622029497966, + "learning_rate": 9.358577890543277e-06, + "loss": 3.4623, + "step": 501 + }, + { + "epoch": 0.7445309603262885, + "grad_norm": 8.211853093413296, + "learning_rate": 9.354346883549592e-06, + "loss": 3.6037, + "step": 502 + }, + { + "epoch": 0.7460140897293289, + "grad_norm": 7.26799840908144, + "learning_rate": 9.350102930844653e-06, + "loss": 3.4864, + "step": 503 + }, + { + "epoch": 0.7474972191323693, + "grad_norm": 7.500429827092913, + "learning_rate": 9.345846045045963e-06, + "loss": 3.3737, + "step": 504 + }, + { + "epoch": 0.7489803485354097, + "grad_norm": 7.530428369378243, + "learning_rate": 9.341576238809477e-06, + "loss": 3.4671, + "step": 505 + }, + { + "epoch": 0.7504634779384501, + "grad_norm": 7.255784682711592, + "learning_rate": 9.33729352482956e-06, + "loss": 3.3995, + "step": 506 + }, + { + "epoch": 0.7519466073414905, + "grad_norm": 8.19328521436644, + "learning_rate": 9.332997915838959e-06, + "loss": 3.4625, + "step": 507 + }, + { + "epoch": 0.753429736744531, + "grad_norm": 8.08362034101474, + "learning_rate": 9.32868942460875e-06, + "loss": 3.5005, + "step": 508 + }, + { + "epoch": 0.7549128661475714, + "grad_norm": 7.580033883604762, + "learning_rate": 9.324368063948315e-06, + "loss": 3.5405, + "step": 509 + }, + { + "epoch": 0.7563959955506118, + "grad_norm": 7.207399190278826, + "learning_rate": 9.320033846705295e-06, + "loss": 3.3603, + "step": 510 + }, + { + "epoch": 0.7578791249536522, + "grad_norm": 6.856439140313025, + "learning_rate": 9.315686785765556e-06, + "loss": 3.373, + "step": 511 + }, + { + "epoch": 0.7593622543566926, + "grad_norm": 8.678995927202713, + "learning_rate": 9.311326894053145e-06, + "loss": 3.4669, + "step": 512 + }, + { + "epoch": 0.760845383759733, + "grad_norm": 6.843832591885439, + "learning_rate": 9.306954184530261e-06, + "loss": 3.343, + "step": 513 + }, + { + "epoch": 0.7623285131627735, + "grad_norm": 7.6315921411319545, + "learning_rate": 9.302568670197207e-06, + "loss": 3.5155, + "step": 514 + }, + { + "epoch": 0.7638116425658139, + "grad_norm": 7.4239894884872015, + "learning_rate": 9.298170364092357e-06, + "loss": 3.2713, + "step": 515 + }, + { + "epoch": 0.7652947719688543, + "grad_norm": 7.515557058382063, + "learning_rate": 9.293759279292116e-06, + "loss": 3.5949, + "step": 516 + }, + { + "epoch": 0.7667779013718947, + "grad_norm": 6.955550759021473, + "learning_rate": 9.28933542891088e-06, + "loss": 3.3512, + "step": 517 + }, + { + "epoch": 0.7682610307749351, + "grad_norm": 8.137619108415128, + "learning_rate": 9.284898826100997e-06, + "loss": 3.4329, + "step": 518 + }, + { + "epoch": 0.7697441601779755, + "grad_norm": 7.664856125754757, + "learning_rate": 9.28044948405273e-06, + "loss": 3.4797, + "step": 519 + }, + { + "epoch": 0.7712272895810159, + "grad_norm": 7.1171782835341935, + "learning_rate": 9.275987415994215e-06, + "loss": 3.3476, + "step": 520 + }, + { + "epoch": 0.7727104189840563, + "grad_norm": 8.561046571659602, + "learning_rate": 9.271512635191427e-06, + "loss": 3.5725, + "step": 521 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 7.073280414347701, + "learning_rate": 9.267025154948133e-06, + "loss": 3.4487, + "step": 522 + }, + { + "epoch": 0.7756766777901372, + "grad_norm": 7.827590620156908, + "learning_rate": 9.262524988605855e-06, + "loss": 3.5669, + "step": 523 + }, + { + "epoch": 0.7771598071931776, + "grad_norm": 7.264716397928104, + "learning_rate": 9.258012149543836e-06, + "loss": 3.5454, + "step": 524 + }, + { + "epoch": 0.778642936596218, + "grad_norm": 7.893158072842291, + "learning_rate": 9.253486651178991e-06, + "loss": 3.4692, + "step": 525 + }, + { + "epoch": 0.7801260659992585, + "grad_norm": 8.109000559959037, + "learning_rate": 9.248948506965877e-06, + "loss": 3.4791, + "step": 526 + }, + { + "epoch": 0.7816091954022989, + "grad_norm": 7.540608521761882, + "learning_rate": 9.244397730396642e-06, + "loss": 3.6715, + "step": 527 + }, + { + "epoch": 0.7830923248053393, + "grad_norm": 7.008471671082116, + "learning_rate": 9.239834335000992e-06, + "loss": 3.3152, + "step": 528 + }, + { + "epoch": 0.7845754542083797, + "grad_norm": 7.475770142505553, + "learning_rate": 9.235258334346155e-06, + "loss": 3.1365, + "step": 529 + }, + { + "epoch": 0.7860585836114201, + "grad_norm": 7.579215801832432, + "learning_rate": 9.23066974203683e-06, + "loss": 3.4094, + "step": 530 + }, + { + "epoch": 0.7875417130144605, + "grad_norm": 8.218851344638656, + "learning_rate": 9.22606857171515e-06, + "loss": 3.4338, + "step": 531 + }, + { + "epoch": 0.7890248424175009, + "grad_norm": 7.862441669035907, + "learning_rate": 9.221454837060648e-06, + "loss": 3.4706, + "step": 532 + }, + { + "epoch": 0.7905079718205413, + "grad_norm": 7.288249112727001, + "learning_rate": 9.216828551790211e-06, + "loss": 3.2972, + "step": 533 + }, + { + "epoch": 0.7919911012235817, + "grad_norm": 8.255540967774133, + "learning_rate": 9.212189729658033e-06, + "loss": 3.5199, + "step": 534 + }, + { + "epoch": 0.7934742306266221, + "grad_norm": 6.957581639889089, + "learning_rate": 9.20753838445559e-06, + "loss": 3.4162, + "step": 535 + }, + { + "epoch": 0.7949573600296626, + "grad_norm": 7.273430950497293, + "learning_rate": 9.202874530011583e-06, + "loss": 3.3563, + "step": 536 + }, + { + "epoch": 0.796440489432703, + "grad_norm": 7.227695158136255, + "learning_rate": 9.19819818019191e-06, + "loss": 3.2721, + "step": 537 + }, + { + "epoch": 0.7979236188357434, + "grad_norm": 7.474884494425934, + "learning_rate": 9.193509348899609e-06, + "loss": 3.2949, + "step": 538 + }, + { + "epoch": 0.7994067482387839, + "grad_norm": 7.688193392188794, + "learning_rate": 9.188808050074836e-06, + "loss": 3.3499, + "step": 539 + }, + { + "epoch": 0.8008898776418243, + "grad_norm": 7.653133711303279, + "learning_rate": 9.184094297694807e-06, + "loss": 3.5268, + "step": 540 + }, + { + "epoch": 0.8023730070448647, + "grad_norm": 7.741762819627478, + "learning_rate": 9.179368105773768e-06, + "loss": 3.4841, + "step": 541 + }, + { + "epoch": 0.8038561364479051, + "grad_norm": 6.9180755563393515, + "learning_rate": 9.174629488362942e-06, + "loss": 3.3396, + "step": 542 + }, + { + "epoch": 0.8053392658509455, + "grad_norm": 6.696634086597803, + "learning_rate": 9.1698784595505e-06, + "loss": 3.3346, + "step": 543 + }, + { + "epoch": 0.8068223952539859, + "grad_norm": 7.649846225094083, + "learning_rate": 9.16511503346151e-06, + "loss": 3.3627, + "step": 544 + }, + { + "epoch": 0.8083055246570263, + "grad_norm": 7.319819931440595, + "learning_rate": 9.1603392242579e-06, + "loss": 3.4343, + "step": 545 + }, + { + "epoch": 0.8097886540600667, + "grad_norm": 7.6891993428807535, + "learning_rate": 9.155551046138408e-06, + "loss": 3.4624, + "step": 546 + }, + { + "epoch": 0.8112717834631071, + "grad_norm": 7.480666216270873, + "learning_rate": 9.150750513338552e-06, + "loss": 3.4602, + "step": 547 + }, + { + "epoch": 0.8127549128661475, + "grad_norm": 6.838485841623263, + "learning_rate": 9.145937640130575e-06, + "loss": 3.3187, + "step": 548 + }, + { + "epoch": 0.814238042269188, + "grad_norm": 7.283327019167655, + "learning_rate": 9.141112440823416e-06, + "loss": 3.309, + "step": 549 + }, + { + "epoch": 0.8157211716722284, + "grad_norm": 7.252447375476777, + "learning_rate": 9.136274929762657e-06, + "loss": 3.5322, + "step": 550 + }, + { + "epoch": 0.8172043010752689, + "grad_norm": 7.571522757047981, + "learning_rate": 9.131425121330477e-06, + "loss": 3.3856, + "step": 551 + }, + { + "epoch": 0.8186874304783093, + "grad_norm": 7.5714086517977375, + "learning_rate": 9.126563029945627e-06, + "loss": 3.2007, + "step": 552 + }, + { + "epoch": 0.8201705598813497, + "grad_norm": 7.23566206336203, + "learning_rate": 9.121688670063368e-06, + "loss": 3.3605, + "step": 553 + }, + { + "epoch": 0.8216536892843901, + "grad_norm": 7.575957537574694, + "learning_rate": 9.11680205617544e-06, + "loss": 3.2571, + "step": 554 + }, + { + "epoch": 0.8231368186874305, + "grad_norm": 8.330871360883748, + "learning_rate": 9.11190320281001e-06, + "loss": 3.5845, + "step": 555 + }, + { + "epoch": 0.8246199480904709, + "grad_norm": 8.426804331295374, + "learning_rate": 9.10699212453164e-06, + "loss": 3.3073, + "step": 556 + }, + { + "epoch": 0.8261030774935113, + "grad_norm": 7.2277253547953775, + "learning_rate": 9.102068835941232e-06, + "loss": 3.4849, + "step": 557 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 7.974236535331102, + "learning_rate": 9.097133351675993e-06, + "loss": 3.4483, + "step": 558 + }, + { + "epoch": 0.8290693362995921, + "grad_norm": 7.591539761958611, + "learning_rate": 9.092185686409388e-06, + "loss": 3.3414, + "step": 559 + }, + { + "epoch": 0.8305524657026325, + "grad_norm": 7.432078795893707, + "learning_rate": 9.087225854851094e-06, + "loss": 3.275, + "step": 560 + }, + { + "epoch": 0.8320355951056729, + "grad_norm": 9.413901650257367, + "learning_rate": 9.082253871746962e-06, + "loss": 3.4828, + "step": 561 + }, + { + "epoch": 0.8335187245087133, + "grad_norm": 9.19348209715371, + "learning_rate": 9.077269751878972e-06, + "loss": 3.5386, + "step": 562 + }, + { + "epoch": 0.8350018539117539, + "grad_norm": 7.734767815038607, + "learning_rate": 9.072273510065184e-06, + "loss": 3.2756, + "step": 563 + }, + { + "epoch": 0.8364849833147943, + "grad_norm": 7.131232845217595, + "learning_rate": 9.067265161159695e-06, + "loss": 3.341, + "step": 564 + }, + { + "epoch": 0.8379681127178347, + "grad_norm": 7.199106531176681, + "learning_rate": 9.062244720052603e-06, + "loss": 3.2062, + "step": 565 + }, + { + "epoch": 0.8394512421208751, + "grad_norm": 7.441508350370783, + "learning_rate": 9.057212201669952e-06, + "loss": 3.4419, + "step": 566 + }, + { + "epoch": 0.8409343715239155, + "grad_norm": 7.299245813881361, + "learning_rate": 9.052167620973697e-06, + "loss": 3.4887, + "step": 567 + }, + { + "epoch": 0.8424175009269559, + "grad_norm": 7.300274925561355, + "learning_rate": 9.047110992961648e-06, + "loss": 3.3824, + "step": 568 + }, + { + "epoch": 0.8439006303299963, + "grad_norm": 6.968324330149759, + "learning_rate": 9.042042332667439e-06, + "loss": 3.3665, + "step": 569 + }, + { + "epoch": 0.8453837597330367, + "grad_norm": 8.063274361092562, + "learning_rate": 9.036961655160474e-06, + "loss": 3.4814, + "step": 570 + }, + { + "epoch": 0.8468668891360771, + "grad_norm": 7.8001426094891215, + "learning_rate": 9.031868975545884e-06, + "loss": 3.3804, + "step": 571 + }, + { + "epoch": 0.8483500185391175, + "grad_norm": 8.032094330587642, + "learning_rate": 9.026764308964484e-06, + "loss": 3.5183, + "step": 572 + }, + { + "epoch": 0.8498331479421579, + "grad_norm": 7.979208771591406, + "learning_rate": 9.021647670592729e-06, + "loss": 3.3652, + "step": 573 + }, + { + "epoch": 0.8513162773451983, + "grad_norm": 7.018469682595828, + "learning_rate": 9.01651907564266e-06, + "loss": 3.3622, + "step": 574 + }, + { + "epoch": 0.8527994067482387, + "grad_norm": 6.979649230118227, + "learning_rate": 9.011378539361874e-06, + "loss": 3.4068, + "step": 575 + }, + { + "epoch": 0.8542825361512792, + "grad_norm": 7.535899794363741, + "learning_rate": 9.006226077033464e-06, + "loss": 3.437, + "step": 576 + }, + { + "epoch": 0.8557656655543197, + "grad_norm": 8.063871069865776, + "learning_rate": 9.001061703975983e-06, + "loss": 3.4626, + "step": 577 + }, + { + "epoch": 0.8572487949573601, + "grad_norm": 7.527338417276027, + "learning_rate": 8.995885435543395e-06, + "loss": 3.4589, + "step": 578 + }, + { + "epoch": 0.8587319243604005, + "grad_norm": 6.953827261798813, + "learning_rate": 8.990697287125028e-06, + "loss": 3.4144, + "step": 579 + }, + { + "epoch": 0.8602150537634409, + "grad_norm": 7.275636799768084, + "learning_rate": 8.985497274145531e-06, + "loss": 3.4938, + "step": 580 + }, + { + "epoch": 0.8616981831664813, + "grad_norm": 6.93600249243649, + "learning_rate": 8.980285412064827e-06, + "loss": 3.3222, + "step": 581 + }, + { + "epoch": 0.8631813125695217, + "grad_norm": 8.103136405236793, + "learning_rate": 8.975061716378066e-06, + "loss": 3.5318, + "step": 582 + }, + { + "epoch": 0.8646644419725621, + "grad_norm": 6.746270561607738, + "learning_rate": 8.969826202615583e-06, + "loss": 3.2462, + "step": 583 + }, + { + "epoch": 0.8661475713756025, + "grad_norm": 7.838271087693846, + "learning_rate": 8.964578886342842e-06, + "loss": 3.2622, + "step": 584 + }, + { + "epoch": 0.8676307007786429, + "grad_norm": 7.1601006094343385, + "learning_rate": 8.959319783160406e-06, + "loss": 3.2711, + "step": 585 + }, + { + "epoch": 0.8691138301816833, + "grad_norm": 7.22478952149325, + "learning_rate": 8.954048908703873e-06, + "loss": 3.4568, + "step": 586 + }, + { + "epoch": 0.8705969595847237, + "grad_norm": 7.381283061563745, + "learning_rate": 8.948766278643843e-06, + "loss": 3.3541, + "step": 587 + }, + { + "epoch": 0.8720800889877642, + "grad_norm": 7.60372838437291, + "learning_rate": 8.943471908685864e-06, + "loss": 3.5118, + "step": 588 + }, + { + "epoch": 0.8735632183908046, + "grad_norm": 7.538549906731478, + "learning_rate": 8.938165814570384e-06, + "loss": 3.45, + "step": 589 + }, + { + "epoch": 0.875046347793845, + "grad_norm": 7.663439532638983, + "learning_rate": 8.932848012072713e-06, + "loss": 3.4349, + "step": 590 + }, + { + "epoch": 0.8765294771968855, + "grad_norm": 7.2852141469115805, + "learning_rate": 8.92751851700297e-06, + "loss": 3.4344, + "step": 591 + }, + { + "epoch": 0.8780126065999259, + "grad_norm": 7.2524562399620525, + "learning_rate": 8.92217734520603e-06, + "loss": 3.4701, + "step": 592 + }, + { + "epoch": 0.8794957360029663, + "grad_norm": 6.854895429549145, + "learning_rate": 8.916824512561495e-06, + "loss": 3.3648, + "step": 593 + }, + { + "epoch": 0.8809788654060067, + "grad_norm": 7.3296091394105245, + "learning_rate": 8.911460034983624e-06, + "loss": 3.378, + "step": 594 + }, + { + "epoch": 0.8824619948090471, + "grad_norm": 7.720781932192655, + "learning_rate": 8.906083928421305e-06, + "loss": 3.3884, + "step": 595 + }, + { + "epoch": 0.8839451242120875, + "grad_norm": 7.466030152747836, + "learning_rate": 8.900696208857996e-06, + "loss": 3.4662, + "step": 596 + }, + { + "epoch": 0.8854282536151279, + "grad_norm": 7.70156916645016, + "learning_rate": 8.895296892311681e-06, + "loss": 3.4523, + "step": 597 + }, + { + "epoch": 0.8869113830181683, + "grad_norm": 7.617517874366238, + "learning_rate": 8.889885994834823e-06, + "loss": 3.5166, + "step": 598 + }, + { + "epoch": 0.8883945124212087, + "grad_norm": 7.988435279977703, + "learning_rate": 8.884463532514318e-06, + "loss": 3.3223, + "step": 599 + }, + { + "epoch": 0.8898776418242491, + "grad_norm": 8.13385262308737, + "learning_rate": 8.87902952147144e-06, + "loss": 3.4852, + "step": 600 + }, + { + "epoch": 0.8913607712272896, + "grad_norm": 7.629733370705478, + "learning_rate": 8.873583977861802e-06, + "loss": 3.399, + "step": 601 + }, + { + "epoch": 0.89284390063033, + "grad_norm": 7.266037410876641, + "learning_rate": 8.868126917875303e-06, + "loss": 3.4371, + "step": 602 + }, + { + "epoch": 0.8943270300333704, + "grad_norm": 7.37484772077461, + "learning_rate": 8.86265835773608e-06, + "loss": 3.4743, + "step": 603 + }, + { + "epoch": 0.8958101594364108, + "grad_norm": 7.36854973746887, + "learning_rate": 8.857178313702462e-06, + "loss": 3.3732, + "step": 604 + }, + { + "epoch": 0.8972932888394513, + "grad_norm": 6.843218591508188, + "learning_rate": 8.85168680206692e-06, + "loss": 3.1673, + "step": 605 + }, + { + "epoch": 0.8987764182424917, + "grad_norm": 7.105726880186516, + "learning_rate": 8.846183839156015e-06, + "loss": 3.2946, + "step": 606 + }, + { + "epoch": 0.9002595476455321, + "grad_norm": 6.9233229980582935, + "learning_rate": 8.840669441330361e-06, + "loss": 3.4149, + "step": 607 + }, + { + "epoch": 0.9017426770485725, + "grad_norm": 6.692287911099017, + "learning_rate": 8.835143624984558e-06, + "loss": 3.4977, + "step": 608 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 7.532028065109629, + "learning_rate": 8.829606406547167e-06, + "loss": 3.4788, + "step": 609 + }, + { + "epoch": 0.9047089358546533, + "grad_norm": 7.304037590764736, + "learning_rate": 8.824057802480637e-06, + "loss": 3.4079, + "step": 610 + }, + { + "epoch": 0.9061920652576937, + "grad_norm": 6.783915804794923, + "learning_rate": 8.818497829281272e-06, + "loss": 3.3073, + "step": 611 + }, + { + "epoch": 0.9076751946607341, + "grad_norm": 8.106629525949788, + "learning_rate": 8.812926503479175e-06, + "loss": 3.4455, + "step": 612 + }, + { + "epoch": 0.9091583240637746, + "grad_norm": 7.317822368942202, + "learning_rate": 8.807343841638205e-06, + "loss": 3.3616, + "step": 613 + }, + { + "epoch": 0.910641453466815, + "grad_norm": 7.099212394358248, + "learning_rate": 8.801749860355918e-06, + "loss": 3.2953, + "step": 614 + }, + { + "epoch": 0.9121245828698554, + "grad_norm": 7.049176237347966, + "learning_rate": 8.796144576263527e-06, + "loss": 3.4179, + "step": 615 + }, + { + "epoch": 0.9136077122728958, + "grad_norm": 7.472076369454894, + "learning_rate": 8.790528006025848e-06, + "loss": 3.3491, + "step": 616 + }, + { + "epoch": 0.9150908416759362, + "grad_norm": 7.42980163934149, + "learning_rate": 8.78490016634125e-06, + "loss": 3.3195, + "step": 617 + }, + { + "epoch": 0.9165739710789766, + "grad_norm": 7.480948350209177, + "learning_rate": 8.779261073941611e-06, + "loss": 3.5218, + "step": 618 + }, + { + "epoch": 0.918057100482017, + "grad_norm": 7.170564358031702, + "learning_rate": 8.773610745592259e-06, + "loss": 3.4276, + "step": 619 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 7.072090805530375, + "learning_rate": 8.767949198091926e-06, + "loss": 3.357, + "step": 620 + }, + { + "epoch": 0.9210233592880979, + "grad_norm": 7.020971024130809, + "learning_rate": 8.762276448272709e-06, + "loss": 3.4409, + "step": 621 + }, + { + "epoch": 0.9225064886911383, + "grad_norm": 7.240729846089699, + "learning_rate": 8.756592512999995e-06, + "loss": 3.3752, + "step": 622 + }, + { + "epoch": 0.9239896180941787, + "grad_norm": 7.203198419826792, + "learning_rate": 8.750897409172441e-06, + "loss": 3.4031, + "step": 623 + }, + { + "epoch": 0.9254727474972191, + "grad_norm": 7.3629433657058945, + "learning_rate": 8.7451911537219e-06, + "loss": 3.3932, + "step": 624 + }, + { + "epoch": 0.9269558769002596, + "grad_norm": 6.75166866494776, + "learning_rate": 8.739473763613381e-06, + "loss": 3.3598, + "step": 625 + }, + { + "epoch": 0.9284390063033, + "grad_norm": 7.411592031805779, + "learning_rate": 8.733745255844996e-06, + "loss": 3.3613, + "step": 626 + }, + { + "epoch": 0.9299221357063404, + "grad_norm": 7.0995421349794325, + "learning_rate": 8.728005647447913e-06, + "loss": 3.472, + "step": 627 + }, + { + "epoch": 0.9314052651093808, + "grad_norm": 7.52619982259025, + "learning_rate": 8.7222549554863e-06, + "loss": 3.5199, + "step": 628 + }, + { + "epoch": 0.9328883945124212, + "grad_norm": 7.515814764729413, + "learning_rate": 8.716493197057282e-06, + "loss": 3.293, + "step": 629 + }, + { + "epoch": 0.9343715239154616, + "grad_norm": 7.2001995653078845, + "learning_rate": 8.710720389290878e-06, + "loss": 3.4679, + "step": 630 + }, + { + "epoch": 0.935854653318502, + "grad_norm": 8.02806897399668, + "learning_rate": 8.70493654934996e-06, + "loss": 3.5241, + "step": 631 + }, + { + "epoch": 0.9373377827215424, + "grad_norm": 7.352009295959701, + "learning_rate": 8.699141694430207e-06, + "loss": 3.3687, + "step": 632 + }, + { + "epoch": 0.9388209121245829, + "grad_norm": 7.108487447595646, + "learning_rate": 8.693335841760033e-06, + "loss": 3.2885, + "step": 633 + }, + { + "epoch": 0.9403040415276233, + "grad_norm": 7.152973152612515, + "learning_rate": 8.687519008600555e-06, + "loss": 3.3664, + "step": 634 + }, + { + "epoch": 0.9417871709306637, + "grad_norm": 7.170717038898413, + "learning_rate": 8.68169121224554e-06, + "loss": 3.3642, + "step": 635 + }, + { + "epoch": 0.9432703003337041, + "grad_norm": 7.269469028892429, + "learning_rate": 8.675852470021344e-06, + "loss": 3.4081, + "step": 636 + }, + { + "epoch": 0.9447534297367445, + "grad_norm": 7.355639439465681, + "learning_rate": 8.670002799286866e-06, + "loss": 3.3522, + "step": 637 + }, + { + "epoch": 0.946236559139785, + "grad_norm": 7.058739165434565, + "learning_rate": 8.664142217433494e-06, + "loss": 3.2098, + "step": 638 + }, + { + "epoch": 0.9477196885428254, + "grad_norm": 8.143183969703992, + "learning_rate": 8.658270741885062e-06, + "loss": 3.3523, + "step": 639 + }, + { + "epoch": 0.9492028179458658, + "grad_norm": 7.537691294109748, + "learning_rate": 8.652388390097787e-06, + "loss": 3.5057, + "step": 640 + }, + { + "epoch": 0.9506859473489062, + "grad_norm": 6.935048291304917, + "learning_rate": 8.646495179560221e-06, + "loss": 3.3629, + "step": 641 + }, + { + "epoch": 0.9521690767519466, + "grad_norm": 7.521863805200067, + "learning_rate": 8.640591127793203e-06, + "loss": 3.4132, + "step": 642 + }, + { + "epoch": 0.953652206154987, + "grad_norm": 7.059963250839765, + "learning_rate": 8.634676252349799e-06, + "loss": 3.3729, + "step": 643 + }, + { + "epoch": 0.9551353355580274, + "grad_norm": 7.33698256342567, + "learning_rate": 8.628750570815259e-06, + "loss": 3.3414, + "step": 644 + }, + { + "epoch": 0.9566184649610678, + "grad_norm": 7.021550512240996, + "learning_rate": 8.622814100806958e-06, + "loss": 3.3746, + "step": 645 + }, + { + "epoch": 0.9581015943641082, + "grad_norm": 6.859516494418017, + "learning_rate": 8.616866859974344e-06, + "loss": 3.245, + "step": 646 + }, + { + "epoch": 0.9595847237671487, + "grad_norm": 7.309826460926291, + "learning_rate": 8.61090886599889e-06, + "loss": 3.4962, + "step": 647 + }, + { + "epoch": 0.9610678531701891, + "grad_norm": 6.531465878334422, + "learning_rate": 8.604940136594038e-06, + "loss": 3.4171, + "step": 648 + }, + { + "epoch": 0.9625509825732295, + "grad_norm": 7.800378562675778, + "learning_rate": 8.598960689505147e-06, + "loss": 3.3461, + "step": 649 + }, + { + "epoch": 0.96403411197627, + "grad_norm": 6.7555103926283, + "learning_rate": 8.59297054250944e-06, + "loss": 3.3634, + "step": 650 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 6.830465247009173, + "learning_rate": 8.586969713415949e-06, + "loss": 3.3801, + "step": 651 + }, + { + "epoch": 0.9670003707823508, + "grad_norm": 7.627369813320554, + "learning_rate": 8.58095822006547e-06, + "loss": 3.3564, + "step": 652 + }, + { + "epoch": 0.9684835001853912, + "grad_norm": 7.8338936344813455, + "learning_rate": 8.5749360803305e-06, + "loss": 3.457, + "step": 653 + }, + { + "epoch": 0.9699666295884316, + "grad_norm": 7.789041097352383, + "learning_rate": 8.56890331211519e-06, + "loss": 3.4474, + "step": 654 + }, + { + "epoch": 0.971449758991472, + "grad_norm": 7.005168191784098, + "learning_rate": 8.562859933355288e-06, + "loss": 3.249, + "step": 655 + }, + { + "epoch": 0.9729328883945124, + "grad_norm": 7.342260561601214, + "learning_rate": 8.556805962018091e-06, + "loss": 3.4149, + "step": 656 + }, + { + "epoch": 0.9744160177975528, + "grad_norm": 6.907981017456649, + "learning_rate": 8.550741416102387e-06, + "loss": 3.4241, + "step": 657 + }, + { + "epoch": 0.9758991472005932, + "grad_norm": 6.990769236054849, + "learning_rate": 8.544666313638401e-06, + "loss": 3.406, + "step": 658 + }, + { + "epoch": 0.9773822766036336, + "grad_norm": 7.0564383123575976, + "learning_rate": 8.538580672687746e-06, + "loss": 3.501, + "step": 659 + }, + { + "epoch": 0.978865406006674, + "grad_norm": 7.005239300434754, + "learning_rate": 8.532484511343361e-06, + "loss": 3.4163, + "step": 660 + }, + { + "epoch": 0.9803485354097144, + "grad_norm": 6.97714237795594, + "learning_rate": 8.526377847729475e-06, + "loss": 3.317, + "step": 661 + }, + { + "epoch": 0.981831664812755, + "grad_norm": 6.807111131374577, + "learning_rate": 8.520260700001525e-06, + "loss": 3.3869, + "step": 662 + }, + { + "epoch": 0.9833147942157954, + "grad_norm": 7.096914771791758, + "learning_rate": 8.514133086346128e-06, + "loss": 3.2948, + "step": 663 + }, + { + "epoch": 0.9847979236188358, + "grad_norm": 6.747414201204359, + "learning_rate": 8.507995024981014e-06, + "loss": 3.3352, + "step": 664 + }, + { + "epoch": 0.9862810530218762, + "grad_norm": 6.4067579225432105, + "learning_rate": 8.501846534154978e-06, + "loss": 3.3311, + "step": 665 + }, + { + "epoch": 0.9877641824249166, + "grad_norm": 8.39733088671572, + "learning_rate": 8.495687632147817e-06, + "loss": 3.4391, + "step": 666 + }, + { + "epoch": 0.989247311827957, + "grad_norm": 6.945746477425899, + "learning_rate": 8.489518337270281e-06, + "loss": 3.4723, + "step": 667 + }, + { + "epoch": 0.9907304412309974, + "grad_norm": 6.916202139362318, + "learning_rate": 8.483338667864023e-06, + "loss": 3.3078, + "step": 668 + }, + { + "epoch": 0.9922135706340378, + "grad_norm": 6.4655034190207505, + "learning_rate": 8.477148642301538e-06, + "loss": 3.3609, + "step": 669 + }, + { + "epoch": 0.9936967000370782, + "grad_norm": 8.045354280549422, + "learning_rate": 8.47094827898611e-06, + "loss": 3.5863, + "step": 670 + }, + { + "epoch": 0.9951798294401186, + "grad_norm": 7.04946687519346, + "learning_rate": 8.46473759635176e-06, + "loss": 3.2821, + "step": 671 + }, + { + "epoch": 0.996662958843159, + "grad_norm": 7.496063244272419, + "learning_rate": 8.458516612863183e-06, + "loss": 3.4115, + "step": 672 + }, + { + "epoch": 0.9981460882461994, + "grad_norm": 6.883345805487994, + "learning_rate": 8.452285347015705e-06, + "loss": 3.537, + "step": 673 + }, + { + "epoch": 0.9996292176492398, + "grad_norm": 6.9322293577317735, + "learning_rate": 8.44604381733522e-06, + "loss": 3.361, + "step": 674 + }, + { + "epoch": 1.0, + "grad_norm": 6.9322293577317735, + "learning_rate": 8.439792042378134e-06, + "loss": 3.4574, + "step": 675 + }, + { + "epoch": 1.0014831294030404, + "grad_norm": 14.756464278175445, + "learning_rate": 8.433530040731321e-06, + "loss": 2.9093, + "step": 676 + }, + { + "epoch": 1.0029662588060808, + "grad_norm": 6.913453756982226, + "learning_rate": 8.42725783101205e-06, + "loss": 2.632, + "step": 677 + }, + { + "epoch": 1.0044493882091212, + "grad_norm": 6.403191452382474, + "learning_rate": 8.420975431867945e-06, + "loss": 2.7756, + "step": 678 + }, + { + "epoch": 1.0059325176121616, + "grad_norm": 6.354469879604457, + "learning_rate": 8.414682861976923e-06, + "loss": 2.6391, + "step": 679 + }, + { + "epoch": 1.007415647015202, + "grad_norm": 7.1320299789145345, + "learning_rate": 8.408380140047134e-06, + "loss": 2.8266, + "step": 680 + }, + { + "epoch": 1.0088987764182424, + "grad_norm": 7.582434409914123, + "learning_rate": 8.402067284816919e-06, + "loss": 2.6475, + "step": 681 + }, + { + "epoch": 1.0103819058212828, + "grad_norm": 6.586958567075866, + "learning_rate": 8.39574431505474e-06, + "loss": 2.586, + "step": 682 + }, + { + "epoch": 1.0118650352243233, + "grad_norm": 6.673675381685765, + "learning_rate": 8.389411249559132e-06, + "loss": 2.7903, + "step": 683 + }, + { + "epoch": 1.0133481646273637, + "grad_norm": 6.61280092664344, + "learning_rate": 8.383068107158643e-06, + "loss": 2.552, + "step": 684 + }, + { + "epoch": 1.014831294030404, + "grad_norm": 6.956687321145202, + "learning_rate": 8.376714906711786e-06, + "loss": 2.8391, + "step": 685 + }, + { + "epoch": 1.0163144234334445, + "grad_norm": 7.569619819157358, + "learning_rate": 8.370351667106969e-06, + "loss": 2.7086, + "step": 686 + }, + { + "epoch": 1.0177975528364849, + "grad_norm": 6.7668010012091235, + "learning_rate": 8.363978407262453e-06, + "loss": 2.531, + "step": 687 + }, + { + "epoch": 1.0192806822395255, + "grad_norm": 7.203688338153521, + "learning_rate": 8.357595146126288e-06, + "loss": 2.6924, + "step": 688 + }, + { + "epoch": 1.020763811642566, + "grad_norm": 7.5850399480932, + "learning_rate": 8.351201902676258e-06, + "loss": 2.806, + "step": 689 + }, + { + "epoch": 1.0222469410456063, + "grad_norm": 6.932830650643897, + "learning_rate": 8.344798695919825e-06, + "loss": 2.7297, + "step": 690 + }, + { + "epoch": 1.0237300704486467, + "grad_norm": 7.133452041670085, + "learning_rate": 8.338385544894073e-06, + "loss": 2.6973, + "step": 691 + }, + { + "epoch": 1.0252131998516871, + "grad_norm": 7.7370328271533575, + "learning_rate": 8.331962468665649e-06, + "loss": 2.6108, + "step": 692 + }, + { + "epoch": 1.0266963292547275, + "grad_norm": 7.824583652518456, + "learning_rate": 8.32552948633071e-06, + "loss": 2.7134, + "step": 693 + }, + { + "epoch": 1.028179458657768, + "grad_norm": 7.181906541181876, + "learning_rate": 8.319086617014867e-06, + "loss": 2.5635, + "step": 694 + }, + { + "epoch": 1.0296625880608083, + "grad_norm": 6.688751127981275, + "learning_rate": 8.312633879873119e-06, + "loss": 2.4565, + "step": 695 + }, + { + "epoch": 1.0311457174638488, + "grad_norm": 7.418272698726414, + "learning_rate": 8.306171294089808e-06, + "loss": 2.6664, + "step": 696 + }, + { + "epoch": 1.0326288468668892, + "grad_norm": 7.918961125270951, + "learning_rate": 8.299698878878554e-06, + "loss": 2.7245, + "step": 697 + }, + { + "epoch": 1.0341119762699296, + "grad_norm": 7.955843915016436, + "learning_rate": 8.293216653482201e-06, + "loss": 2.6845, + "step": 698 + }, + { + "epoch": 1.03559510567297, + "grad_norm": 7.748514255495621, + "learning_rate": 8.286724637172761e-06, + "loss": 2.7777, + "step": 699 + }, + { + "epoch": 1.0370782350760104, + "grad_norm": 6.623563603202385, + "learning_rate": 8.280222849251351e-06, + "loss": 2.4522, + "step": 700 + }, + { + "epoch": 1.0385613644790508, + "grad_norm": 7.596934531432405, + "learning_rate": 8.273711309048145e-06, + "loss": 2.6357, + "step": 701 + }, + { + "epoch": 1.0400444938820912, + "grad_norm": 7.395176364558294, + "learning_rate": 8.267190035922303e-06, + "loss": 2.7867, + "step": 702 + }, + { + "epoch": 1.0415276232851316, + "grad_norm": 7.209549526135474, + "learning_rate": 8.260659049261932e-06, + "loss": 2.4961, + "step": 703 + }, + { + "epoch": 1.043010752688172, + "grad_norm": 7.247581306678462, + "learning_rate": 8.254118368484008e-06, + "loss": 2.5979, + "step": 704 + }, + { + "epoch": 1.0444938820912124, + "grad_norm": 6.7801667753628605, + "learning_rate": 8.247568013034332e-06, + "loss": 2.6991, + "step": 705 + }, + { + "epoch": 1.0459770114942528, + "grad_norm": 7.4782382052372585, + "learning_rate": 8.241008002387474e-06, + "loss": 2.5292, + "step": 706 + }, + { + "epoch": 1.0474601408972932, + "grad_norm": 7.894040076861897, + "learning_rate": 8.234438356046699e-06, + "loss": 2.7559, + "step": 707 + }, + { + "epoch": 1.0489432703003336, + "grad_norm": 7.610970384387454, + "learning_rate": 8.227859093543926e-06, + "loss": 2.9029, + "step": 708 + }, + { + "epoch": 1.050426399703374, + "grad_norm": 7.251372491665819, + "learning_rate": 8.221270234439665e-06, + "loss": 2.5646, + "step": 709 + }, + { + "epoch": 1.0519095291064144, + "grad_norm": 7.451997451305283, + "learning_rate": 8.214671798322949e-06, + "loss": 2.6136, + "step": 710 + }, + { + "epoch": 1.0533926585094548, + "grad_norm": 6.862459387155874, + "learning_rate": 8.208063804811293e-06, + "loss": 2.3676, + "step": 711 + }, + { + "epoch": 1.0548757879124953, + "grad_norm": 7.028277470194285, + "learning_rate": 8.20144627355062e-06, + "loss": 2.5689, + "step": 712 + }, + { + "epoch": 1.0563589173155359, + "grad_norm": 7.641817194040475, + "learning_rate": 8.194819224215215e-06, + "loss": 2.6907, + "step": 713 + }, + { + "epoch": 1.0578420467185763, + "grad_norm": 6.607229025375731, + "learning_rate": 8.188182676507657e-06, + "loss": 2.6023, + "step": 714 + }, + { + "epoch": 1.0593251761216167, + "grad_norm": 7.634291231120195, + "learning_rate": 8.181536650158763e-06, + "loss": 2.5174, + "step": 715 + }, + { + "epoch": 1.060808305524657, + "grad_norm": 7.367417430772135, + "learning_rate": 8.174881164927535e-06, + "loss": 2.5821, + "step": 716 + }, + { + "epoch": 1.0622914349276975, + "grad_norm": 6.799400853790991, + "learning_rate": 8.168216240601092e-06, + "loss": 2.5639, + "step": 717 + }, + { + "epoch": 1.063774564330738, + "grad_norm": 7.541209820676903, + "learning_rate": 8.161541896994622e-06, + "loss": 2.7413, + "step": 718 + }, + { + "epoch": 1.0652576937337783, + "grad_norm": 6.987648569297264, + "learning_rate": 8.15485815395131e-06, + "loss": 2.5483, + "step": 719 + }, + { + "epoch": 1.0667408231368187, + "grad_norm": 7.589515842317805, + "learning_rate": 8.14816503134229e-06, + "loss": 2.7103, + "step": 720 + }, + { + "epoch": 1.0682239525398591, + "grad_norm": 7.652217506632977, + "learning_rate": 8.141462549066581e-06, + "loss": 2.5747, + "step": 721 + }, + { + "epoch": 1.0697070819428995, + "grad_norm": 7.329114927949016, + "learning_rate": 8.134750727051032e-06, + "loss": 2.5766, + "step": 722 + }, + { + "epoch": 1.07119021134594, + "grad_norm": 7.726823788522437, + "learning_rate": 8.128029585250252e-06, + "loss": 2.5628, + "step": 723 + }, + { + "epoch": 1.0726733407489804, + "grad_norm": 7.362033133223791, + "learning_rate": 8.121299143646568e-06, + "loss": 2.6424, + "step": 724 + }, + { + "epoch": 1.0741564701520208, + "grad_norm": 7.18598873029501, + "learning_rate": 8.114559422249946e-06, + "loss": 2.629, + "step": 725 + }, + { + "epoch": 1.0756395995550612, + "grad_norm": 7.256218259141589, + "learning_rate": 8.107810441097948e-06, + "loss": 2.7019, + "step": 726 + }, + { + "epoch": 1.0771227289581016, + "grad_norm": 8.256810501691156, + "learning_rate": 8.101052220255664e-06, + "loss": 2.6977, + "step": 727 + }, + { + "epoch": 1.078605858361142, + "grad_norm": 8.958210245219146, + "learning_rate": 8.094284779815654e-06, + "loss": 3.0004, + "step": 728 + }, + { + "epoch": 1.0800889877641824, + "grad_norm": 7.524709001022201, + "learning_rate": 8.087508139897888e-06, + "loss": 2.6117, + "step": 729 + }, + { + "epoch": 1.0815721171672228, + "grad_norm": 7.097847258494094, + "learning_rate": 8.080722320649688e-06, + "loss": 2.4512, + "step": 730 + }, + { + "epoch": 1.0830552465702632, + "grad_norm": 7.932472715160055, + "learning_rate": 8.073927342245663e-06, + "loss": 2.7583, + "step": 731 + }, + { + "epoch": 1.0845383759733036, + "grad_norm": 7.969176132637275, + "learning_rate": 8.067123224887658e-06, + "loss": 2.6962, + "step": 732 + }, + { + "epoch": 1.086021505376344, + "grad_norm": 7.038159578203386, + "learning_rate": 8.060309988804681e-06, + "loss": 2.5987, + "step": 733 + }, + { + "epoch": 1.0875046347793844, + "grad_norm": 7.854380252875784, + "learning_rate": 8.053487654252861e-06, + "loss": 2.7793, + "step": 734 + }, + { + "epoch": 1.0889877641824248, + "grad_norm": 8.158059299819822, + "learning_rate": 8.04665624151537e-06, + "loss": 2.7349, + "step": 735 + }, + { + "epoch": 1.0904708935854652, + "grad_norm": 7.91684275129247, + "learning_rate": 8.039815770902368e-06, + "loss": 2.7419, + "step": 736 + }, + { + "epoch": 1.0919540229885056, + "grad_norm": 7.752395546061646, + "learning_rate": 8.032966262750951e-06, + "loss": 2.7817, + "step": 737 + }, + { + "epoch": 1.0934371523915463, + "grad_norm": 7.2927712070196105, + "learning_rate": 8.02610773742508e-06, + "loss": 2.661, + "step": 738 + }, + { + "epoch": 1.0949202817945867, + "grad_norm": 7.3501092309758125, + "learning_rate": 8.019240215315527e-06, + "loss": 2.7004, + "step": 739 + }, + { + "epoch": 1.096403411197627, + "grad_norm": 7.669888311934199, + "learning_rate": 8.01236371683981e-06, + "loss": 2.6014, + "step": 740 + }, + { + "epoch": 1.0978865406006675, + "grad_norm": 7.20537855932707, + "learning_rate": 8.005478262442132e-06, + "loss": 2.7078, + "step": 741 + }, + { + "epoch": 1.0993696700037079, + "grad_norm": 7.061061955192372, + "learning_rate": 7.998583872593329e-06, + "loss": 2.5032, + "step": 742 + }, + { + "epoch": 1.1008527994067483, + "grad_norm": 7.1651006161801405, + "learning_rate": 7.991680567790798e-06, + "loss": 2.6053, + "step": 743 + }, + { + "epoch": 1.1023359288097887, + "grad_norm": 7.694851704915138, + "learning_rate": 7.98476836855844e-06, + "loss": 2.5892, + "step": 744 + }, + { + "epoch": 1.103819058212829, + "grad_norm": 7.492600589270321, + "learning_rate": 7.977847295446602e-06, + "loss": 2.5348, + "step": 745 + }, + { + "epoch": 1.1053021876158695, + "grad_norm": 7.614888753969497, + "learning_rate": 7.970917369032011e-06, + "loss": 2.8545, + "step": 746 + }, + { + "epoch": 1.10678531701891, + "grad_norm": 7.511060147739389, + "learning_rate": 7.963978609917722e-06, + "loss": 2.4157, + "step": 747 + }, + { + "epoch": 1.1082684464219503, + "grad_norm": 7.571699796527694, + "learning_rate": 7.957031038733038e-06, + "loss": 2.6314, + "step": 748 + }, + { + "epoch": 1.1097515758249907, + "grad_norm": 7.560177626941447, + "learning_rate": 7.950074676133472e-06, + "loss": 2.612, + "step": 749 + }, + { + "epoch": 1.1112347052280311, + "grad_norm": 7.741588372110643, + "learning_rate": 7.943109542800671e-06, + "loss": 2.7674, + "step": 750 + }, + { + "epoch": 1.1127178346310715, + "grad_norm": 7.6013189323555, + "learning_rate": 7.936135659442355e-06, + "loss": 2.6605, + "step": 751 + }, + { + "epoch": 1.114200964034112, + "grad_norm": 7.482652145278554, + "learning_rate": 7.92915304679226e-06, + "loss": 2.6637, + "step": 752 + }, + { + "epoch": 1.1156840934371524, + "grad_norm": 7.404346666219344, + "learning_rate": 7.922161725610078e-06, + "loss": 2.7998, + "step": 753 + }, + { + "epoch": 1.1171672228401928, + "grad_norm": 7.844642895329257, + "learning_rate": 7.915161716681385e-06, + "loss": 2.7277, + "step": 754 + }, + { + "epoch": 1.1186503522432332, + "grad_norm": 7.864664833110861, + "learning_rate": 7.908153040817592e-06, + "loss": 2.7308, + "step": 755 + }, + { + "epoch": 1.1201334816462736, + "grad_norm": 7.765810441886353, + "learning_rate": 7.901135718855877e-06, + "loss": 2.7312, + "step": 756 + }, + { + "epoch": 1.121616611049314, + "grad_norm": 7.877331700281659, + "learning_rate": 7.894109771659118e-06, + "loss": 2.7634, + "step": 757 + }, + { + "epoch": 1.1230997404523544, + "grad_norm": 7.326482161337163, + "learning_rate": 7.887075220115843e-06, + "loss": 2.7966, + "step": 758 + }, + { + "epoch": 1.1245828698553948, + "grad_norm": 7.3994249326888175, + "learning_rate": 7.880032085140159e-06, + "loss": 2.6299, + "step": 759 + }, + { + "epoch": 1.1260659992584352, + "grad_norm": 7.505955124746553, + "learning_rate": 7.872980387671685e-06, + "loss": 2.6393, + "step": 760 + }, + { + "epoch": 1.1275491286614758, + "grad_norm": 7.500107542173576, + "learning_rate": 7.86592014867551e-06, + "loss": 2.556, + "step": 761 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 7.453716918197831, + "learning_rate": 7.85885138914211e-06, + "loss": 2.8765, + "step": 762 + }, + { + "epoch": 1.1305153874675566, + "grad_norm": 6.617161586648517, + "learning_rate": 7.851774130087287e-06, + "loss": 2.5257, + "step": 763 + }, + { + "epoch": 1.131998516870597, + "grad_norm": 8.198439288432656, + "learning_rate": 7.844688392552126e-06, + "loss": 2.7808, + "step": 764 + }, + { + "epoch": 1.1334816462736375, + "grad_norm": 8.857726212116852, + "learning_rate": 7.837594197602906e-06, + "loss": 2.6597, + "step": 765 + }, + { + "epoch": 1.1349647756766779, + "grad_norm": 7.103570149276009, + "learning_rate": 7.830491566331063e-06, + "loss": 2.516, + "step": 766 + }, + { + "epoch": 1.1364479050797183, + "grad_norm": 7.218944078097645, + "learning_rate": 7.823380519853102e-06, + "loss": 2.7351, + "step": 767 + }, + { + "epoch": 1.1379310344827587, + "grad_norm": 7.5772540850118455, + "learning_rate": 7.816261079310556e-06, + "loss": 2.7126, + "step": 768 + }, + { + "epoch": 1.139414163885799, + "grad_norm": 8.136511212328696, + "learning_rate": 7.80913326586991e-06, + "loss": 2.6302, + "step": 769 + }, + { + "epoch": 1.1408972932888395, + "grad_norm": 7.888008370166658, + "learning_rate": 7.801997100722542e-06, + "loss": 2.5241, + "step": 770 + }, + { + "epoch": 1.14238042269188, + "grad_norm": 7.4232362359526025, + "learning_rate": 7.794852605084661e-06, + "loss": 2.5761, + "step": 771 + }, + { + "epoch": 1.1438635520949203, + "grad_norm": 8.013909489625723, + "learning_rate": 7.787699800197245e-06, + "loss": 2.6608, + "step": 772 + }, + { + "epoch": 1.1453466814979607, + "grad_norm": 7.468166976199977, + "learning_rate": 7.780538707325974e-06, + "loss": 2.6008, + "step": 773 + }, + { + "epoch": 1.1468298109010011, + "grad_norm": 7.193364111364191, + "learning_rate": 7.773369347761167e-06, + "loss": 2.6049, + "step": 774 + }, + { + "epoch": 1.1483129403040415, + "grad_norm": 8.412611074715276, + "learning_rate": 7.766191742817719e-06, + "loss": 2.7711, + "step": 775 + }, + { + "epoch": 1.149796069707082, + "grad_norm": 7.103004494788614, + "learning_rate": 7.759005913835048e-06, + "loss": 2.6583, + "step": 776 + }, + { + "epoch": 1.1512791991101223, + "grad_norm": 7.86798695406255, + "learning_rate": 7.75181188217701e-06, + "loss": 2.7128, + "step": 777 + }, + { + "epoch": 1.1527623285131627, + "grad_norm": 7.8486827931342695, + "learning_rate": 7.744609669231858e-06, + "loss": 2.5114, + "step": 778 + }, + { + "epoch": 1.1542454579162031, + "grad_norm": 6.868764986211571, + "learning_rate": 7.737399296412162e-06, + "loss": 2.5079, + "step": 779 + }, + { + "epoch": 1.1557285873192435, + "grad_norm": 7.186946928798629, + "learning_rate": 7.730180785154759e-06, + "loss": 2.5633, + "step": 780 + }, + { + "epoch": 1.157211716722284, + "grad_norm": 7.291919323451493, + "learning_rate": 7.722954156920675e-06, + "loss": 2.5449, + "step": 781 + }, + { + "epoch": 1.1586948461253244, + "grad_norm": 7.260862615729957, + "learning_rate": 7.715719433195068e-06, + "loss": 2.6639, + "step": 782 + }, + { + "epoch": 1.1601779755283648, + "grad_norm": 7.464000148732533, + "learning_rate": 7.708476635487172e-06, + "loss": 2.5453, + "step": 783 + }, + { + "epoch": 1.1616611049314052, + "grad_norm": 8.111175883985315, + "learning_rate": 7.701225785330219e-06, + "loss": 2.6202, + "step": 784 + }, + { + "epoch": 1.1631442343344456, + "grad_norm": 7.9722395851557355, + "learning_rate": 7.693966904281378e-06, + "loss": 2.8597, + "step": 785 + }, + { + "epoch": 1.1646273637374862, + "grad_norm": 6.936183746255636, + "learning_rate": 7.686700013921704e-06, + "loss": 2.6221, + "step": 786 + }, + { + "epoch": 1.1661104931405264, + "grad_norm": 7.728390272522586, + "learning_rate": 7.67942513585606e-06, + "loss": 2.5929, + "step": 787 + }, + { + "epoch": 1.167593622543567, + "grad_norm": 8.295248650353933, + "learning_rate": 7.672142291713052e-06, + "loss": 2.6311, + "step": 788 + }, + { + "epoch": 1.1690767519466074, + "grad_norm": 7.425034485846196, + "learning_rate": 7.664851503144976e-06, + "loss": 2.7281, + "step": 789 + }, + { + "epoch": 1.1705598813496478, + "grad_norm": 7.907455468169775, + "learning_rate": 7.657552791827744e-06, + "loss": 2.6995, + "step": 790 + }, + { + "epoch": 1.1720430107526882, + "grad_norm": 8.151396581393362, + "learning_rate": 7.650246179460826e-06, + "loss": 2.7787, + "step": 791 + }, + { + "epoch": 1.1735261401557286, + "grad_norm": 7.019184987943653, + "learning_rate": 7.642931687767176e-06, + "loss": 2.516, + "step": 792 + }, + { + "epoch": 1.175009269558769, + "grad_norm": 7.2571096773027275, + "learning_rate": 7.63560933849318e-06, + "loss": 2.4549, + "step": 793 + }, + { + "epoch": 1.1764923989618095, + "grad_norm": 8.308795511120564, + "learning_rate": 7.628279153408582e-06, + "loss": 2.622, + "step": 794 + }, + { + "epoch": 1.1779755283648499, + "grad_norm": 7.643323329563048, + "learning_rate": 7.620941154306424e-06, + "loss": 2.7149, + "step": 795 + }, + { + "epoch": 1.1794586577678903, + "grad_norm": 8.002265270178212, + "learning_rate": 7.613595363002977e-06, + "loss": 2.6053, + "step": 796 + }, + { + "epoch": 1.1809417871709307, + "grad_norm": 8.05972693635747, + "learning_rate": 7.6062418013376795e-06, + "loss": 2.6219, + "step": 797 + }, + { + "epoch": 1.182424916573971, + "grad_norm": 7.1453316758892305, + "learning_rate": 7.598880491173075e-06, + "loss": 2.6703, + "step": 798 + }, + { + "epoch": 1.1839080459770115, + "grad_norm": 8.74984551084227, + "learning_rate": 7.591511454394736e-06, + "loss": 2.7095, + "step": 799 + }, + { + "epoch": 1.185391175380052, + "grad_norm": 7.0344038340912265, + "learning_rate": 7.5841347129112165e-06, + "loss": 2.6571, + "step": 800 + }, + { + "epoch": 1.1868743047830923, + "grad_norm": 6.989935004805803, + "learning_rate": 7.57675028865397e-06, + "loss": 2.4404, + "step": 801 + }, + { + "epoch": 1.1883574341861327, + "grad_norm": 6.605496853655883, + "learning_rate": 7.569358203577294e-06, + "loss": 2.4323, + "step": 802 + }, + { + "epoch": 1.1898405635891731, + "grad_norm": 7.928855669996584, + "learning_rate": 7.56195847965826e-06, + "loss": 2.6066, + "step": 803 + }, + { + "epoch": 1.1913236929922135, + "grad_norm": 7.534652648351735, + "learning_rate": 7.554551138896653e-06, + "loss": 2.6471, + "step": 804 + }, + { + "epoch": 1.192806822395254, + "grad_norm": 8.818960365424703, + "learning_rate": 7.547136203314901e-06, + "loss": 2.743, + "step": 805 + }, + { + "epoch": 1.1942899517982943, + "grad_norm": 7.524131460529281, + "learning_rate": 7.539713694958013e-06, + "loss": 2.6695, + "step": 806 + }, + { + "epoch": 1.1957730812013347, + "grad_norm": 6.80490134629294, + "learning_rate": 7.532283635893514e-06, + "loss": 2.7033, + "step": 807 + }, + { + "epoch": 1.1972562106043751, + "grad_norm": 7.224667982775907, + "learning_rate": 7.524846048211371e-06, + "loss": 2.5436, + "step": 808 + }, + { + "epoch": 1.1987393400074156, + "grad_norm": 7.453400753917666, + "learning_rate": 7.517400954023943e-06, + "loss": 2.7469, + "step": 809 + }, + { + "epoch": 1.200222469410456, + "grad_norm": 7.343047504834424, + "learning_rate": 7.509948375465901e-06, + "loss": 2.7866, + "step": 810 + }, + { + "epoch": 1.2017055988134966, + "grad_norm": 7.732408734295248, + "learning_rate": 7.502488334694167e-06, + "loss": 2.5481, + "step": 811 + }, + { + "epoch": 1.2031887282165368, + "grad_norm": 7.580808300121397, + "learning_rate": 7.49502085388785e-06, + "loss": 2.788, + "step": 812 + }, + { + "epoch": 1.2046718576195774, + "grad_norm": 8.13673492483867, + "learning_rate": 7.487545955248179e-06, + "loss": 2.715, + "step": 813 + }, + { + "epoch": 1.2061549870226178, + "grad_norm": 7.918686764250153, + "learning_rate": 7.480063660998438e-06, + "loss": 2.6162, + "step": 814 + }, + { + "epoch": 1.2076381164256582, + "grad_norm": 8.395084086339601, + "learning_rate": 7.472573993383893e-06, + "loss": 2.6927, + "step": 815 + }, + { + "epoch": 1.2091212458286986, + "grad_norm": 7.4767791257682505, + "learning_rate": 7.465076974671739e-06, + "loss": 2.5235, + "step": 816 + }, + { + "epoch": 1.210604375231739, + "grad_norm": 7.280137288084981, + "learning_rate": 7.4575726271510195e-06, + "loss": 2.6275, + "step": 817 + }, + { + "epoch": 1.2120875046347794, + "grad_norm": 7.517224832874892, + "learning_rate": 7.4500609731325715e-06, + "loss": 2.5561, + "step": 818 + }, + { + "epoch": 1.2135706340378198, + "grad_norm": 7.341932396380326, + "learning_rate": 7.442542034948951e-06, + "loss": 2.6016, + "step": 819 + }, + { + "epoch": 1.2150537634408602, + "grad_norm": 7.275889771273208, + "learning_rate": 7.4350158349543745e-06, + "loss": 2.5616, + "step": 820 + }, + { + "epoch": 1.2165368928439007, + "grad_norm": 7.182805778248395, + "learning_rate": 7.427482395524646e-06, + "loss": 2.5605, + "step": 821 + }, + { + "epoch": 1.218020022246941, + "grad_norm": 8.281665859296297, + "learning_rate": 7.419941739057087e-06, + "loss": 2.7134, + "step": 822 + }, + { + "epoch": 1.2195031516499815, + "grad_norm": 7.176272735660641, + "learning_rate": 7.41239388797049e-06, + "loss": 2.6489, + "step": 823 + }, + { + "epoch": 1.2209862810530219, + "grad_norm": 7.8360578104306, + "learning_rate": 7.404838864705023e-06, + "loss": 2.4683, + "step": 824 + }, + { + "epoch": 1.2224694104560623, + "grad_norm": 8.002553574183745, + "learning_rate": 7.397276691722185e-06, + "loss": 2.572, + "step": 825 + }, + { + "epoch": 1.2239525398591027, + "grad_norm": 7.685523998201807, + "learning_rate": 7.389707391504728e-06, + "loss": 2.7755, + "step": 826 + }, + { + "epoch": 1.225435669262143, + "grad_norm": 7.70486611448321, + "learning_rate": 7.382130986556597e-06, + "loss": 2.619, + "step": 827 + }, + { + "epoch": 1.2269187986651835, + "grad_norm": 7.928163228861628, + "learning_rate": 7.374547499402857e-06, + "loss": 2.4905, + "step": 828 + }, + { + "epoch": 1.228401928068224, + "grad_norm": 6.89542491416719, + "learning_rate": 7.366956952589631e-06, + "loss": 2.4747, + "step": 829 + }, + { + "epoch": 1.2298850574712643, + "grad_norm": 8.038439245340312, + "learning_rate": 7.359359368684027e-06, + "loss": 2.6219, + "step": 830 + }, + { + "epoch": 1.2313681868743047, + "grad_norm": 7.422639322116277, + "learning_rate": 7.35175477027408e-06, + "loss": 2.7507, + "step": 831 + }, + { + "epoch": 1.2328513162773451, + "grad_norm": 7.432215608649549, + "learning_rate": 7.344143179968674e-06, + "loss": 2.7673, + "step": 832 + }, + { + "epoch": 1.2343344456803855, + "grad_norm": 8.077960605097887, + "learning_rate": 7.3365246203974845e-06, + "loss": 2.6161, + "step": 833 + }, + { + "epoch": 1.235817575083426, + "grad_norm": 7.347914351711277, + "learning_rate": 7.328899114210908e-06, + "loss": 2.829, + "step": 834 + }, + { + "epoch": 1.2373007044864663, + "grad_norm": 7.697647273202459, + "learning_rate": 7.3212666840799865e-06, + "loss": 2.6413, + "step": 835 + }, + { + "epoch": 1.238783833889507, + "grad_norm": 7.2146791690481615, + "learning_rate": 7.313627352696353e-06, + "loss": 2.495, + "step": 836 + }, + { + "epoch": 1.2402669632925472, + "grad_norm": 7.501308495410181, + "learning_rate": 7.305981142772158e-06, + "loss": 2.586, + "step": 837 + }, + { + "epoch": 1.2417500926955878, + "grad_norm": 6.831337887662955, + "learning_rate": 7.298328077040002e-06, + "loss": 2.9329, + "step": 838 + }, + { + "epoch": 1.2432332220986282, + "grad_norm": 8.000220782961357, + "learning_rate": 7.290668178252869e-06, + "loss": 2.6078, + "step": 839 + }, + { + "epoch": 1.2447163515016686, + "grad_norm": 7.624608774290171, + "learning_rate": 7.283001469184052e-06, + "loss": 2.6519, + "step": 840 + }, + { + "epoch": 1.246199480904709, + "grad_norm": 7.5809363817807025, + "learning_rate": 7.2753279726271e-06, + "loss": 2.5065, + "step": 841 + }, + { + "epoch": 1.2476826103077494, + "grad_norm": 7.606627612226252, + "learning_rate": 7.267647711395738e-06, + "loss": 2.7265, + "step": 842 + }, + { + "epoch": 1.2491657397107898, + "grad_norm": 7.5079122917081555, + "learning_rate": 7.2599607083238e-06, + "loss": 2.6868, + "step": 843 + }, + { + "epoch": 1.2506488691138302, + "grad_norm": 7.748392914430649, + "learning_rate": 7.2522669862651665e-06, + "loss": 2.588, + "step": 844 + }, + { + "epoch": 1.2521319985168706, + "grad_norm": 7.200521656342836, + "learning_rate": 7.244566568093694e-06, + "loss": 2.5592, + "step": 845 + }, + { + "epoch": 1.253615127919911, + "grad_norm": 7.4272484591289825, + "learning_rate": 7.236859476703148e-06, + "loss": 2.8474, + "step": 846 + }, + { + "epoch": 1.2550982573229514, + "grad_norm": 7.30690907473671, + "learning_rate": 7.2291457350071315e-06, + "loss": 2.59, + "step": 847 + }, + { + "epoch": 1.2565813867259918, + "grad_norm": 7.1411905854223, + "learning_rate": 7.221425365939019e-06, + "loss": 2.5109, + "step": 848 + }, + { + "epoch": 1.2580645161290323, + "grad_norm": 7.5088517087430535, + "learning_rate": 7.213698392451891e-06, + "loss": 2.5935, + "step": 849 + }, + { + "epoch": 1.2595476455320727, + "grad_norm": 7.872745593897126, + "learning_rate": 7.205964837518461e-06, + "loss": 2.5402, + "step": 850 + }, + { + "epoch": 1.261030774935113, + "grad_norm": 7.064371642594142, + "learning_rate": 7.198224724131012e-06, + "loss": 2.834, + "step": 851 + }, + { + "epoch": 1.2625139043381535, + "grad_norm": 7.541838982634661, + "learning_rate": 7.190478075301323e-06, + "loss": 2.574, + "step": 852 + }, + { + "epoch": 1.2639970337411939, + "grad_norm": 7.520313628384289, + "learning_rate": 7.182724914060604e-06, + "loss": 2.4731, + "step": 853 + }, + { + "epoch": 1.2654801631442343, + "grad_norm": 7.696797548975337, + "learning_rate": 7.174965263459427e-06, + "loss": 2.6688, + "step": 854 + }, + { + "epoch": 1.2669632925472747, + "grad_norm": 7.2321589036985, + "learning_rate": 7.1671991465676584e-06, + "loss": 2.5622, + "step": 855 + }, + { + "epoch": 1.268446421950315, + "grad_norm": 7.2498940225364015, + "learning_rate": 7.159426586474388e-06, + "loss": 2.5318, + "step": 856 + }, + { + "epoch": 1.2699295513533555, + "grad_norm": 6.590467256797954, + "learning_rate": 7.151647606287861e-06, + "loss": 2.5192, + "step": 857 + }, + { + "epoch": 1.271412680756396, + "grad_norm": 7.346601456671925, + "learning_rate": 7.143862229135411e-06, + "loss": 2.6513, + "step": 858 + }, + { + "epoch": 1.2728958101594365, + "grad_norm": 8.40645703099893, + "learning_rate": 7.1360704781633884e-06, + "loss": 2.6891, + "step": 859 + }, + { + "epoch": 1.2743789395624767, + "grad_norm": 7.45363829045274, + "learning_rate": 7.128272376537097e-06, + "loss": 2.6347, + "step": 860 + }, + { + "epoch": 1.2758620689655173, + "grad_norm": 9.846297208361444, + "learning_rate": 7.120467947440719e-06, + "loss": 2.7792, + "step": 861 + }, + { + "epoch": 1.2773451983685575, + "grad_norm": 7.5120364194178375, + "learning_rate": 7.112657214077247e-06, + "loss": 2.6989, + "step": 862 + }, + { + "epoch": 1.2788283277715982, + "grad_norm": 7.474564998439737, + "learning_rate": 7.104840199668419e-06, + "loss": 2.7064, + "step": 863 + }, + { + "epoch": 1.2803114571746386, + "grad_norm": 7.353798220822734, + "learning_rate": 7.097016927454645e-06, + "loss": 2.4662, + "step": 864 + }, + { + "epoch": 1.281794586577679, + "grad_norm": 7.199623180450064, + "learning_rate": 7.089187420694943e-06, + "loss": 2.6867, + "step": 865 + }, + { + "epoch": 1.2832777159807194, + "grad_norm": 7.984247826840506, + "learning_rate": 7.081351702666863e-06, + "loss": 2.6026, + "step": 866 + }, + { + "epoch": 1.2847608453837598, + "grad_norm": 7.905236815208939, + "learning_rate": 7.073509796666422e-06, + "loss": 2.551, + "step": 867 + }, + { + "epoch": 1.2862439747868002, + "grad_norm": 7.81088981559433, + "learning_rate": 7.065661726008036e-06, + "loss": 2.5596, + "step": 868 + }, + { + "epoch": 1.2877271041898406, + "grad_norm": 7.932944747437912, + "learning_rate": 7.057807514024449e-06, + "loss": 2.6798, + "step": 869 + }, + { + "epoch": 1.289210233592881, + "grad_norm": 7.489113678515073, + "learning_rate": 7.0499471840666624e-06, + "loss": 2.5759, + "step": 870 + }, + { + "epoch": 1.2906933629959214, + "grad_norm": 7.693204748152731, + "learning_rate": 7.042080759503866e-06, + "loss": 2.5205, + "step": 871 + }, + { + "epoch": 1.2921764923989618, + "grad_norm": 7.911741581629546, + "learning_rate": 7.03420826372337e-06, + "loss": 2.4498, + "step": 872 + }, + { + "epoch": 1.2936596218020022, + "grad_norm": 7.290417154706425, + "learning_rate": 7.026329720130534e-06, + "loss": 2.7301, + "step": 873 + }, + { + "epoch": 1.2951427512050426, + "grad_norm": 7.8425655669987, + "learning_rate": 7.018445152148698e-06, + "loss": 2.8801, + "step": 874 + }, + { + "epoch": 1.296625880608083, + "grad_norm": 8.170600618831344, + "learning_rate": 7.010554583219117e-06, + "loss": 2.6219, + "step": 875 + }, + { + "epoch": 1.2981090100111234, + "grad_norm": 7.53538661975812, + "learning_rate": 7.00265803680088e-06, + "loss": 2.667, + "step": 876 + }, + { + "epoch": 1.2995921394141638, + "grad_norm": 7.845646787993774, + "learning_rate": 6.994755536370853e-06, + "loss": 2.5536, + "step": 877 + }, + { + "epoch": 1.3010752688172043, + "grad_norm": 7.48793588379386, + "learning_rate": 6.9868471054236e-06, + "loss": 2.5317, + "step": 878 + }, + { + "epoch": 1.3025583982202447, + "grad_norm": 7.677690565913934, + "learning_rate": 6.9789327674713215e-06, + "loss": 2.5215, + "step": 879 + }, + { + "epoch": 1.304041527623285, + "grad_norm": 8.500834254081374, + "learning_rate": 6.971012546043774e-06, + "loss": 2.6754, + "step": 880 + }, + { + "epoch": 1.3055246570263255, + "grad_norm": 7.973001592236973, + "learning_rate": 6.963086464688209e-06, + "loss": 2.7651, + "step": 881 + }, + { + "epoch": 1.3070077864293659, + "grad_norm": 7.623208958620058, + "learning_rate": 6.9551545469693e-06, + "loss": 2.8152, + "step": 882 + }, + { + "epoch": 1.3084909158324063, + "grad_norm": 7.462455895386634, + "learning_rate": 6.947216816469071e-06, + "loss": 2.6816, + "step": 883 + }, + { + "epoch": 1.309974045235447, + "grad_norm": 7.156622279217329, + "learning_rate": 6.939273296786831e-06, + "loss": 2.4788, + "step": 884 + }, + { + "epoch": 1.311457174638487, + "grad_norm": 7.6776335104270155, + "learning_rate": 6.931324011539096e-06, + "loss": 2.662, + "step": 885 + }, + { + "epoch": 1.3129403040415277, + "grad_norm": 8.018763315282328, + "learning_rate": 6.923368984359526e-06, + "loss": 2.7286, + "step": 886 + }, + { + "epoch": 1.314423433444568, + "grad_norm": 7.621563482524791, + "learning_rate": 6.9154082388988505e-06, + "loss": 2.5458, + "step": 887 + }, + { + "epoch": 1.3159065628476085, + "grad_norm": 7.695755714324214, + "learning_rate": 6.907441798824803e-06, + "loss": 2.7529, + "step": 888 + }, + { + "epoch": 1.317389692250649, + "grad_norm": 7.274376333547552, + "learning_rate": 6.899469687822044e-06, + "loss": 2.6301, + "step": 889 + }, + { + "epoch": 1.3188728216536894, + "grad_norm": 7.332242701010122, + "learning_rate": 6.891491929592095e-06, + "loss": 2.6982, + "step": 890 + }, + { + "epoch": 1.3203559510567298, + "grad_norm": 8.109163629388016, + "learning_rate": 6.883508547853268e-06, + "loss": 2.6599, + "step": 891 + }, + { + "epoch": 1.3218390804597702, + "grad_norm": 7.987368968565911, + "learning_rate": 6.8755195663405925e-06, + "loss": 2.626, + "step": 892 + }, + { + "epoch": 1.3233222098628106, + "grad_norm": 7.5981880848353605, + "learning_rate": 6.867525008805748e-06, + "loss": 2.6075, + "step": 893 + }, + { + "epoch": 1.324805339265851, + "grad_norm": 7.762779891669422, + "learning_rate": 6.85952489901699e-06, + "loss": 2.4159, + "step": 894 + }, + { + "epoch": 1.3262884686688914, + "grad_norm": 7.89519912850623, + "learning_rate": 6.851519260759082e-06, + "loss": 2.618, + "step": 895 + }, + { + "epoch": 1.3277715980719318, + "grad_norm": 7.7070905775285, + "learning_rate": 6.843508117833224e-06, + "loss": 2.6209, + "step": 896 + }, + { + "epoch": 1.3292547274749722, + "grad_norm": 8.356515853487165, + "learning_rate": 6.835491494056983e-06, + "loss": 2.7993, + "step": 897 + }, + { + "epoch": 1.3307378568780126, + "grad_norm": 8.547742399789547, + "learning_rate": 6.827469413264219e-06, + "loss": 2.6777, + "step": 898 + }, + { + "epoch": 1.332220986281053, + "grad_norm": 7.626243579842217, + "learning_rate": 6.819441899305017e-06, + "loss": 2.5648, + "step": 899 + }, + { + "epoch": 1.3337041156840934, + "grad_norm": 7.859068139804514, + "learning_rate": 6.811408976045613e-06, + "loss": 2.5597, + "step": 900 + }, + { + "epoch": 1.3351872450871338, + "grad_norm": 6.933640378300493, + "learning_rate": 6.8033706673683276e-06, + "loss": 2.6008, + "step": 901 + }, + { + "epoch": 1.3366703744901742, + "grad_norm": 7.851280591429177, + "learning_rate": 6.795326997171494e-06, + "loss": 2.7614, + "step": 902 + }, + { + "epoch": 1.3381535038932146, + "grad_norm": 7.543794910058923, + "learning_rate": 6.78727798936938e-06, + "loss": 2.6521, + "step": 903 + }, + { + "epoch": 1.339636633296255, + "grad_norm": 7.204690719875039, + "learning_rate": 6.779223667892127e-06, + "loss": 2.4795, + "step": 904 + }, + { + "epoch": 1.3411197626992954, + "grad_norm": 7.921184426184614, + "learning_rate": 6.771164056685674e-06, + "loss": 2.6673, + "step": 905 + }, + { + "epoch": 1.3426028921023359, + "grad_norm": 7.310640042358955, + "learning_rate": 6.763099179711685e-06, + "loss": 2.6627, + "step": 906 + }, + { + "epoch": 1.3440860215053765, + "grad_norm": 7.650910874315525, + "learning_rate": 6.755029060947478e-06, + "loss": 2.645, + "step": 907 + }, + { + "epoch": 1.3455691509084167, + "grad_norm": 7.448216400887375, + "learning_rate": 6.746953724385961e-06, + "loss": 2.5591, + "step": 908 + }, + { + "epoch": 1.3470522803114573, + "grad_norm": 7.222570639059203, + "learning_rate": 6.738873194035548e-06, + "loss": 2.8737, + "step": 909 + }, + { + "epoch": 1.3485354097144975, + "grad_norm": 8.148988013797442, + "learning_rate": 6.730787493920097e-06, + "loss": 2.5704, + "step": 910 + }, + { + "epoch": 1.350018539117538, + "grad_norm": 7.8424653781156115, + "learning_rate": 6.722696648078838e-06, + "loss": 2.534, + "step": 911 + }, + { + "epoch": 1.3515016685205783, + "grad_norm": 8.508830093230188, + "learning_rate": 6.714600680566297e-06, + "loss": 2.6389, + "step": 912 + }, + { + "epoch": 1.352984797923619, + "grad_norm": 7.159396952224177, + "learning_rate": 6.70649961545223e-06, + "loss": 2.6177, + "step": 913 + }, + { + "epoch": 1.3544679273266593, + "grad_norm": 7.549089303592334, + "learning_rate": 6.698393476821542e-06, + "loss": 2.6325, + "step": 914 + }, + { + "epoch": 1.3559510567296997, + "grad_norm": 7.582280148146, + "learning_rate": 6.690282288774229e-06, + "loss": 2.6276, + "step": 915 + }, + { + "epoch": 1.3574341861327401, + "grad_norm": 7.558614771177053, + "learning_rate": 6.682166075425298e-06, + "loss": 2.7104, + "step": 916 + }, + { + "epoch": 1.3589173155357805, + "grad_norm": 8.141204403521092, + "learning_rate": 6.674044860904692e-06, + "loss": 2.5937, + "step": 917 + }, + { + "epoch": 1.360400444938821, + "grad_norm": 7.02251901488611, + "learning_rate": 6.665918669357225e-06, + "loss": 2.7, + "step": 918 + }, + { + "epoch": 1.3618835743418614, + "grad_norm": 7.623379317303192, + "learning_rate": 6.65778752494251e-06, + "loss": 2.606, + "step": 919 + }, + { + "epoch": 1.3633667037449018, + "grad_norm": 6.929047007923183, + "learning_rate": 6.649651451834884e-06, + "loss": 2.6954, + "step": 920 + }, + { + "epoch": 1.3648498331479422, + "grad_norm": 7.262363115924987, + "learning_rate": 6.641510474223338e-06, + "loss": 2.6066, + "step": 921 + }, + { + "epoch": 1.3663329625509826, + "grad_norm": 7.296813834485141, + "learning_rate": 6.633364616311442e-06, + "loss": 2.519, + "step": 922 + }, + { + "epoch": 1.367816091954023, + "grad_norm": 7.21584950552155, + "learning_rate": 6.625213902317276e-06, + "loss": 2.6099, + "step": 923 + }, + { + "epoch": 1.3692992213570634, + "grad_norm": 7.851402411720136, + "learning_rate": 6.617058356473358e-06, + "loss": 2.5097, + "step": 924 + }, + { + "epoch": 1.3707823507601038, + "grad_norm": 7.825620549346312, + "learning_rate": 6.608898003026574e-06, + "loss": 2.6848, + "step": 925 + }, + { + "epoch": 1.3722654801631442, + "grad_norm": 7.181877422982237, + "learning_rate": 6.600732866238097e-06, + "loss": 2.4427, + "step": 926 + }, + { + "epoch": 1.3737486095661846, + "grad_norm": 7.710044486062551, + "learning_rate": 6.59256297038333e-06, + "loss": 2.5983, + "step": 927 + }, + { + "epoch": 1.375231738969225, + "grad_norm": 7.839631643275588, + "learning_rate": 6.584388339751816e-06, + "loss": 2.6214, + "step": 928 + }, + { + "epoch": 1.3767148683722654, + "grad_norm": 7.638159817122085, + "learning_rate": 6.5762089986471785e-06, + "loss": 2.6263, + "step": 929 + }, + { + "epoch": 1.3781979977753058, + "grad_norm": 7.747609109282608, + "learning_rate": 6.568024971387048e-06, + "loss": 2.7503, + "step": 930 + }, + { + "epoch": 1.3796811271783462, + "grad_norm": 6.907202712946399, + "learning_rate": 6.559836282302984e-06, + "loss": 2.5868, + "step": 931 + }, + { + "epoch": 1.3811642565813869, + "grad_norm": 7.044761753920458, + "learning_rate": 6.551642955740405e-06, + "loss": 2.5566, + "step": 932 + }, + { + "epoch": 1.382647385984427, + "grad_norm": 7.418925947968054, + "learning_rate": 6.5434450160585195e-06, + "loss": 2.4247, + "step": 933 + }, + { + "epoch": 1.3841305153874677, + "grad_norm": 7.90624041461375, + "learning_rate": 6.535242487630251e-06, + "loss": 2.686, + "step": 934 + }, + { + "epoch": 1.3856136447905079, + "grad_norm": 7.031993137110295, + "learning_rate": 6.527035394842165e-06, + "loss": 2.5415, + "step": 935 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 7.484500071219778, + "learning_rate": 6.5188237620943965e-06, + "loss": 2.6565, + "step": 936 + }, + { + "epoch": 1.3885799035965887, + "grad_norm": 7.553431801287872, + "learning_rate": 6.510607613800577e-06, + "loss": 2.6567, + "step": 937 + }, + { + "epoch": 1.3900630329996293, + "grad_norm": 7.340844271778105, + "learning_rate": 6.502386974387765e-06, + "loss": 2.5225, + "step": 938 + }, + { + "epoch": 1.3915461624026697, + "grad_norm": 6.747077231543657, + "learning_rate": 6.494161868296372e-06, + "loss": 2.4638, + "step": 939 + }, + { + "epoch": 1.3930292918057101, + "grad_norm": 8.065705636161889, + "learning_rate": 6.485932319980088e-06, + "loss": 2.6585, + "step": 940 + }, + { + "epoch": 1.3945124212087505, + "grad_norm": 8.357452763505258, + "learning_rate": 6.477698353905808e-06, + "loss": 2.8086, + "step": 941 + }, + { + "epoch": 1.395995550611791, + "grad_norm": 8.169690481896048, + "learning_rate": 6.469459994553565e-06, + "loss": 2.6484, + "step": 942 + }, + { + "epoch": 1.3974786800148313, + "grad_norm": 7.52396115410674, + "learning_rate": 6.46121726641645e-06, + "loss": 2.5818, + "step": 943 + }, + { + "epoch": 1.3989618094178717, + "grad_norm": 7.426973861655574, + "learning_rate": 6.452970194000546e-06, + "loss": 2.4932, + "step": 944 + }, + { + "epoch": 1.4004449388209121, + "grad_norm": 7.785545357552555, + "learning_rate": 6.444718801824849e-06, + "loss": 2.6897, + "step": 945 + }, + { + "epoch": 1.4019280682239526, + "grad_norm": 7.344637343578896, + "learning_rate": 6.436463114421199e-06, + "loss": 2.5642, + "step": 946 + }, + { + "epoch": 1.403411197626993, + "grad_norm": 7.758374902637188, + "learning_rate": 6.428203156334205e-06, + "loss": 2.6706, + "step": 947 + }, + { + "epoch": 1.4048943270300334, + "grad_norm": 7.121332189347917, + "learning_rate": 6.419938952121174e-06, + "loss": 2.4949, + "step": 948 + }, + { + "epoch": 1.4063774564330738, + "grad_norm": 6.843700936747663, + "learning_rate": 6.4116705263520396e-06, + "loss": 2.6552, + "step": 949 + }, + { + "epoch": 1.4078605858361142, + "grad_norm": 8.456943695936088, + "learning_rate": 6.403397903609279e-06, + "loss": 2.6144, + "step": 950 + }, + { + "epoch": 1.4093437152391546, + "grad_norm": 6.825046191354937, + "learning_rate": 6.395121108487855e-06, + "loss": 2.567, + "step": 951 + }, + { + "epoch": 1.410826844642195, + "grad_norm": 7.528847142710838, + "learning_rate": 6.386840165595131e-06, + "loss": 2.7682, + "step": 952 + }, + { + "epoch": 1.4123099740452354, + "grad_norm": 7.48209319670537, + "learning_rate": 6.378555099550803e-06, + "loss": 2.4529, + "step": 953 + }, + { + "epoch": 1.4137931034482758, + "grad_norm": 7.32613298037404, + "learning_rate": 6.370265934986823e-06, + "loss": 2.5474, + "step": 954 + }, + { + "epoch": 1.4152762328513162, + "grad_norm": 7.032750647563915, + "learning_rate": 6.361972696547333e-06, + "loss": 2.5757, + "step": 955 + }, + { + "epoch": 1.4167593622543566, + "grad_norm": 7.180726945820388, + "learning_rate": 6.353675408888582e-06, + "loss": 2.5476, + "step": 956 + }, + { + "epoch": 1.4182424916573972, + "grad_norm": 7.461067958269411, + "learning_rate": 6.3453740966788625e-06, + "loss": 2.7449, + "step": 957 + }, + { + "epoch": 1.4197256210604374, + "grad_norm": 8.38939182524137, + "learning_rate": 6.337068784598428e-06, + "loss": 2.6759, + "step": 958 + }, + { + "epoch": 1.421208750463478, + "grad_norm": 7.857772985921061, + "learning_rate": 6.328759497339423e-06, + "loss": 2.6291, + "step": 959 + }, + { + "epoch": 1.4226918798665182, + "grad_norm": 7.456374940554493, + "learning_rate": 6.320446259605815e-06, + "loss": 2.7197, + "step": 960 + }, + { + "epoch": 1.4241750092695589, + "grad_norm": 7.897973557630277, + "learning_rate": 6.312129096113313e-06, + "loss": 2.7717, + "step": 961 + }, + { + "epoch": 1.425658138672599, + "grad_norm": 8.35867117154764, + "learning_rate": 6.3038080315893e-06, + "loss": 2.6334, + "step": 962 + }, + { + "epoch": 1.4271412680756397, + "grad_norm": 7.413129496141235, + "learning_rate": 6.2954830907727545e-06, + "loss": 2.7417, + "step": 963 + }, + { + "epoch": 1.42862439747868, + "grad_norm": 8.673464294265317, + "learning_rate": 6.287154298414182e-06, + "loss": 2.7933, + "step": 964 + }, + { + "epoch": 1.4301075268817205, + "grad_norm": 7.438859305607203, + "learning_rate": 6.278821679275534e-06, + "loss": 2.5905, + "step": 965 + }, + { + "epoch": 1.431590656284761, + "grad_norm": 7.618681109630608, + "learning_rate": 6.270485258130146e-06, + "loss": 2.7046, + "step": 966 + }, + { + "epoch": 1.4330737856878013, + "grad_norm": 7.400352618561047, + "learning_rate": 6.262145059762652e-06, + "loss": 2.585, + "step": 967 + }, + { + "epoch": 1.4345569150908417, + "grad_norm": 6.866503464029429, + "learning_rate": 6.253801108968918e-06, + "loss": 2.7119, + "step": 968 + }, + { + "epoch": 1.4360400444938821, + "grad_norm": 7.190690621736264, + "learning_rate": 6.245453430555966e-06, + "loss": 2.5945, + "step": 969 + }, + { + "epoch": 1.4375231738969225, + "grad_norm": 7.6718795136745594, + "learning_rate": 6.237102049341897e-06, + "loss": 2.5745, + "step": 970 + }, + { + "epoch": 1.439006303299963, + "grad_norm": 7.877703967227689, + "learning_rate": 6.228746990155831e-06, + "loss": 2.6108, + "step": 971 + }, + { + "epoch": 1.4404894327030033, + "grad_norm": 7.521664906190522, + "learning_rate": 6.220388277837809e-06, + "loss": 2.6214, + "step": 972 + }, + { + "epoch": 1.4419725621060437, + "grad_norm": 7.181114804057131, + "learning_rate": 6.212025937238742e-06, + "loss": 2.6205, + "step": 973 + }, + { + "epoch": 1.4434556915090841, + "grad_norm": 7.900840346264698, + "learning_rate": 6.203659993220325e-06, + "loss": 2.6459, + "step": 974 + }, + { + "epoch": 1.4449388209121246, + "grad_norm": 7.440158090467247, + "learning_rate": 6.195290470654966e-06, + "loss": 2.763, + "step": 975 + }, + { + "epoch": 1.446421950315165, + "grad_norm": 7.808642320826962, + "learning_rate": 6.186917394425715e-06, + "loss": 2.6509, + "step": 976 + }, + { + "epoch": 1.4479050797182054, + "grad_norm": 7.983509031864924, + "learning_rate": 6.178540789426183e-06, + "loss": 2.7146, + "step": 977 + }, + { + "epoch": 1.4493882091212458, + "grad_norm": 7.201275241714201, + "learning_rate": 6.170160680560478e-06, + "loss": 2.6521, + "step": 978 + }, + { + "epoch": 1.4508713385242862, + "grad_norm": 7.800482606805078, + "learning_rate": 6.161777092743117e-06, + "loss": 2.5838, + "step": 979 + }, + { + "epoch": 1.4523544679273266, + "grad_norm": 7.947318879571231, + "learning_rate": 6.153390050898968e-06, + "loss": 2.6528, + "step": 980 + }, + { + "epoch": 1.453837597330367, + "grad_norm": 7.292407964776668, + "learning_rate": 6.144999579963164e-06, + "loss": 2.7104, + "step": 981 + }, + { + "epoch": 1.4553207267334076, + "grad_norm": 7.4505483932983285, + "learning_rate": 6.136605704881034e-06, + "loss": 2.5722, + "step": 982 + }, + { + "epoch": 1.4568038561364478, + "grad_norm": 7.666658608603322, + "learning_rate": 6.128208450608026e-06, + "loss": 2.7361, + "step": 983 + }, + { + "epoch": 1.4582869855394884, + "grad_norm": 7.182541937156599, + "learning_rate": 6.119807842109636e-06, + "loss": 2.4755, + "step": 984 + }, + { + "epoch": 1.4597701149425286, + "grad_norm": 7.611408320973418, + "learning_rate": 6.1114039043613335e-06, + "loss": 2.6533, + "step": 985 + }, + { + "epoch": 1.4612532443455692, + "grad_norm": 7.018752923823523, + "learning_rate": 6.102996662348485e-06, + "loss": 2.5334, + "step": 986 + }, + { + "epoch": 1.4627363737486094, + "grad_norm": 7.355348008339624, + "learning_rate": 6.09458614106628e-06, + "loss": 2.5451, + "step": 987 + }, + { + "epoch": 1.46421950315165, + "grad_norm": 7.611560927122905, + "learning_rate": 6.0861723655196554e-06, + "loss": 2.4957, + "step": 988 + }, + { + "epoch": 1.4657026325546905, + "grad_norm": 7.674781850891444, + "learning_rate": 6.0777553607232276e-06, + "loss": 2.6462, + "step": 989 + }, + { + "epoch": 1.4671857619577309, + "grad_norm": 8.009302507383385, + "learning_rate": 6.069335151701214e-06, + "loss": 2.7533, + "step": 990 + }, + { + "epoch": 1.4686688913607713, + "grad_norm": 7.734252587035851, + "learning_rate": 6.060911763487353e-06, + "loss": 2.8154, + "step": 991 + }, + { + "epoch": 1.4701520207638117, + "grad_norm": 7.735564454363332, + "learning_rate": 6.05248522112484e-06, + "loss": 2.6538, + "step": 992 + }, + { + "epoch": 1.471635150166852, + "grad_norm": 8.359728070231318, + "learning_rate": 6.044055549666245e-06, + "loss": 2.4944, + "step": 993 + }, + { + "epoch": 1.4731182795698925, + "grad_norm": 6.923066818982717, + "learning_rate": 6.035622774173442e-06, + "loss": 2.6283, + "step": 994 + }, + { + "epoch": 1.474601408972933, + "grad_norm": 7.380469122114445, + "learning_rate": 6.027186919717534e-06, + "loss": 2.6132, + "step": 995 + }, + { + "epoch": 1.4760845383759733, + "grad_norm": 7.835293549752973, + "learning_rate": 6.0187480113787765e-06, + "loss": 2.7371, + "step": 996 + }, + { + "epoch": 1.4775676677790137, + "grad_norm": 7.418880035826508, + "learning_rate": 6.010306074246506e-06, + "loss": 2.8416, + "step": 997 + }, + { + "epoch": 1.4790507971820541, + "grad_norm": 6.983606261102796, + "learning_rate": 6.001861133419062e-06, + "loss": 2.3772, + "step": 998 + }, + { + "epoch": 1.4805339265850945, + "grad_norm": 7.607052270977504, + "learning_rate": 5.993413214003716e-06, + "loss": 2.5814, + "step": 999 + }, + { + "epoch": 1.482017055988135, + "grad_norm": 7.46853072494551, + "learning_rate": 5.984962341116593e-06, + "loss": 2.5882, + "step": 1000 + }, + { + "epoch": 1.4835001853911753, + "grad_norm": 8.247266054385399, + "learning_rate": 5.976508539882604e-06, + "loss": 2.6448, + "step": 1001 + }, + { + "epoch": 1.4849833147942157, + "grad_norm": 7.619327178571397, + "learning_rate": 5.968051835435356e-06, + "loss": 2.6198, + "step": 1002 + }, + { + "epoch": 1.4864664441972562, + "grad_norm": 7.438039608614693, + "learning_rate": 5.959592252917096e-06, + "loss": 2.5695, + "step": 1003 + }, + { + "epoch": 1.4879495736002966, + "grad_norm": 7.818042847951643, + "learning_rate": 5.951129817478626e-06, + "loss": 2.6354, + "step": 1004 + }, + { + "epoch": 1.489432703003337, + "grad_norm": 7.295183284802695, + "learning_rate": 5.942664554279229e-06, + "loss": 2.4838, + "step": 1005 + }, + { + "epoch": 1.4909158324063774, + "grad_norm": 8.15627657589047, + "learning_rate": 5.934196488486594e-06, + "loss": 2.7357, + "step": 1006 + }, + { + "epoch": 1.492398961809418, + "grad_norm": 7.54916292343978, + "learning_rate": 5.925725645276744e-06, + "loss": 2.7499, + "step": 1007 + }, + { + "epoch": 1.4938820912124582, + "grad_norm": 7.757205826037745, + "learning_rate": 5.91725204983396e-06, + "loss": 2.6007, + "step": 1008 + }, + { + "epoch": 1.4953652206154988, + "grad_norm": 7.970916877633584, + "learning_rate": 5.908775727350703e-06, + "loss": 2.7328, + "step": 1009 + }, + { + "epoch": 1.496848350018539, + "grad_norm": 7.4408038961946295, + "learning_rate": 5.900296703027542e-06, + "loss": 2.6447, + "step": 1010 + }, + { + "epoch": 1.4983314794215796, + "grad_norm": 7.287201783095939, + "learning_rate": 5.891815002073081e-06, + "loss": 2.4421, + "step": 1011 + }, + { + "epoch": 1.4998146088246198, + "grad_norm": 7.9186636845204115, + "learning_rate": 5.883330649703881e-06, + "loss": 2.7324, + "step": 1012 + }, + { + "epoch": 1.5012977382276604, + "grad_norm": 6.912065149487174, + "learning_rate": 5.874843671144385e-06, + "loss": 2.6019, + "step": 1013 + }, + { + "epoch": 1.5027808676307006, + "grad_norm": 7.743238053848098, + "learning_rate": 5.866354091626842e-06, + "loss": 2.6367, + "step": 1014 + }, + { + "epoch": 1.5042639970337413, + "grad_norm": 7.76268129567675, + "learning_rate": 5.857861936391239e-06, + "loss": 2.5789, + "step": 1015 + }, + { + "epoch": 1.5057471264367817, + "grad_norm": 7.473784925683235, + "learning_rate": 5.849367230685214e-06, + "loss": 2.6606, + "step": 1016 + }, + { + "epoch": 1.507230255839822, + "grad_norm": 8.068069543654797, + "learning_rate": 5.840869999763996e-06, + "loss": 2.719, + "step": 1017 + }, + { + "epoch": 1.5087133852428625, + "grad_norm": 7.867358911350109, + "learning_rate": 5.8323702688903125e-06, + "loss": 2.8051, + "step": 1018 + }, + { + "epoch": 1.5101965146459029, + "grad_norm": 8.40888059769916, + "learning_rate": 5.8238680633343325e-06, + "loss": 2.7015, + "step": 1019 + }, + { + "epoch": 1.5116796440489433, + "grad_norm": 8.205884259364863, + "learning_rate": 5.8153634083735725e-06, + "loss": 2.8152, + "step": 1020 + }, + { + "epoch": 1.5131627734519837, + "grad_norm": 6.871191456538998, + "learning_rate": 5.806856329292839e-06, + "loss": 2.6239, + "step": 1021 + }, + { + "epoch": 1.514645902855024, + "grad_norm": 7.490619857913594, + "learning_rate": 5.7983468513841445e-06, + "loss": 2.6802, + "step": 1022 + }, + { + "epoch": 1.5161290322580645, + "grad_norm": 7.188813191668052, + "learning_rate": 5.78983499994663e-06, + "loss": 2.5896, + "step": 1023 + }, + { + "epoch": 1.517612161661105, + "grad_norm": 6.699522842703055, + "learning_rate": 5.781320800286496e-06, + "loss": 2.6983, + "step": 1024 + }, + { + "epoch": 1.5190952910641453, + "grad_norm": 8.63744246206734, + "learning_rate": 5.772804277716921e-06, + "loss": 2.5837, + "step": 1025 + }, + { + "epoch": 1.5205784204671857, + "grad_norm": 6.9556540288785875, + "learning_rate": 5.764285457557994e-06, + "loss": 2.6688, + "step": 1026 + }, + { + "epoch": 1.5220615498702261, + "grad_norm": 7.933059043792353, + "learning_rate": 5.755764365136634e-06, + "loss": 2.673, + "step": 1027 + }, + { + "epoch": 1.5235446792732668, + "grad_norm": 7.526546421529693, + "learning_rate": 5.747241025786514e-06, + "loss": 2.6704, + "step": 1028 + }, + { + "epoch": 1.525027808676307, + "grad_norm": 7.952772661352156, + "learning_rate": 5.738715464847985e-06, + "loss": 2.6272, + "step": 1029 + }, + { + "epoch": 1.5265109380793476, + "grad_norm": 7.295928804134089, + "learning_rate": 5.730187707668008e-06, + "loss": 2.6493, + "step": 1030 + }, + { + "epoch": 1.5279940674823878, + "grad_norm": 7.819273557746103, + "learning_rate": 5.721657779600071e-06, + "loss": 2.6588, + "step": 1031 + }, + { + "epoch": 1.5294771968854284, + "grad_norm": 7.050951160141296, + "learning_rate": 5.713125706004117e-06, + "loss": 2.5094, + "step": 1032 + }, + { + "epoch": 1.5309603262884686, + "grad_norm": 7.148673534842139, + "learning_rate": 5.704591512246465e-06, + "loss": 2.5442, + "step": 1033 + }, + { + "epoch": 1.5324434556915092, + "grad_norm": 8.110480119527708, + "learning_rate": 5.69605522369974e-06, + "loss": 2.7889, + "step": 1034 + }, + { + "epoch": 1.5339265850945494, + "grad_norm": 7.671631236087667, + "learning_rate": 5.687516865742795e-06, + "loss": 2.5979, + "step": 1035 + }, + { + "epoch": 1.53540971449759, + "grad_norm": 7.047804918061569, + "learning_rate": 5.678976463760635e-06, + "loss": 2.4589, + "step": 1036 + }, + { + "epoch": 1.5368928439006302, + "grad_norm": 7.449183590181193, + "learning_rate": 5.670434043144342e-06, + "loss": 2.6506, + "step": 1037 + }, + { + "epoch": 1.5383759733036708, + "grad_norm": 7.986794436835838, + "learning_rate": 5.6618896292909985e-06, + "loss": 2.7178, + "step": 1038 + }, + { + "epoch": 1.539859102706711, + "grad_norm": 7.96754160540462, + "learning_rate": 5.653343247603613e-06, + "loss": 2.6437, + "step": 1039 + }, + { + "epoch": 1.5413422321097516, + "grad_norm": 7.557302117765593, + "learning_rate": 5.644794923491048e-06, + "loss": 2.5223, + "step": 1040 + }, + { + "epoch": 1.542825361512792, + "grad_norm": 7.593123081874571, + "learning_rate": 5.636244682367937e-06, + "loss": 2.5097, + "step": 1041 + }, + { + "epoch": 1.5443084909158324, + "grad_norm": 7.507060751031951, + "learning_rate": 5.6276925496546145e-06, + "loss": 2.7612, + "step": 1042 + }, + { + "epoch": 1.5457916203188728, + "grad_norm": 8.238472746252153, + "learning_rate": 5.619138550777035e-06, + "loss": 2.5753, + "step": 1043 + }, + { + "epoch": 1.5472747497219133, + "grad_norm": 8.178940807783052, + "learning_rate": 5.61058271116671e-06, + "loss": 2.5762, + "step": 1044 + }, + { + "epoch": 1.5487578791249537, + "grad_norm": 6.967967849148484, + "learning_rate": 5.602025056260615e-06, + "loss": 2.8093, + "step": 1045 + }, + { + "epoch": 1.550241008527994, + "grad_norm": 6.902484895112419, + "learning_rate": 5.593465611501127e-06, + "loss": 2.5712, + "step": 1046 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 7.247100532521389, + "learning_rate": 5.584904402335942e-06, + "loss": 2.6035, + "step": 1047 + }, + { + "epoch": 1.5532072673340749, + "grad_norm": 7.84314425402938, + "learning_rate": 5.5763414542180035e-06, + "loss": 2.7038, + "step": 1048 + }, + { + "epoch": 1.5546903967371153, + "grad_norm": 8.007628032052589, + "learning_rate": 5.5677767926054235e-06, + "loss": 2.6108, + "step": 1049 + }, + { + "epoch": 1.5561735261401557, + "grad_norm": 6.810176129005464, + "learning_rate": 5.559210442961412e-06, + "loss": 2.6476, + "step": 1050 + }, + { + "epoch": 1.557656655543196, + "grad_norm": 7.238893003075199, + "learning_rate": 5.5506424307541895e-06, + "loss": 2.6324, + "step": 1051 + }, + { + "epoch": 1.5591397849462365, + "grad_norm": 7.494631800223103, + "learning_rate": 5.542072781456929e-06, + "loss": 2.6523, + "step": 1052 + }, + { + "epoch": 1.5606229143492771, + "grad_norm": 7.181776388196009, + "learning_rate": 5.533501520547662e-06, + "loss": 2.6985, + "step": 1053 + }, + { + "epoch": 1.5621060437523173, + "grad_norm": 7.009434559509705, + "learning_rate": 5.52492867350922e-06, + "loss": 2.7706, + "step": 1054 + }, + { + "epoch": 1.563589173155358, + "grad_norm": 7.414563912537575, + "learning_rate": 5.516354265829143e-06, + "loss": 2.5024, + "step": 1055 + }, + { + "epoch": 1.5650723025583981, + "grad_norm": 7.573887532667192, + "learning_rate": 5.507778322999615e-06, + "loss": 2.8327, + "step": 1056 + }, + { + "epoch": 1.5665554319614388, + "grad_norm": 7.6614560686249735, + "learning_rate": 5.499200870517382e-06, + "loss": 2.652, + "step": 1057 + }, + { + "epoch": 1.568038561364479, + "grad_norm": 8.682001911835714, + "learning_rate": 5.490621933883678e-06, + "loss": 2.6994, + "step": 1058 + }, + { + "epoch": 1.5695216907675196, + "grad_norm": 7.134325539813912, + "learning_rate": 5.482041538604154e-06, + "loss": 2.5983, + "step": 1059 + }, + { + "epoch": 1.5710048201705598, + "grad_norm": 7.199515929572808, + "learning_rate": 5.473459710188791e-06, + "loss": 2.6232, + "step": 1060 + }, + { + "epoch": 1.5724879495736004, + "grad_norm": 7.791959385472083, + "learning_rate": 5.464876474151835e-06, + "loss": 2.5395, + "step": 1061 + }, + { + "epoch": 1.5739710789766406, + "grad_norm": 7.463894894623018, + "learning_rate": 5.456291856011713e-06, + "loss": 2.7268, + "step": 1062 + }, + { + "epoch": 1.5754542083796812, + "grad_norm": 7.932429720766833, + "learning_rate": 5.447705881290967e-06, + "loss": 2.7309, + "step": 1063 + }, + { + "epoch": 1.5769373377827214, + "grad_norm": 7.355440758159457, + "learning_rate": 5.43911857551617e-06, + "loss": 2.7382, + "step": 1064 + }, + { + "epoch": 1.578420467185762, + "grad_norm": 7.24817220686489, + "learning_rate": 5.430529964217849e-06, + "loss": 2.673, + "step": 1065 + }, + { + "epoch": 1.5799035965888024, + "grad_norm": 7.973290673936505, + "learning_rate": 5.421940072930415e-06, + "loss": 2.7356, + "step": 1066 + }, + { + "epoch": 1.5813867259918428, + "grad_norm": 7.396971166883623, + "learning_rate": 5.413348927192085e-06, + "loss": 2.6115, + "step": 1067 + }, + { + "epoch": 1.5828698553948832, + "grad_norm": 7.281653061485157, + "learning_rate": 5.404756552544804e-06, + "loss": 2.5191, + "step": 1068 + }, + { + "epoch": 1.5843529847979236, + "grad_norm": 7.952410133424926, + "learning_rate": 5.396162974534173e-06, + "loss": 2.8078, + "step": 1069 + }, + { + "epoch": 1.585836114200964, + "grad_norm": 7.680759636940419, + "learning_rate": 5.3875682187093685e-06, + "loss": 2.6021, + "step": 1070 + }, + { + "epoch": 1.5873192436040044, + "grad_norm": 7.615556808858322, + "learning_rate": 5.3789723106230675e-06, + "loss": 2.3874, + "step": 1071 + }, + { + "epoch": 1.5888023730070449, + "grad_norm": 7.058223057482544, + "learning_rate": 5.370375275831377e-06, + "loss": 2.5969, + "step": 1072 + }, + { + "epoch": 1.5902855024100853, + "grad_norm": 7.016943085378889, + "learning_rate": 5.361777139893748e-06, + "loss": 2.6042, + "step": 1073 + }, + { + "epoch": 1.5917686318131257, + "grad_norm": 7.212340625795994, + "learning_rate": 5.3531779283729124e-06, + "loss": 2.7586, + "step": 1074 + }, + { + "epoch": 1.593251761216166, + "grad_norm": 7.268862455207194, + "learning_rate": 5.344577666834795e-06, + "loss": 2.587, + "step": 1075 + }, + { + "epoch": 1.5947348906192065, + "grad_norm": 7.365467177682772, + "learning_rate": 5.3359763808484396e-06, + "loss": 2.813, + "step": 1076 + }, + { + "epoch": 1.5962180200222469, + "grad_norm": 7.256666818423536, + "learning_rate": 5.327374095985944e-06, + "loss": 2.5884, + "step": 1077 + }, + { + "epoch": 1.5977011494252875, + "grad_norm": 7.492446370763703, + "learning_rate": 5.318770837822371e-06, + "loss": 2.8181, + "step": 1078 + }, + { + "epoch": 1.5991842788283277, + "grad_norm": 7.285474518016684, + "learning_rate": 5.310166631935676e-06, + "loss": 2.6029, + "step": 1079 + }, + { + "epoch": 1.6006674082313683, + "grad_norm": 7.682578667465408, + "learning_rate": 5.301561503906632e-06, + "loss": 2.5765, + "step": 1080 + }, + { + "epoch": 1.6021505376344085, + "grad_norm": 7.584762015438338, + "learning_rate": 5.292955479318756e-06, + "loss": 2.5904, + "step": 1081 + }, + { + "epoch": 1.6036336670374491, + "grad_norm": 7.139219281826187, + "learning_rate": 5.284348583758231e-06, + "loss": 2.6246, + "step": 1082 + }, + { + "epoch": 1.6051167964404893, + "grad_norm": 7.1966190157510495, + "learning_rate": 5.275740842813827e-06, + "loss": 2.6029, + "step": 1083 + }, + { + "epoch": 1.60659992584353, + "grad_norm": 7.96467702317325, + "learning_rate": 5.267132282076826e-06, + "loss": 2.666, + "step": 1084 + }, + { + "epoch": 1.6080830552465701, + "grad_norm": 7.963961276045941, + "learning_rate": 5.258522927140952e-06, + "loss": 2.5922, + "step": 1085 + }, + { + "epoch": 1.6095661846496108, + "grad_norm": 7.369913830332525, + "learning_rate": 5.249912803602287e-06, + "loss": 2.5573, + "step": 1086 + }, + { + "epoch": 1.611049314052651, + "grad_norm": 8.063803432571623, + "learning_rate": 5.2413019370592e-06, + "loss": 2.9028, + "step": 1087 + }, + { + "epoch": 1.6125324434556916, + "grad_norm": 7.351740843143062, + "learning_rate": 5.232690353112265e-06, + "loss": 2.637, + "step": 1088 + }, + { + "epoch": 1.6140155728587318, + "grad_norm": 7.217215066599258, + "learning_rate": 5.224078077364195e-06, + "loss": 2.5129, + "step": 1089 + }, + { + "epoch": 1.6154987022617724, + "grad_norm": 7.665156675343125, + "learning_rate": 5.215465135419754e-06, + "loss": 2.7467, + "step": 1090 + }, + { + "epoch": 1.6169818316648128, + "grad_norm": 7.7463267744350635, + "learning_rate": 5.206851552885691e-06, + "loss": 2.7025, + "step": 1091 + }, + { + "epoch": 1.6184649610678532, + "grad_norm": 7.721035229698619, + "learning_rate": 5.198237355370655e-06, + "loss": 2.6363, + "step": 1092 + }, + { + "epoch": 1.6199480904708936, + "grad_norm": 6.926034156100187, + "learning_rate": 5.18962256848513e-06, + "loss": 2.6046, + "step": 1093 + }, + { + "epoch": 1.621431219873934, + "grad_norm": 7.866647851472695, + "learning_rate": 5.181007217841344e-06, + "loss": 2.6528, + "step": 1094 + }, + { + "epoch": 1.6229143492769744, + "grad_norm": 6.679388699645385, + "learning_rate": 5.172391329053207e-06, + "loss": 2.4447, + "step": 1095 + }, + { + "epoch": 1.6243974786800148, + "grad_norm": 7.514682391505155, + "learning_rate": 5.163774927736228e-06, + "loss": 2.5152, + "step": 1096 + }, + { + "epoch": 1.6258806080830552, + "grad_norm": 6.891498820883739, + "learning_rate": 5.15515803950744e-06, + "loss": 2.494, + "step": 1097 + }, + { + "epoch": 1.6273637374860956, + "grad_norm": 7.259100611495041, + "learning_rate": 5.146540689985319e-06, + "loss": 2.5529, + "step": 1098 + }, + { + "epoch": 1.628846866889136, + "grad_norm": 7.247497140902815, + "learning_rate": 5.137922904789718e-06, + "loss": 2.453, + "step": 1099 + }, + { + "epoch": 1.6303299962921765, + "grad_norm": 7.455655942320046, + "learning_rate": 5.129304709541784e-06, + "loss": 2.7932, + "step": 1100 + }, + { + "epoch": 1.6318131256952169, + "grad_norm": 7.6863500078241636, + "learning_rate": 5.120686129863882e-06, + "loss": 2.6519, + "step": 1101 + }, + { + "epoch": 1.6332962550982573, + "grad_norm": 6.865337204481967, + "learning_rate": 5.1120671913795206e-06, + "loss": 2.504, + "step": 1102 + }, + { + "epoch": 1.634779384501298, + "grad_norm": 7.255175217499525, + "learning_rate": 5.103447919713274e-06, + "loss": 2.4854, + "step": 1103 + }, + { + "epoch": 1.636262513904338, + "grad_norm": 7.218005338599912, + "learning_rate": 5.094828340490707e-06, + "loss": 2.5337, + "step": 1104 + }, + { + "epoch": 1.6377456433073787, + "grad_norm": 7.787542377449046, + "learning_rate": 5.086208479338304e-06, + "loss": 2.5284, + "step": 1105 + }, + { + "epoch": 1.639228772710419, + "grad_norm": 6.886895959119668, + "learning_rate": 5.077588361883379e-06, + "loss": 2.5655, + "step": 1106 + }, + { + "epoch": 1.6407119021134595, + "grad_norm": 7.2647805642147025, + "learning_rate": 5.068968013754015e-06, + "loss": 2.6138, + "step": 1107 + }, + { + "epoch": 1.6421950315164997, + "grad_norm": 8.07524491347167, + "learning_rate": 5.060347460578976e-06, + "loss": 2.6894, + "step": 1108 + }, + { + "epoch": 1.6436781609195403, + "grad_norm": 7.329502176959184, + "learning_rate": 5.051726727987639e-06, + "loss": 2.6685, + "step": 1109 + }, + { + "epoch": 1.6451612903225805, + "grad_norm": 8.174770616506022, + "learning_rate": 5.043105841609912e-06, + "loss": 2.6732, + "step": 1110 + }, + { + "epoch": 1.6466444197256211, + "grad_norm": 7.2376518326919745, + "learning_rate": 5.0344848270761635e-06, + "loss": 2.6711, + "step": 1111 + }, + { + "epoch": 1.6481275491286613, + "grad_norm": 7.503117728925517, + "learning_rate": 5.02586371001714e-06, + "loss": 2.6619, + "step": 1112 + }, + { + "epoch": 1.649610678531702, + "grad_norm": 7.5276290459763455, + "learning_rate": 5.017242516063891e-06, + "loss": 2.6141, + "step": 1113 + }, + { + "epoch": 1.6510938079347421, + "grad_norm": 8.049557122171734, + "learning_rate": 5.0086212708477e-06, + "loss": 2.6955, + "step": 1114 + }, + { + "epoch": 1.6525769373377828, + "grad_norm": 7.021785705807164, + "learning_rate": 5e-06, + "loss": 2.6385, + "step": 1115 + }, + { + "epoch": 1.6540600667408232, + "grad_norm": 7.010162815402169, + "learning_rate": 4.9913787291523e-06, + "loss": 2.6213, + "step": 1116 + }, + { + "epoch": 1.6555431961438636, + "grad_norm": 7.40373275470094, + "learning_rate": 4.98275748393611e-06, + "loss": 2.7577, + "step": 1117 + }, + { + "epoch": 1.657026325546904, + "grad_norm": 8.424075865434029, + "learning_rate": 4.974136289982862e-06, + "loss": 2.7095, + "step": 1118 + }, + { + "epoch": 1.6585094549499444, + "grad_norm": 7.823668001993629, + "learning_rate": 4.9655151729238365e-06, + "loss": 2.6614, + "step": 1119 + }, + { + "epoch": 1.6599925843529848, + "grad_norm": 7.238854264281595, + "learning_rate": 4.9568941583900884e-06, + "loss": 2.6647, + "step": 1120 + }, + { + "epoch": 1.6614757137560252, + "grad_norm": 7.574893856188629, + "learning_rate": 4.948273272012363e-06, + "loss": 2.4856, + "step": 1121 + }, + { + "epoch": 1.6629588431590656, + "grad_norm": 7.796552355060217, + "learning_rate": 4.9396525394210256e-06, + "loss": 2.59, + "step": 1122 + }, + { + "epoch": 1.664441972562106, + "grad_norm": 7.6961811482884945, + "learning_rate": 4.9310319862459865e-06, + "loss": 2.6009, + "step": 1123 + }, + { + "epoch": 1.6659251019651464, + "grad_norm": 7.601138136440337, + "learning_rate": 4.922411638116622e-06, + "loss": 2.7143, + "step": 1124 + }, + { + "epoch": 1.6674082313681868, + "grad_norm": 7.455811172221146, + "learning_rate": 4.913791520661699e-06, + "loss": 2.5067, + "step": 1125 + }, + { + "epoch": 1.6688913607712272, + "grad_norm": 7.570493652342521, + "learning_rate": 4.905171659509294e-06, + "loss": 2.6715, + "step": 1126 + }, + { + "epoch": 1.6703744901742676, + "grad_norm": 6.938941185521578, + "learning_rate": 4.896552080286728e-06, + "loss": 2.543, + "step": 1127 + }, + { + "epoch": 1.6718576195773083, + "grad_norm": 7.20098536770145, + "learning_rate": 4.887932808620483e-06, + "loss": 2.559, + "step": 1128 + }, + { + "epoch": 1.6733407489803485, + "grad_norm": 8.038517429738508, + "learning_rate": 4.87931387013612e-06, + "loss": 2.7752, + "step": 1129 + }, + { + "epoch": 1.674823878383389, + "grad_norm": 7.3654324006215015, + "learning_rate": 4.870695290458217e-06, + "loss": 2.6896, + "step": 1130 + }, + { + "epoch": 1.6763070077864293, + "grad_norm": 7.4852801449275574, + "learning_rate": 4.862077095210284e-06, + "loss": 2.5185, + "step": 1131 + }, + { + "epoch": 1.67779013718947, + "grad_norm": 7.594859086313013, + "learning_rate": 4.853459310014683e-06, + "loss": 2.6235, + "step": 1132 + }, + { + "epoch": 1.67927326659251, + "grad_norm": 7.756794874074127, + "learning_rate": 4.8448419604925636e-06, + "loss": 2.6903, + "step": 1133 + }, + { + "epoch": 1.6807563959955507, + "grad_norm": 7.12866818800397, + "learning_rate": 4.8362250722637735e-06, + "loss": 2.5831, + "step": 1134 + }, + { + "epoch": 1.682239525398591, + "grad_norm": 8.010217337317963, + "learning_rate": 4.827608670946794e-06, + "loss": 2.7106, + "step": 1135 + }, + { + "epoch": 1.6837226548016315, + "grad_norm": 7.755319213111724, + "learning_rate": 4.818992782158658e-06, + "loss": 2.5828, + "step": 1136 + }, + { + "epoch": 1.6852057842046717, + "grad_norm": 7.314301912652033, + "learning_rate": 4.810377431514871e-06, + "loss": 2.5473, + "step": 1137 + }, + { + "epoch": 1.6866889136077123, + "grad_norm": 7.534613519797203, + "learning_rate": 4.8017626446293446e-06, + "loss": 2.5231, + "step": 1138 + }, + { + "epoch": 1.6881720430107527, + "grad_norm": 7.587761901024501, + "learning_rate": 4.793148447114311e-06, + "loss": 2.7445, + "step": 1139 + }, + { + "epoch": 1.6896551724137931, + "grad_norm": 7.944650153948703, + "learning_rate": 4.784534864580247e-06, + "loss": 2.7671, + "step": 1140 + }, + { + "epoch": 1.6911383018168336, + "grad_norm": 7.479506025613036, + "learning_rate": 4.775921922635806e-06, + "loss": 2.5898, + "step": 1141 + }, + { + "epoch": 1.692621431219874, + "grad_norm": 7.020267848721467, + "learning_rate": 4.767309646887736e-06, + "loss": 2.6823, + "step": 1142 + }, + { + "epoch": 1.6941045606229144, + "grad_norm": 7.235559461413167, + "learning_rate": 4.758698062940801e-06, + "loss": 2.6887, + "step": 1143 + }, + { + "epoch": 1.6955876900259548, + "grad_norm": 7.451520669406084, + "learning_rate": 4.750087196397715e-06, + "loss": 2.5242, + "step": 1144 + }, + { + "epoch": 1.6970708194289952, + "grad_norm": 7.772193331826746, + "learning_rate": 4.741477072859049e-06, + "loss": 2.637, + "step": 1145 + }, + { + "epoch": 1.6985539488320356, + "grad_norm": 7.1486121883572356, + "learning_rate": 4.732867717923174e-06, + "loss": 2.6001, + "step": 1146 + }, + { + "epoch": 1.700037078235076, + "grad_norm": 7.714952077020558, + "learning_rate": 4.724259157186176e-06, + "loss": 2.7307, + "step": 1147 + }, + { + "epoch": 1.7015202076381164, + "grad_norm": 7.75306924929624, + "learning_rate": 4.715651416241771e-06, + "loss": 2.5378, + "step": 1148 + }, + { + "epoch": 1.7030033370411568, + "grad_norm": 7.893112075942268, + "learning_rate": 4.707044520681245e-06, + "loss": 2.6684, + "step": 1149 + }, + { + "epoch": 1.7044864664441972, + "grad_norm": 8.308824317337532, + "learning_rate": 4.698438496093369e-06, + "loss": 2.7962, + "step": 1150 + }, + { + "epoch": 1.7059695958472376, + "grad_norm": 7.575564867884775, + "learning_rate": 4.689833368064326e-06, + "loss": 2.752, + "step": 1151 + }, + { + "epoch": 1.707452725250278, + "grad_norm": 7.776573174931593, + "learning_rate": 4.681229162177629e-06, + "loss": 2.5589, + "step": 1152 + }, + { + "epoch": 1.7089358546533187, + "grad_norm": 7.1581568540506835, + "learning_rate": 4.672625904014057e-06, + "loss": 2.6307, + "step": 1153 + }, + { + "epoch": 1.7104189840563588, + "grad_norm": 7.209010738458278, + "learning_rate": 4.664023619151561e-06, + "loss": 2.68, + "step": 1154 + }, + { + "epoch": 1.7119021134593995, + "grad_norm": 7.4969051806324325, + "learning_rate": 4.655422333165208e-06, + "loss": 2.6919, + "step": 1155 + }, + { + "epoch": 1.7133852428624397, + "grad_norm": 7.493797794833906, + "learning_rate": 4.646822071627089e-06, + "loss": 2.694, + "step": 1156 + }, + { + "epoch": 1.7148683722654803, + "grad_norm": 6.806500818640617, + "learning_rate": 4.638222860106252e-06, + "loss": 2.5601, + "step": 1157 + }, + { + "epoch": 1.7163515016685205, + "grad_norm": 6.701327643374928, + "learning_rate": 4.6296247241686265e-06, + "loss": 2.5167, + "step": 1158 + }, + { + "epoch": 1.717834631071561, + "grad_norm": 6.6329447103977355, + "learning_rate": 4.621027689376934e-06, + "loss": 2.6084, + "step": 1159 + }, + { + "epoch": 1.7193177604746013, + "grad_norm": 7.455182174583054, + "learning_rate": 4.612431781290632e-06, + "loss": 2.4819, + "step": 1160 + }, + { + "epoch": 1.720800889877642, + "grad_norm": 7.742110401980965, + "learning_rate": 4.603837025465829e-06, + "loss": 2.6803, + "step": 1161 + }, + { + "epoch": 1.722284019280682, + "grad_norm": 7.705673887310891, + "learning_rate": 4.595243447455197e-06, + "loss": 2.6384, + "step": 1162 + }, + { + "epoch": 1.7237671486837227, + "grad_norm": 7.519495561991448, + "learning_rate": 4.586651072807916e-06, + "loss": 2.4323, + "step": 1163 + }, + { + "epoch": 1.7252502780867631, + "grad_norm": 7.065277939994052, + "learning_rate": 4.5780599270695866e-06, + "loss": 2.5524, + "step": 1164 + }, + { + "epoch": 1.7267334074898035, + "grad_norm": 7.880143053524115, + "learning_rate": 4.569470035782153e-06, + "loss": 2.6714, + "step": 1165 + }, + { + "epoch": 1.728216536892844, + "grad_norm": 7.211838814147003, + "learning_rate": 4.560881424483833e-06, + "loss": 2.3647, + "step": 1166 + }, + { + "epoch": 1.7296996662958843, + "grad_norm": 7.40358276976979, + "learning_rate": 4.552294118709034e-06, + "loss": 2.6626, + "step": 1167 + }, + { + "epoch": 1.7311827956989247, + "grad_norm": 7.444849752406141, + "learning_rate": 4.543708143988288e-06, + "loss": 2.6792, + "step": 1168 + }, + { + "epoch": 1.7326659251019652, + "grad_norm": 7.52747677451145, + "learning_rate": 4.5351235258481685e-06, + "loss": 2.5427, + "step": 1169 + }, + { + "epoch": 1.7341490545050056, + "grad_norm": 8.230366507903664, + "learning_rate": 4.526540289811211e-06, + "loss": 2.5538, + "step": 1170 + }, + { + "epoch": 1.735632183908046, + "grad_norm": 6.996224574101844, + "learning_rate": 4.517958461395846e-06, + "loss": 2.6434, + "step": 1171 + }, + { + "epoch": 1.7371153133110864, + "grad_norm": 7.098201613174376, + "learning_rate": 4.509378066116323e-06, + "loss": 2.6402, + "step": 1172 + }, + { + "epoch": 1.7385984427141268, + "grad_norm": 7.162470088631321, + "learning_rate": 4.50079912948262e-06, + "loss": 2.5514, + "step": 1173 + }, + { + "epoch": 1.7400815721171672, + "grad_norm": 7.281727399730713, + "learning_rate": 4.492221677000388e-06, + "loss": 2.5272, + "step": 1174 + }, + { + "epoch": 1.7415647015202076, + "grad_norm": 7.9369106934631075, + "learning_rate": 4.483645734170858e-06, + "loss": 2.7254, + "step": 1175 + }, + { + "epoch": 1.743047830923248, + "grad_norm": 7.444363414494004, + "learning_rate": 4.475071326490781e-06, + "loss": 2.6317, + "step": 1176 + }, + { + "epoch": 1.7445309603262884, + "grad_norm": 8.12136766045917, + "learning_rate": 4.466498479452339e-06, + "loss": 2.7061, + "step": 1177 + }, + { + "epoch": 1.746014089729329, + "grad_norm": 7.55944985004547, + "learning_rate": 4.457927218543073e-06, + "loss": 2.4953, + "step": 1178 + }, + { + "epoch": 1.7474972191323692, + "grad_norm": 7.40082056466059, + "learning_rate": 4.449357569245811e-06, + "loss": 2.6162, + "step": 1179 + }, + { + "epoch": 1.7489803485354098, + "grad_norm": 7.801114008610061, + "learning_rate": 4.440789557038591e-06, + "loss": 2.6297, + "step": 1180 + }, + { + "epoch": 1.75046347793845, + "grad_norm": 8.137567796345333, + "learning_rate": 4.432223207394577e-06, + "loss": 2.5629, + "step": 1181 + }, + { + "epoch": 1.7519466073414907, + "grad_norm": 7.4207141531886, + "learning_rate": 4.423658545781997e-06, + "loss": 2.5768, + "step": 1182 + }, + { + "epoch": 1.7534297367445308, + "grad_norm": 7.187139374151907, + "learning_rate": 4.415095597664059e-06, + "loss": 2.654, + "step": 1183 + }, + { + "epoch": 1.7549128661475715, + "grad_norm": 7.200098349602716, + "learning_rate": 4.4065343884988735e-06, + "loss": 2.582, + "step": 1184 + }, + { + "epoch": 1.7563959955506117, + "grad_norm": 7.013104955339882, + "learning_rate": 4.397974943739387e-06, + "loss": 2.5374, + "step": 1185 + }, + { + "epoch": 1.7578791249536523, + "grad_norm": 7.569023627633825, + "learning_rate": 4.389417288833292e-06, + "loss": 2.5665, + "step": 1186 + }, + { + "epoch": 1.7593622543566925, + "grad_norm": 7.404685166190474, + "learning_rate": 4.380861449222965e-06, + "loss": 2.6484, + "step": 1187 + }, + { + "epoch": 1.760845383759733, + "grad_norm": 7.780423845194966, + "learning_rate": 4.372307450345389e-06, + "loss": 2.5232, + "step": 1188 + }, + { + "epoch": 1.7623285131627735, + "grad_norm": 7.389160125487332, + "learning_rate": 4.3637553176320645e-06, + "loss": 2.6254, + "step": 1189 + }, + { + "epoch": 1.763811642565814, + "grad_norm": 7.311682813590182, + "learning_rate": 4.355205076508953e-06, + "loss": 2.5859, + "step": 1190 + }, + { + "epoch": 1.7652947719688543, + "grad_norm": 7.216731763328469, + "learning_rate": 4.346656752396388e-06, + "loss": 2.443, + "step": 1191 + }, + { + "epoch": 1.7667779013718947, + "grad_norm": 7.4227326439445, + "learning_rate": 4.338110370709003e-06, + "loss": 2.7292, + "step": 1192 + }, + { + "epoch": 1.7682610307749351, + "grad_norm": 7.439643978596773, + "learning_rate": 4.329565956855659e-06, + "loss": 2.6834, + "step": 1193 + }, + { + "epoch": 1.7697441601779755, + "grad_norm": 6.9458572222055, + "learning_rate": 4.321023536239367e-06, + "loss": 2.4566, + "step": 1194 + }, + { + "epoch": 1.771227289581016, + "grad_norm": 8.029490002736553, + "learning_rate": 4.3124831342572064e-06, + "loss": 2.7542, + "step": 1195 + }, + { + "epoch": 1.7727104189840563, + "grad_norm": 8.26353458084028, + "learning_rate": 4.303944776300262e-06, + "loss": 2.6633, + "step": 1196 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 7.383819656956144, + "learning_rate": 4.295408487753537e-06, + "loss": 2.5327, + "step": 1197 + }, + { + "epoch": 1.7756766777901372, + "grad_norm": 7.480416209768692, + "learning_rate": 4.286874293995885e-06, + "loss": 2.6224, + "step": 1198 + }, + { + "epoch": 1.7771598071931776, + "grad_norm": 7.410713357162025, + "learning_rate": 4.278342220399931e-06, + "loss": 2.7195, + "step": 1199 + }, + { + "epoch": 1.778642936596218, + "grad_norm": 7.1699131066985045, + "learning_rate": 4.269812292331993e-06, + "loss": 2.7236, + "step": 1200 + }, + { + "epoch": 1.7801260659992586, + "grad_norm": 7.196961005603686, + "learning_rate": 4.261284535152016e-06, + "loss": 2.4664, + "step": 1201 + }, + { + "epoch": 1.7816091954022988, + "grad_norm": 8.143938646476515, + "learning_rate": 4.2527589742134885e-06, + "loss": 2.6765, + "step": 1202 + }, + { + "epoch": 1.7830923248053394, + "grad_norm": 7.753927567730745, + "learning_rate": 4.2442356348633665e-06, + "loss": 2.6301, + "step": 1203 + }, + { + "epoch": 1.7845754542083796, + "grad_norm": 7.141306929712399, + "learning_rate": 4.235714542442006e-06, + "loss": 2.562, + "step": 1204 + }, + { + "epoch": 1.7860585836114202, + "grad_norm": 7.721838602278573, + "learning_rate": 4.22719572228308e-06, + "loss": 2.6166, + "step": 1205 + }, + { + "epoch": 1.7875417130144604, + "grad_norm": 8.012294979441725, + "learning_rate": 4.218679199713505e-06, + "loss": 2.4832, + "step": 1206 + }, + { + "epoch": 1.789024842417501, + "grad_norm": 7.037072153931807, + "learning_rate": 4.2101650000533724e-06, + "loss": 2.7573, + "step": 1207 + }, + { + "epoch": 1.7905079718205412, + "grad_norm": 8.779771837262302, + "learning_rate": 4.201653148615857e-06, + "loss": 2.6993, + "step": 1208 + }, + { + "epoch": 1.7919911012235819, + "grad_norm": 7.389035976961033, + "learning_rate": 4.1931436707071615e-06, + "loss": 2.5469, + "step": 1209 + }, + { + "epoch": 1.793474230626622, + "grad_norm": 7.066337358614287, + "learning_rate": 4.184636591626429e-06, + "loss": 2.6628, + "step": 1210 + }, + { + "epoch": 1.7949573600296627, + "grad_norm": 7.663910965202793, + "learning_rate": 4.176131936665669e-06, + "loss": 2.6406, + "step": 1211 + }, + { + "epoch": 1.7964404894327028, + "grad_norm": 8.132458698134343, + "learning_rate": 4.167629731109687e-06, + "loss": 2.7199, + "step": 1212 + }, + { + "epoch": 1.7979236188357435, + "grad_norm": 7.4109258205648665, + "learning_rate": 4.1591300002360055e-06, + "loss": 2.6454, + "step": 1213 + }, + { + "epoch": 1.7994067482387839, + "grad_norm": 7.755800577237642, + "learning_rate": 4.1506327693147865e-06, + "loss": 2.6449, + "step": 1214 + }, + { + "epoch": 1.8008898776418243, + "grad_norm": 7.430668468872214, + "learning_rate": 4.142138063608763e-06, + "loss": 2.6922, + "step": 1215 + }, + { + "epoch": 1.8023730070448647, + "grad_norm": 8.023629317277779, + "learning_rate": 4.133645908373159e-06, + "loss": 2.642, + "step": 1216 + }, + { + "epoch": 1.803856136447905, + "grad_norm": 7.092145335013112, + "learning_rate": 4.125156328855617e-06, + "loss": 2.3491, + "step": 1217 + }, + { + "epoch": 1.8053392658509455, + "grad_norm": 7.705949322968855, + "learning_rate": 4.116669350296122e-06, + "loss": 2.8141, + "step": 1218 + }, + { + "epoch": 1.806822395253986, + "grad_norm": 7.902539588383633, + "learning_rate": 4.10818499792692e-06, + "loss": 2.6794, + "step": 1219 + }, + { + "epoch": 1.8083055246570263, + "grad_norm": 7.636114953039626, + "learning_rate": 4.099703296972458e-06, + "loss": 2.532, + "step": 1220 + }, + { + "epoch": 1.8097886540600667, + "grad_norm": 8.165882599643336, + "learning_rate": 4.0912242726493e-06, + "loss": 2.7769, + "step": 1221 + }, + { + "epoch": 1.8112717834631071, + "grad_norm": 6.813496364540669, + "learning_rate": 4.082747950166042e-06, + "loss": 2.621, + "step": 1222 + }, + { + "epoch": 1.8127549128661475, + "grad_norm": 8.779386669514873, + "learning_rate": 4.074274354723256e-06, + "loss": 2.785, + "step": 1223 + }, + { + "epoch": 1.814238042269188, + "grad_norm": 7.432275481889324, + "learning_rate": 4.065803511513407e-06, + "loss": 2.6166, + "step": 1224 + }, + { + "epoch": 1.8157211716722284, + "grad_norm": 8.489193682320346, + "learning_rate": 4.057335445720772e-06, + "loss": 2.781, + "step": 1225 + }, + { + "epoch": 1.817204301075269, + "grad_norm": 8.018006146803135, + "learning_rate": 4.048870182521374e-06, + "loss": 2.7177, + "step": 1226 + }, + { + "epoch": 1.8186874304783092, + "grad_norm": 7.260604113112304, + "learning_rate": 4.0404077470829065e-06, + "loss": 2.5977, + "step": 1227 + }, + { + "epoch": 1.8201705598813498, + "grad_norm": 6.931350334096077, + "learning_rate": 4.031948164564647e-06, + "loss": 2.6502, + "step": 1228 + }, + { + "epoch": 1.82165368928439, + "grad_norm": 7.321354122203878, + "learning_rate": 4.023491460117399e-06, + "loss": 2.5243, + "step": 1229 + }, + { + "epoch": 1.8231368186874306, + "grad_norm": 7.537440163073761, + "learning_rate": 4.015037658883408e-06, + "loss": 2.6853, + "step": 1230 + }, + { + "epoch": 1.8246199480904708, + "grad_norm": 7.101568093656647, + "learning_rate": 4.006586785996285e-06, + "loss": 2.5935, + "step": 1231 + }, + { + "epoch": 1.8261030774935114, + "grad_norm": 8.04671634608455, + "learning_rate": 3.998138866580941e-06, + "loss": 2.6726, + "step": 1232 + }, + { + "epoch": 1.8275862068965516, + "grad_norm": 7.070327753165328, + "learning_rate": 3.989693925753495e-06, + "loss": 2.3579, + "step": 1233 + }, + { + "epoch": 1.8290693362995922, + "grad_norm": 7.2454530784815585, + "learning_rate": 3.981251988621224e-06, + "loss": 2.6039, + "step": 1234 + }, + { + "epoch": 1.8305524657026324, + "grad_norm": 7.389379290010657, + "learning_rate": 3.972813080282468e-06, + "loss": 2.5277, + "step": 1235 + }, + { + "epoch": 1.832035595105673, + "grad_norm": 7.714735431363324, + "learning_rate": 3.96437722582656e-06, + "loss": 2.5846, + "step": 1236 + }, + { + "epoch": 1.8335187245087132, + "grad_norm": 7.180060940199202, + "learning_rate": 3.955944450333756e-06, + "loss": 2.6733, + "step": 1237 + }, + { + "epoch": 1.8350018539117539, + "grad_norm": 7.876841272613581, + "learning_rate": 3.947514778875162e-06, + "loss": 2.5973, + "step": 1238 + }, + { + "epoch": 1.8364849833147943, + "grad_norm": 7.3545997486726415, + "learning_rate": 3.9390882365126474e-06, + "loss": 2.722, + "step": 1239 + }, + { + "epoch": 1.8379681127178347, + "grad_norm": 7.425044579658441, + "learning_rate": 3.930664848298788e-06, + "loss": 2.5654, + "step": 1240 + }, + { + "epoch": 1.839451242120875, + "grad_norm": 8.033294582227041, + "learning_rate": 3.922244639276773e-06, + "loss": 2.5454, + "step": 1241 + }, + { + "epoch": 1.8409343715239155, + "grad_norm": 7.084413184724079, + "learning_rate": 3.913827634480346e-06, + "loss": 2.5445, + "step": 1242 + }, + { + "epoch": 1.8424175009269559, + "grad_norm": 8.15219819125829, + "learning_rate": 3.905413858933723e-06, + "loss": 2.6087, + "step": 1243 + }, + { + "epoch": 1.8439006303299963, + "grad_norm": 7.275750581695436, + "learning_rate": 3.8970033376515156e-06, + "loss": 2.5908, + "step": 1244 + }, + { + "epoch": 1.8453837597330367, + "grad_norm": 7.6770247075529925, + "learning_rate": 3.8885960956386665e-06, + "loss": 2.6803, + "step": 1245 + }, + { + "epoch": 1.846866889136077, + "grad_norm": 7.339499134655671, + "learning_rate": 3.880192157890365e-06, + "loss": 2.6786, + "step": 1246 + }, + { + "epoch": 1.8483500185391175, + "grad_norm": 7.745824844017759, + "learning_rate": 3.871791549391976e-06, + "loss": 2.4404, + "step": 1247 + }, + { + "epoch": 1.849833147942158, + "grad_norm": 7.662872366358528, + "learning_rate": 3.863394295118967e-06, + "loss": 2.7148, + "step": 1248 + }, + { + "epoch": 1.8513162773451983, + "grad_norm": 7.207658364949551, + "learning_rate": 3.855000420036837e-06, + "loss": 2.5597, + "step": 1249 + }, + { + "epoch": 1.8527994067482387, + "grad_norm": 6.688682127812666, + "learning_rate": 3.846609949101033e-06, + "loss": 2.6213, + "step": 1250 + }, + { + "epoch": 1.8542825361512794, + "grad_norm": 7.871241563544073, + "learning_rate": 3.838222907256884e-06, + "loss": 2.6759, + "step": 1251 + }, + { + "epoch": 1.8557656655543195, + "grad_norm": 7.066362384657902, + "learning_rate": 3.829839319439524e-06, + "loss": 2.456, + "step": 1252 + }, + { + "epoch": 1.8572487949573602, + "grad_norm": 7.211435447655129, + "learning_rate": 3.821459210573817e-06, + "loss": 2.7404, + "step": 1253 + }, + { + "epoch": 1.8587319243604004, + "grad_norm": 7.945751734434838, + "learning_rate": 3.8130826055742874e-06, + "loss": 2.6078, + "step": 1254 + }, + { + "epoch": 1.860215053763441, + "grad_norm": 7.205344434394137, + "learning_rate": 3.804709529345036e-06, + "loss": 2.5008, + "step": 1255 + }, + { + "epoch": 1.8616981831664812, + "grad_norm": 7.442652394646696, + "learning_rate": 3.7963400067796774e-06, + "loss": 2.7075, + "step": 1256 + }, + { + "epoch": 1.8631813125695218, + "grad_norm": 7.718083820292284, + "learning_rate": 3.7879740627612604e-06, + "loss": 2.5876, + "step": 1257 + }, + { + "epoch": 1.864664441972562, + "grad_norm": 7.670171639028288, + "learning_rate": 3.7796117221621926e-06, + "loss": 2.6672, + "step": 1258 + }, + { + "epoch": 1.8661475713756026, + "grad_norm": 7.026316266421653, + "learning_rate": 3.7712530098441703e-06, + "loss": 2.7322, + "step": 1259 + }, + { + "epoch": 1.8676307007786428, + "grad_norm": 7.243791485044106, + "learning_rate": 3.7628979506581035e-06, + "loss": 2.5757, + "step": 1260 + }, + { + "epoch": 1.8691138301816834, + "grad_norm": 6.690846241968255, + "learning_rate": 3.7545465694440363e-06, + "loss": 2.464, + "step": 1261 + }, + { + "epoch": 1.8705969595847236, + "grad_norm": 7.364190477441397, + "learning_rate": 3.746198891031084e-06, + "loss": 2.5045, + "step": 1262 + }, + { + "epoch": 1.8720800889877642, + "grad_norm": 7.4703690541571195, + "learning_rate": 3.73785494023735e-06, + "loss": 2.5445, + "step": 1263 + }, + { + "epoch": 1.8735632183908046, + "grad_norm": 7.102833763176907, + "learning_rate": 3.729514741869855e-06, + "loss": 2.5755, + "step": 1264 + }, + { + "epoch": 1.875046347793845, + "grad_norm": 7.498847551686389, + "learning_rate": 3.721178320724468e-06, + "loss": 2.7397, + "step": 1265 + }, + { + "epoch": 1.8765294771968855, + "grad_norm": 8.570064202991226, + "learning_rate": 3.7128457015858198e-06, + "loss": 2.6915, + "step": 1266 + }, + { + "epoch": 1.8780126065999259, + "grad_norm": 7.471993207213762, + "learning_rate": 3.704516909227246e-06, + "loss": 2.5191, + "step": 1267 + }, + { + "epoch": 1.8794957360029663, + "grad_norm": 7.574309081279895, + "learning_rate": 3.6961919684107017e-06, + "loss": 2.595, + "step": 1268 + }, + { + "epoch": 1.8809788654060067, + "grad_norm": 8.133584889263567, + "learning_rate": 3.687870903886688e-06, + "loss": 2.6021, + "step": 1269 + }, + { + "epoch": 1.882461994809047, + "grad_norm": 7.434995866967206, + "learning_rate": 3.679553740394186e-06, + "loss": 2.7237, + "step": 1270 + }, + { + "epoch": 1.8839451242120875, + "grad_norm": 7.480050277443192, + "learning_rate": 3.6712405026605792e-06, + "loss": 2.5668, + "step": 1271 + }, + { + "epoch": 1.885428253615128, + "grad_norm": 7.818857504982332, + "learning_rate": 3.662931215401574e-06, + "loss": 2.685, + "step": 1272 + }, + { + "epoch": 1.8869113830181683, + "grad_norm": 7.896035824531997, + "learning_rate": 3.6546259033211396e-06, + "loss": 2.5518, + "step": 1273 + }, + { + "epoch": 1.8883945124212087, + "grad_norm": 7.733893717989194, + "learning_rate": 3.646324591111419e-06, + "loss": 2.6719, + "step": 1274 + }, + { + "epoch": 1.889877641824249, + "grad_norm": 7.582429041439558, + "learning_rate": 3.638027303452668e-06, + "loss": 2.6027, + "step": 1275 + }, + { + "epoch": 1.8913607712272897, + "grad_norm": 7.847585341500606, + "learning_rate": 3.6297340650131785e-06, + "loss": 2.616, + "step": 1276 + }, + { + "epoch": 1.89284390063033, + "grad_norm": 7.0213742123279514, + "learning_rate": 3.6214449004491994e-06, + "loss": 2.6475, + "step": 1277 + }, + { + "epoch": 1.8943270300333706, + "grad_norm": 7.952045179723355, + "learning_rate": 3.61315983440487e-06, + "loss": 2.5253, + "step": 1278 + }, + { + "epoch": 1.8958101594364107, + "grad_norm": 7.204182403859123, + "learning_rate": 3.604878891512146e-06, + "loss": 2.6387, + "step": 1279 + }, + { + "epoch": 1.8972932888394514, + "grad_norm": 7.463489402462066, + "learning_rate": 3.596602096390721e-06, + "loss": 2.5541, + "step": 1280 + }, + { + "epoch": 1.8987764182424915, + "grad_norm": 7.848473990679133, + "learning_rate": 3.5883294736479612e-06, + "loss": 2.7145, + "step": 1281 + }, + { + "epoch": 1.9002595476455322, + "grad_norm": 7.049059089379158, + "learning_rate": 3.580061047878827e-06, + "loss": 2.5183, + "step": 1282 + }, + { + "epoch": 1.9017426770485724, + "grad_norm": 7.483251935905954, + "learning_rate": 3.571796843665796e-06, + "loss": 2.7667, + "step": 1283 + }, + { + "epoch": 1.903225806451613, + "grad_norm": 7.224632092493819, + "learning_rate": 3.563536885578803e-06, + "loss": 2.6747, + "step": 1284 + }, + { + "epoch": 1.9047089358546532, + "grad_norm": 8.017525106224094, + "learning_rate": 3.5552811981751523e-06, + "loss": 2.6674, + "step": 1285 + }, + { + "epoch": 1.9061920652576938, + "grad_norm": 7.687107413083308, + "learning_rate": 3.5470298059994545e-06, + "loss": 2.8062, + "step": 1286 + }, + { + "epoch": 1.907675194660734, + "grad_norm": 7.900557461709359, + "learning_rate": 3.5387827335835513e-06, + "loss": 2.706, + "step": 1287 + }, + { + "epoch": 1.9091583240637746, + "grad_norm": 7.997153409150714, + "learning_rate": 3.530540005446437e-06, + "loss": 2.6462, + "step": 1288 + }, + { + "epoch": 1.910641453466815, + "grad_norm": 7.721516719431432, + "learning_rate": 3.5223016460941926e-06, + "loss": 2.6045, + "step": 1289 + }, + { + "epoch": 1.9121245828698554, + "grad_norm": 6.905592308199384, + "learning_rate": 3.5140676800199138e-06, + "loss": 2.7303, + "step": 1290 + }, + { + "epoch": 1.9136077122728958, + "grad_norm": 7.160931046181157, + "learning_rate": 3.5058381317036285e-06, + "loss": 2.6534, + "step": 1291 + }, + { + "epoch": 1.9150908416759362, + "grad_norm": 7.510433686847534, + "learning_rate": 3.4976130256122374e-06, + "loss": 2.6866, + "step": 1292 + }, + { + "epoch": 1.9165739710789766, + "grad_norm": 6.770834489186918, + "learning_rate": 3.489392386199425e-06, + "loss": 2.5237, + "step": 1293 + }, + { + "epoch": 1.918057100482017, + "grad_norm": 7.480114397938108, + "learning_rate": 3.4811762379056047e-06, + "loss": 2.6558, + "step": 1294 + }, + { + "epoch": 1.9195402298850575, + "grad_norm": 7.092126280073772, + "learning_rate": 3.472964605157837e-06, + "loss": 2.5546, + "step": 1295 + }, + { + "epoch": 1.9210233592880979, + "grad_norm": 7.786423315662889, + "learning_rate": 3.46475751236975e-06, + "loss": 2.8502, + "step": 1296 + }, + { + "epoch": 1.9225064886911383, + "grad_norm": 6.694843798663476, + "learning_rate": 3.4565549839414813e-06, + "loss": 2.6202, + "step": 1297 + }, + { + "epoch": 1.9239896180941787, + "grad_norm": 7.681374698752166, + "learning_rate": 3.448357044259597e-06, + "loss": 2.7199, + "step": 1298 + }, + { + "epoch": 1.925472747497219, + "grad_norm": 7.689405444256508, + "learning_rate": 3.4401637176970176e-06, + "loss": 2.5886, + "step": 1299 + }, + { + "epoch": 1.9269558769002595, + "grad_norm": 7.638029366136586, + "learning_rate": 3.431975028612952e-06, + "loss": 2.6293, + "step": 1300 + }, + { + "epoch": 1.9284390063033001, + "grad_norm": 8.182366032781445, + "learning_rate": 3.423791001352823e-06, + "loss": 2.7446, + "step": 1301 + }, + { + "epoch": 1.9299221357063403, + "grad_norm": 7.867001641273275, + "learning_rate": 3.415611660248186e-06, + "loss": 2.6613, + "step": 1302 + }, + { + "epoch": 1.931405265109381, + "grad_norm": 7.241419773629577, + "learning_rate": 3.407437029616672e-06, + "loss": 2.6755, + "step": 1303 + }, + { + "epoch": 1.9328883945124211, + "grad_norm": 7.635718219552529, + "learning_rate": 3.399267133761903e-06, + "loss": 2.6581, + "step": 1304 + }, + { + "epoch": 1.9343715239154617, + "grad_norm": 7.959386426878792, + "learning_rate": 3.3911019969734273e-06, + "loss": 2.6612, + "step": 1305 + }, + { + "epoch": 1.935854653318502, + "grad_norm": 7.638950163877713, + "learning_rate": 3.382941643526644e-06, + "loss": 2.5616, + "step": 1306 + }, + { + "epoch": 1.9373377827215426, + "grad_norm": 7.487017315061391, + "learning_rate": 3.3747860976827256e-06, + "loss": 2.6958, + "step": 1307 + }, + { + "epoch": 1.9388209121245827, + "grad_norm": 6.940307864342042, + "learning_rate": 3.3666353836885592e-06, + "loss": 2.6622, + "step": 1308 + }, + { + "epoch": 1.9403040415276234, + "grad_norm": 7.559944094516612, + "learning_rate": 3.3584895257766637e-06, + "loss": 2.6535, + "step": 1309 + }, + { + "epoch": 1.9417871709306636, + "grad_norm": 7.2542314348654156, + "learning_rate": 3.3503485481651166e-06, + "loss": 2.4791, + "step": 1310 + }, + { + "epoch": 1.9432703003337042, + "grad_norm": 7.038002126452892, + "learning_rate": 3.3422124750574902e-06, + "loss": 2.649, + "step": 1311 + }, + { + "epoch": 1.9447534297367444, + "grad_norm": 7.621147042132722, + "learning_rate": 3.3340813306427767e-06, + "loss": 2.532, + "step": 1312 + }, + { + "epoch": 1.946236559139785, + "grad_norm": 7.162371268820004, + "learning_rate": 3.3259551390953103e-06, + "loss": 2.5968, + "step": 1313 + }, + { + "epoch": 1.9477196885428254, + "grad_norm": 7.8190319510904684, + "learning_rate": 3.3178339245747047e-06, + "loss": 2.642, + "step": 1314 + }, + { + "epoch": 1.9492028179458658, + "grad_norm": 7.3148250743763255, + "learning_rate": 3.309717711225772e-06, + "loss": 2.5992, + "step": 1315 + }, + { + "epoch": 1.9506859473489062, + "grad_norm": 7.332648973508316, + "learning_rate": 3.3016065231784587e-06, + "loss": 2.7094, + "step": 1316 + }, + { + "epoch": 1.9521690767519466, + "grad_norm": 7.613768873332103, + "learning_rate": 3.2935003845477724e-06, + "loss": 2.4015, + "step": 1317 + }, + { + "epoch": 1.953652206154987, + "grad_norm": 7.794427725154595, + "learning_rate": 3.2853993194337034e-06, + "loss": 2.7922, + "step": 1318 + }, + { + "epoch": 1.9551353355580274, + "grad_norm": 7.9404422904629035, + "learning_rate": 3.2773033519211627e-06, + "loss": 2.637, + "step": 1319 + }, + { + "epoch": 1.9566184649610678, + "grad_norm": 7.0095150128221695, + "learning_rate": 3.269212506079905e-06, + "loss": 2.5607, + "step": 1320 + }, + { + "epoch": 1.9581015943641082, + "grad_norm": 7.759235530340331, + "learning_rate": 3.2611268059644535e-06, + "loss": 2.5514, + "step": 1321 + }, + { + "epoch": 1.9595847237671487, + "grad_norm": 7.211479213008056, + "learning_rate": 3.2530462756140395e-06, + "loss": 2.7228, + "step": 1322 + }, + { + "epoch": 1.961067853170189, + "grad_norm": 8.464381556947925, + "learning_rate": 3.2449709390525232e-06, + "loss": 2.739, + "step": 1323 + }, + { + "epoch": 1.9625509825732295, + "grad_norm": 6.7935503399616755, + "learning_rate": 3.2369008202883168e-06, + "loss": 2.4699, + "step": 1324 + }, + { + "epoch": 1.9640341119762699, + "grad_norm": 7.302304307878529, + "learning_rate": 3.228835943314328e-06, + "loss": 2.5402, + "step": 1325 + }, + { + "epoch": 1.9655172413793105, + "grad_norm": 7.550365033915087, + "learning_rate": 3.2207763321078737e-06, + "loss": 2.6494, + "step": 1326 + }, + { + "epoch": 1.9670003707823507, + "grad_norm": 7.8751054819945505, + "learning_rate": 3.2127220106306213e-06, + "loss": 2.8039, + "step": 1327 + }, + { + "epoch": 1.9684835001853913, + "grad_norm": 8.371956365094903, + "learning_rate": 3.204673002828509e-06, + "loss": 2.6181, + "step": 1328 + }, + { + "epoch": 1.9699666295884315, + "grad_norm": 6.919923111441938, + "learning_rate": 3.1966293326316737e-06, + "loss": 2.4934, + "step": 1329 + }, + { + "epoch": 1.9714497589914721, + "grad_norm": 7.0997050356533205, + "learning_rate": 3.1885910239543884e-06, + "loss": 2.6097, + "step": 1330 + }, + { + "epoch": 1.9729328883945123, + "grad_norm": 8.14435401868006, + "learning_rate": 3.1805581006949856e-06, + "loss": 2.7884, + "step": 1331 + }, + { + "epoch": 1.974416017797553, + "grad_norm": 7.228319631271579, + "learning_rate": 3.172530586735782e-06, + "loss": 2.7414, + "step": 1332 + }, + { + "epoch": 1.9758991472005931, + "grad_norm": 7.892661438906289, + "learning_rate": 3.164508505943017e-06, + "loss": 2.4393, + "step": 1333 + }, + { + "epoch": 1.9773822766036337, + "grad_norm": 6.983193564895386, + "learning_rate": 3.156491882166778e-06, + "loss": 2.4519, + "step": 1334 + }, + { + "epoch": 1.978865406006674, + "grad_norm": 8.169839950912372, + "learning_rate": 3.14848073924092e-06, + "loss": 2.6865, + "step": 1335 + }, + { + "epoch": 1.9803485354097146, + "grad_norm": 7.658615933771062, + "learning_rate": 3.1404751009830124e-06, + "loss": 2.5854, + "step": 1336 + }, + { + "epoch": 1.981831664812755, + "grad_norm": 7.959442976792441, + "learning_rate": 3.1324749911942536e-06, + "loss": 2.5354, + "step": 1337 + }, + { + "epoch": 1.9833147942157954, + "grad_norm": 7.038959472632175, + "learning_rate": 3.124480433659408e-06, + "loss": 2.5831, + "step": 1338 + }, + { + "epoch": 1.9847979236188358, + "grad_norm": 7.936259436732476, + "learning_rate": 3.116491452146733e-06, + "loss": 2.6868, + "step": 1339 + }, + { + "epoch": 1.9862810530218762, + "grad_norm": 7.296894517906489, + "learning_rate": 3.108508070407905e-06, + "loss": 2.7539, + "step": 1340 + }, + { + "epoch": 1.9877641824249166, + "grad_norm": 7.763119465644392, + "learning_rate": 3.100530312177956e-06, + "loss": 2.7131, + "step": 1341 + }, + { + "epoch": 1.989247311827957, + "grad_norm": 7.322750809704654, + "learning_rate": 3.0925582011751987e-06, + "loss": 2.6178, + "step": 1342 + }, + { + "epoch": 1.9907304412309974, + "grad_norm": 7.376024015700687, + "learning_rate": 3.08459176110115e-06, + "loss": 2.612, + "step": 1343 + }, + { + "epoch": 1.9922135706340378, + "grad_norm": 7.383281532807992, + "learning_rate": 3.0766310156404754e-06, + "loss": 2.5847, + "step": 1344 + }, + { + "epoch": 1.9936967000370782, + "grad_norm": 7.836338310185923, + "learning_rate": 3.068675988460906e-06, + "loss": 2.8174, + "step": 1345 + }, + { + "epoch": 1.9951798294401186, + "grad_norm": 8.3264148699153, + "learning_rate": 3.0607267032131704e-06, + "loss": 2.7224, + "step": 1346 + }, + { + "epoch": 1.996662958843159, + "grad_norm": 8.052358272838813, + "learning_rate": 3.0527831835309307e-06, + "loss": 2.8042, + "step": 1347 + }, + { + "epoch": 1.9981460882461994, + "grad_norm": 7.384122229549461, + "learning_rate": 3.0448454530307025e-06, + "loss": 2.4856, + "step": 1348 + }, + { + "epoch": 1.9996292176492398, + "grad_norm": 7.817304073029412, + "learning_rate": 3.036913535311793e-06, + "loss": 2.7195, + "step": 1349 + }, + { + "epoch": 2.0, + "grad_norm": 7.817304073029412, + "learning_rate": 3.0289874539562285e-06, + "loss": 2.6673, + "step": 1350 + }, + { + "epoch": 2.0014831294030406, + "grad_norm": 16.014973965320447, + "learning_rate": 3.0210672325286806e-06, + "loss": 2.0483, + "step": 1351 + }, + { + "epoch": 2.002966258806081, + "grad_norm": 6.4523173217667615, + "learning_rate": 3.0131528945764e-06, + "loss": 1.8577, + "step": 1352 + }, + { + "epoch": 2.0044493882091214, + "grad_norm": 6.895140523882464, + "learning_rate": 3.005244463629149e-06, + "loss": 1.8756, + "step": 1353 + }, + { + "epoch": 2.0059325176121616, + "grad_norm": 6.535392234363076, + "learning_rate": 2.997341963199121e-06, + "loss": 1.8968, + "step": 1354 + }, + { + "epoch": 2.0074156470152023, + "grad_norm": 6.612943634853017, + "learning_rate": 2.989445416780884e-06, + "loss": 2.0493, + "step": 1355 + }, + { + "epoch": 2.0088987764182424, + "grad_norm": 6.666137218776649, + "learning_rate": 2.9815548478513034e-06, + "loss": 1.7742, + "step": 1356 + }, + { + "epoch": 2.010381905821283, + "grad_norm": 7.063807280579233, + "learning_rate": 2.973670279869468e-06, + "loss": 1.9827, + "step": 1357 + }, + { + "epoch": 2.0118650352243233, + "grad_norm": 6.616394236942342, + "learning_rate": 2.9657917362766326e-06, + "loss": 2.0236, + "step": 1358 + }, + { + "epoch": 2.013348164627364, + "grad_norm": 6.506187348200441, + "learning_rate": 2.9579192404961356e-06, + "loss": 1.8852, + "step": 1359 + }, + { + "epoch": 2.014831294030404, + "grad_norm": 6.968352014868156, + "learning_rate": 2.950052815933338e-06, + "loss": 1.7107, + "step": 1360 + }, + { + "epoch": 2.0163144234334447, + "grad_norm": 6.922979316177255, + "learning_rate": 2.9421924859755525e-06, + "loss": 2.0068, + "step": 1361 + }, + { + "epoch": 2.017797552836485, + "grad_norm": 6.731919015048393, + "learning_rate": 2.934338273991965e-06, + "loss": 1.7075, + "step": 1362 + }, + { + "epoch": 2.0192806822395255, + "grad_norm": 7.184891226427115, + "learning_rate": 2.9264902033335798e-06, + "loss": 1.818, + "step": 1363 + }, + { + "epoch": 2.0207638116425657, + "grad_norm": 7.913040399813153, + "learning_rate": 2.918648297333141e-06, + "loss": 1.8655, + "step": 1364 + }, + { + "epoch": 2.0222469410456063, + "grad_norm": 8.4615838590331, + "learning_rate": 2.910812579305059e-06, + "loss": 1.959, + "step": 1365 + }, + { + "epoch": 2.0237300704486465, + "grad_norm": 7.956890790426695, + "learning_rate": 2.9029830725453545e-06, + "loss": 1.8579, + "step": 1366 + }, + { + "epoch": 2.025213199851687, + "grad_norm": 7.563356100476528, + "learning_rate": 2.895159800331584e-06, + "loss": 1.8131, + "step": 1367 + }, + { + "epoch": 2.0266963292547273, + "grad_norm": 7.963260220825928, + "learning_rate": 2.8873427859227543e-06, + "loss": 1.6161, + "step": 1368 + }, + { + "epoch": 2.028179458657768, + "grad_norm": 8.848814300252274, + "learning_rate": 2.8795320525592822e-06, + "loss": 1.8698, + "step": 1369 + }, + { + "epoch": 2.029662588060808, + "grad_norm": 7.979227699429963, + "learning_rate": 2.8717276234629043e-06, + "loss": 1.7978, + "step": 1370 + }, + { + "epoch": 2.0311457174638488, + "grad_norm": 9.137603090579846, + "learning_rate": 2.8639295218366115e-06, + "loss": 2.0976, + "step": 1371 + }, + { + "epoch": 2.032628846866889, + "grad_norm": 7.804929503372267, + "learning_rate": 2.856137770864592e-06, + "loss": 1.8697, + "step": 1372 + }, + { + "epoch": 2.0341119762699296, + "grad_norm": 8.660884112238922, + "learning_rate": 2.8483523937121405e-06, + "loss": 1.7313, + "step": 1373 + }, + { + "epoch": 2.0355951056729698, + "grad_norm": 7.987392685248076, + "learning_rate": 2.8405734135256136e-06, + "loss": 1.7754, + "step": 1374 + }, + { + "epoch": 2.0370782350760104, + "grad_norm": 7.910338683015856, + "learning_rate": 2.8328008534323436e-06, + "loss": 1.7415, + "step": 1375 + }, + { + "epoch": 2.038561364479051, + "grad_norm": 7.424939468208836, + "learning_rate": 2.8250347365405737e-06, + "loss": 1.9033, + "step": 1376 + }, + { + "epoch": 2.040044493882091, + "grad_norm": 8.20129204837157, + "learning_rate": 2.8172750859393975e-06, + "loss": 1.707, + "step": 1377 + }, + { + "epoch": 2.041527623285132, + "grad_norm": 7.323147430309461, + "learning_rate": 2.8095219246986792e-06, + "loss": 1.618, + "step": 1378 + }, + { + "epoch": 2.043010752688172, + "grad_norm": 7.3672590910090925, + "learning_rate": 2.80177527586899e-06, + "loss": 1.6461, + "step": 1379 + }, + { + "epoch": 2.0444938820912126, + "grad_norm": 7.393617940353629, + "learning_rate": 2.794035162481541e-06, + "loss": 1.8887, + "step": 1380 + }, + { + "epoch": 2.045977011494253, + "grad_norm": 7.677688437743355, + "learning_rate": 2.78630160754811e-06, + "loss": 1.6908, + "step": 1381 + }, + { + "epoch": 2.0474601408972934, + "grad_norm": 7.892503846566394, + "learning_rate": 2.7785746340609826e-06, + "loss": 1.6825, + "step": 1382 + }, + { + "epoch": 2.0489432703003336, + "grad_norm": 6.712684549665773, + "learning_rate": 2.7708542649928705e-06, + "loss": 1.7337, + "step": 1383 + }, + { + "epoch": 2.0504263997033743, + "grad_norm": 7.310128934090708, + "learning_rate": 2.7631405232968524e-06, + "loss": 1.9775, + "step": 1384 + }, + { + "epoch": 2.0519095291064144, + "grad_norm": 7.752382790238439, + "learning_rate": 2.7554334319063064e-06, + "loss": 2.0155, + "step": 1385 + }, + { + "epoch": 2.053392658509455, + "grad_norm": 7.100481607725241, + "learning_rate": 2.747733013734835e-06, + "loss": 1.7972, + "step": 1386 + }, + { + "epoch": 2.0548757879124953, + "grad_norm": 7.398074471963342, + "learning_rate": 2.7400392916762024e-06, + "loss": 1.8743, + "step": 1387 + }, + { + "epoch": 2.056358917315536, + "grad_norm": 7.795220088658436, + "learning_rate": 2.732352288604263e-06, + "loss": 1.85, + "step": 1388 + }, + { + "epoch": 2.057842046718576, + "grad_norm": 7.362358964472853, + "learning_rate": 2.7246720273729e-06, + "loss": 1.9272, + "step": 1389 + }, + { + "epoch": 2.0593251761216167, + "grad_norm": 7.582125164608114, + "learning_rate": 2.7169985308159485e-06, + "loss": 2.034, + "step": 1390 + }, + { + "epoch": 2.060808305524657, + "grad_norm": 6.788880156128689, + "learning_rate": 2.709331821747133e-06, + "loss": 1.6398, + "step": 1391 + }, + { + "epoch": 2.0622914349276975, + "grad_norm": 6.979062369175105, + "learning_rate": 2.7016719229599995e-06, + "loss": 1.8832, + "step": 1392 + }, + { + "epoch": 2.0637745643307377, + "grad_norm": 8.013058747153273, + "learning_rate": 2.6940188572278426e-06, + "loss": 1.9121, + "step": 1393 + }, + { + "epoch": 2.0652576937337783, + "grad_norm": 7.1195336971010885, + "learning_rate": 2.6863726473036484e-06, + "loss": 1.8437, + "step": 1394 + }, + { + "epoch": 2.0667408231368185, + "grad_norm": 7.414462822274761, + "learning_rate": 2.6787333159200155e-06, + "loss": 1.9647, + "step": 1395 + }, + { + "epoch": 2.068223952539859, + "grad_norm": 8.575597743757303, + "learning_rate": 2.6711008857890928e-06, + "loss": 1.7838, + "step": 1396 + }, + { + "epoch": 2.0697070819428993, + "grad_norm": 7.885723088647661, + "learning_rate": 2.6634753796025146e-06, + "loss": 1.7519, + "step": 1397 + }, + { + "epoch": 2.07119021134594, + "grad_norm": 6.988161149038834, + "learning_rate": 2.6558568200313266e-06, + "loss": 1.8529, + "step": 1398 + }, + { + "epoch": 2.07267334074898, + "grad_norm": 7.707062632910955, + "learning_rate": 2.64824522972592e-06, + "loss": 1.7332, + "step": 1399 + }, + { + "epoch": 2.0741564701520208, + "grad_norm": 6.692012079946339, + "learning_rate": 2.640640631315975e-06, + "loss": 1.9119, + "step": 1400 + }, + { + "epoch": 2.0756395995550614, + "grad_norm": 7.1496946708820515, + "learning_rate": 2.63304304741037e-06, + "loss": 1.8719, + "step": 1401 + }, + { + "epoch": 2.0771227289581016, + "grad_norm": 7.56570490228474, + "learning_rate": 2.6254525005971444e-06, + "loss": 1.902, + "step": 1402 + }, + { + "epoch": 2.078605858361142, + "grad_norm": 7.989136723718025, + "learning_rate": 2.617869013443405e-06, + "loss": 1.9561, + "step": 1403 + }, + { + "epoch": 2.0800889877641824, + "grad_norm": 7.871920797992686, + "learning_rate": 2.6102926084952717e-06, + "loss": 1.8413, + "step": 1404 + }, + { + "epoch": 2.081572117167223, + "grad_norm": 7.443262563087353, + "learning_rate": 2.602723308277818e-06, + "loss": 1.7772, + "step": 1405 + }, + { + "epoch": 2.083055246570263, + "grad_norm": 7.261975383700088, + "learning_rate": 2.595161135294978e-06, + "loss": 1.9898, + "step": 1406 + }, + { + "epoch": 2.084538375973304, + "grad_norm": 7.84317915348744, + "learning_rate": 2.5876061120295116e-06, + "loss": 1.7686, + "step": 1407 + }, + { + "epoch": 2.086021505376344, + "grad_norm": 7.836385111390714, + "learning_rate": 2.5800582609429136e-06, + "loss": 1.8799, + "step": 1408 + }, + { + "epoch": 2.0875046347793846, + "grad_norm": 7.48770432611723, + "learning_rate": 2.5725176044753564e-06, + "loss": 1.8321, + "step": 1409 + }, + { + "epoch": 2.088987764182425, + "grad_norm": 8.14063862506377, + "learning_rate": 2.5649841650456263e-06, + "loss": 1.8244, + "step": 1410 + }, + { + "epoch": 2.0904708935854655, + "grad_norm": 8.190886811106466, + "learning_rate": 2.55745796505105e-06, + "loss": 1.8104, + "step": 1411 + }, + { + "epoch": 2.0919540229885056, + "grad_norm": 7.702130273223612, + "learning_rate": 2.5499390268674298e-06, + "loss": 1.7103, + "step": 1412 + }, + { + "epoch": 2.0934371523915463, + "grad_norm": 7.202275476098316, + "learning_rate": 2.5424273728489834e-06, + "loss": 1.7604, + "step": 1413 + }, + { + "epoch": 2.0949202817945864, + "grad_norm": 7.258643706300253, + "learning_rate": 2.5349230253282626e-06, + "loss": 1.8346, + "step": 1414 + }, + { + "epoch": 2.096403411197627, + "grad_norm": 8.209029826268697, + "learning_rate": 2.5274260066161083e-06, + "loss": 1.8954, + "step": 1415 + }, + { + "epoch": 2.0978865406006673, + "grad_norm": 7.76747832076686, + "learning_rate": 2.5199363390015645e-06, + "loss": 1.7294, + "step": 1416 + }, + { + "epoch": 2.099369670003708, + "grad_norm": 7.313761491748735, + "learning_rate": 2.5124540447518208e-06, + "loss": 1.796, + "step": 1417 + }, + { + "epoch": 2.100852799406748, + "grad_norm": 7.314173108927601, + "learning_rate": 2.504979146112151e-06, + "loss": 1.7872, + "step": 1418 + }, + { + "epoch": 2.1023359288097887, + "grad_norm": 7.1491685122417605, + "learning_rate": 2.4975116653058353e-06, + "loss": 1.9038, + "step": 1419 + }, + { + "epoch": 2.103819058212829, + "grad_norm": 7.145590605415172, + "learning_rate": 2.4900516245341017e-06, + "loss": 1.6991, + "step": 1420 + }, + { + "epoch": 2.1053021876158695, + "grad_norm": 7.725924448765102, + "learning_rate": 2.482599045976059e-06, + "loss": 1.8099, + "step": 1421 + }, + { + "epoch": 2.1067853170189097, + "grad_norm": 7.563418218708688, + "learning_rate": 2.4751539517886296e-06, + "loss": 1.8004, + "step": 1422 + }, + { + "epoch": 2.1082684464219503, + "grad_norm": 6.942288472021978, + "learning_rate": 2.4677163641064883e-06, + "loss": 1.7463, + "step": 1423 + }, + { + "epoch": 2.1097515758249905, + "grad_norm": 7.2720503145035, + "learning_rate": 2.4602863050419884e-06, + "loss": 1.7857, + "step": 1424 + }, + { + "epoch": 2.111234705228031, + "grad_norm": 7.460112002060626, + "learning_rate": 2.4528637966850995e-06, + "loss": 1.979, + "step": 1425 + }, + { + "epoch": 2.1127178346310718, + "grad_norm": 7.819228916486082, + "learning_rate": 2.445448861103348e-06, + "loss": 1.7678, + "step": 1426 + }, + { + "epoch": 2.114200964034112, + "grad_norm": 7.321168251053696, + "learning_rate": 2.4380415203417414e-06, + "loss": 1.868, + "step": 1427 + }, + { + "epoch": 2.1156840934371526, + "grad_norm": 7.977710376544346, + "learning_rate": 2.430641796422708e-06, + "loss": 1.985, + "step": 1428 + }, + { + "epoch": 2.1171672228401928, + "grad_norm": 7.264345609375405, + "learning_rate": 2.4232497113460303e-06, + "loss": 1.7711, + "step": 1429 + }, + { + "epoch": 2.1186503522432334, + "grad_norm": 7.565471590795656, + "learning_rate": 2.415865287088784e-06, + "loss": 1.7305, + "step": 1430 + }, + { + "epoch": 2.1201334816462736, + "grad_norm": 6.87709341437061, + "learning_rate": 2.408488545605265e-06, + "loss": 1.9808, + "step": 1431 + }, + { + "epoch": 2.121616611049314, + "grad_norm": 8.283965618292306, + "learning_rate": 2.401119508826928e-06, + "loss": 1.7894, + "step": 1432 + }, + { + "epoch": 2.1230997404523544, + "grad_norm": 8.100365592281845, + "learning_rate": 2.3937581986623226e-06, + "loss": 1.8299, + "step": 1433 + }, + { + "epoch": 2.124582869855395, + "grad_norm": 7.714363161944826, + "learning_rate": 2.386404636997024e-06, + "loss": 1.7704, + "step": 1434 + }, + { + "epoch": 2.126065999258435, + "grad_norm": 7.477438119348326, + "learning_rate": 2.379058845693577e-06, + "loss": 1.771, + "step": 1435 + }, + { + "epoch": 2.127549128661476, + "grad_norm": 6.916307208224731, + "learning_rate": 2.3717208465914193e-06, + "loss": 1.682, + "step": 1436 + }, + { + "epoch": 2.129032258064516, + "grad_norm": 6.834788649684306, + "learning_rate": 2.36439066150682e-06, + "loss": 1.6326, + "step": 1437 + }, + { + "epoch": 2.1305153874675566, + "grad_norm": 6.888927394845219, + "learning_rate": 2.357068312232827e-06, + "loss": 1.7308, + "step": 1438 + }, + { + "epoch": 2.131998516870597, + "grad_norm": 8.021997007219717, + "learning_rate": 2.3497538205391764e-06, + "loss": 1.9562, + "step": 1439 + }, + { + "epoch": 2.1334816462736375, + "grad_norm": 6.878823371822241, + "learning_rate": 2.3424472081722555e-06, + "loss": 1.8481, + "step": 1440 + }, + { + "epoch": 2.1349647756766776, + "grad_norm": 7.732136697828647, + "learning_rate": 2.3351484968550264e-06, + "loss": 1.795, + "step": 1441 + }, + { + "epoch": 2.1364479050797183, + "grad_norm": 8.082122839504352, + "learning_rate": 2.327857708286949e-06, + "loss": 1.84, + "step": 1442 + }, + { + "epoch": 2.1379310344827585, + "grad_norm": 7.279792286041523, + "learning_rate": 2.320574864143942e-06, + "loss": 1.5989, + "step": 1443 + }, + { + "epoch": 2.139414163885799, + "grad_norm": 7.550448568973027, + "learning_rate": 2.313299986078297e-06, + "loss": 1.5677, + "step": 1444 + }, + { + "epoch": 2.1408972932888393, + "grad_norm": 7.824248700314126, + "learning_rate": 2.306033095718622e-06, + "loss": 1.9895, + "step": 1445 + }, + { + "epoch": 2.14238042269188, + "grad_norm": 8.730890414605605, + "learning_rate": 2.298774214669785e-06, + "loss": 1.8535, + "step": 1446 + }, + { + "epoch": 2.14386355209492, + "grad_norm": 7.140137441070659, + "learning_rate": 2.291523364512829e-06, + "loss": 1.8336, + "step": 1447 + }, + { + "epoch": 2.1453466814979607, + "grad_norm": 8.096322401869761, + "learning_rate": 2.2842805668049323e-06, + "loss": 1.9131, + "step": 1448 + }, + { + "epoch": 2.146829810901001, + "grad_norm": 8.057921853200424, + "learning_rate": 2.2770458430793275e-06, + "loss": 1.8501, + "step": 1449 + }, + { + "epoch": 2.1483129403040415, + "grad_norm": 7.896982537229542, + "learning_rate": 2.269819214845241e-06, + "loss": 1.7048, + "step": 1450 + }, + { + "epoch": 2.149796069707082, + "grad_norm": 7.6593133439968035, + "learning_rate": 2.2626007035878377e-06, + "loss": 2.0193, + "step": 1451 + }, + { + "epoch": 2.1512791991101223, + "grad_norm": 7.410472585939682, + "learning_rate": 2.255390330768144e-06, + "loss": 1.8594, + "step": 1452 + }, + { + "epoch": 2.152762328513163, + "grad_norm": 7.34256108130581, + "learning_rate": 2.2481881178229907e-06, + "loss": 1.8478, + "step": 1453 + }, + { + "epoch": 2.154245457916203, + "grad_norm": 6.905744580025073, + "learning_rate": 2.2409940861649564e-06, + "loss": 1.743, + "step": 1454 + }, + { + "epoch": 2.1557285873192438, + "grad_norm": 7.455384680372202, + "learning_rate": 2.2338082571822822e-06, + "loss": 1.6268, + "step": 1455 + }, + { + "epoch": 2.157211716722284, + "grad_norm": 7.194740147521352, + "learning_rate": 2.226630652238836e-06, + "loss": 1.9163, + "step": 1456 + }, + { + "epoch": 2.1586948461253246, + "grad_norm": 7.677957477762842, + "learning_rate": 2.2194612926740285e-06, + "loss": 1.9974, + "step": 1457 + }, + { + "epoch": 2.1601779755283648, + "grad_norm": 8.03034641711364, + "learning_rate": 2.2123001998027543e-06, + "loss": 2.0629, + "step": 1458 + }, + { + "epoch": 2.1616611049314054, + "grad_norm": 7.858723218750868, + "learning_rate": 2.2051473949153386e-06, + "loss": 1.9272, + "step": 1459 + }, + { + "epoch": 2.1631442343344456, + "grad_norm": 7.827280347562966, + "learning_rate": 2.198002899277459e-06, + "loss": 1.6853, + "step": 1460 + }, + { + "epoch": 2.164627363737486, + "grad_norm": 7.37476099015696, + "learning_rate": 2.1908667341300923e-06, + "loss": 1.7098, + "step": 1461 + }, + { + "epoch": 2.1661104931405264, + "grad_norm": 7.972346329287338, + "learning_rate": 2.1837389206894443e-06, + "loss": 1.9553, + "step": 1462 + }, + { + "epoch": 2.167593622543567, + "grad_norm": 7.739292891069979, + "learning_rate": 2.176619480146899e-06, + "loss": 1.853, + "step": 1463 + }, + { + "epoch": 2.169076751946607, + "grad_norm": 6.966860585608495, + "learning_rate": 2.169508433668939e-06, + "loss": 1.7548, + "step": 1464 + }, + { + "epoch": 2.170559881349648, + "grad_norm": 7.017703122836418, + "learning_rate": 2.1624058023970945e-06, + "loss": 1.7406, + "step": 1465 + }, + { + "epoch": 2.172043010752688, + "grad_norm": 7.2484589058652995, + "learning_rate": 2.155311607447877e-06, + "loss": 1.9031, + "step": 1466 + }, + { + "epoch": 2.1735261401557286, + "grad_norm": 7.053836449890633, + "learning_rate": 2.1482258699127133e-06, + "loss": 1.76, + "step": 1467 + }, + { + "epoch": 2.175009269558769, + "grad_norm": 6.957172597177855, + "learning_rate": 2.141148610857893e-06, + "loss": 1.6541, + "step": 1468 + }, + { + "epoch": 2.1764923989618095, + "grad_norm": 6.501623817955983, + "learning_rate": 2.134079851324491e-06, + "loss": 1.8801, + "step": 1469 + }, + { + "epoch": 2.1779755283648496, + "grad_norm": 7.724217295465677, + "learning_rate": 2.1270196123283143e-06, + "loss": 1.8033, + "step": 1470 + }, + { + "epoch": 2.1794586577678903, + "grad_norm": 7.813925092838744, + "learning_rate": 2.1199679148598434e-06, + "loss": 1.7399, + "step": 1471 + }, + { + "epoch": 2.1809417871709305, + "grad_norm": 7.108742887977647, + "learning_rate": 2.112924779884158e-06, + "loss": 1.8027, + "step": 1472 + }, + { + "epoch": 2.182424916573971, + "grad_norm": 7.08407058272313, + "learning_rate": 2.105890228340882e-06, + "loss": 1.7681, + "step": 1473 + }, + { + "epoch": 2.1839080459770113, + "grad_norm": 8.588016507777024, + "learning_rate": 2.0988642811441255e-06, + "loss": 1.9117, + "step": 1474 + }, + { + "epoch": 2.185391175380052, + "grad_norm": 7.407560441763156, + "learning_rate": 2.091846959182408e-06, + "loss": 1.6522, + "step": 1475 + }, + { + "epoch": 2.1868743047830925, + "grad_norm": 7.244605591463643, + "learning_rate": 2.084838283318616e-06, + "loss": 2.0399, + "step": 1476 + }, + { + "epoch": 2.1883574341861327, + "grad_norm": 7.637297360516248, + "learning_rate": 2.077838274389924e-06, + "loss": 1.7792, + "step": 1477 + }, + { + "epoch": 2.1898405635891733, + "grad_norm": 7.102590576706879, + "learning_rate": 2.070846953207739e-06, + "loss": 1.6688, + "step": 1478 + }, + { + "epoch": 2.1913236929922135, + "grad_norm": 7.3612223043532605, + "learning_rate": 2.0638643405576468e-06, + "loss": 1.7342, + "step": 1479 + }, + { + "epoch": 2.192806822395254, + "grad_norm": 8.093968048471933, + "learning_rate": 2.05689045719933e-06, + "loss": 1.9002, + "step": 1480 + }, + { + "epoch": 2.1942899517982943, + "grad_norm": 7.241596291467036, + "learning_rate": 2.0499253238665284e-06, + "loss": 1.5397, + "step": 1481 + }, + { + "epoch": 2.195773081201335, + "grad_norm": 7.56084445074104, + "learning_rate": 2.0429689612669633e-06, + "loss": 1.8851, + "step": 1482 + }, + { + "epoch": 2.197256210604375, + "grad_norm": 7.492951336855607, + "learning_rate": 2.0360213900822795e-06, + "loss": 2.0266, + "step": 1483 + }, + { + "epoch": 2.1987393400074158, + "grad_norm": 7.290366064876157, + "learning_rate": 2.029082630967989e-06, + "loss": 1.5286, + "step": 1484 + }, + { + "epoch": 2.200222469410456, + "grad_norm": 6.78030650930049, + "learning_rate": 2.0221527045534e-06, + "loss": 1.6909, + "step": 1485 + }, + { + "epoch": 2.2017055988134966, + "grad_norm": 7.359547260761112, + "learning_rate": 2.0152316314415602e-06, + "loss": 1.9231, + "step": 1486 + }, + { + "epoch": 2.2031887282165368, + "grad_norm": 7.073047357759154, + "learning_rate": 2.008319432209205e-06, + "loss": 1.8154, + "step": 1487 + }, + { + "epoch": 2.2046718576195774, + "grad_norm": 7.315809539659785, + "learning_rate": 2.0014161274066717e-06, + "loss": 1.7426, + "step": 1488 + }, + { + "epoch": 2.2061549870226176, + "grad_norm": 6.934648321727505, + "learning_rate": 1.994521737557869e-06, + "loss": 1.9311, + "step": 1489 + }, + { + "epoch": 2.207638116425658, + "grad_norm": 7.348378706082327, + "learning_rate": 1.9876362831601932e-06, + "loss": 1.854, + "step": 1490 + }, + { + "epoch": 2.2091212458286984, + "grad_norm": 8.091097393021004, + "learning_rate": 1.9807597846844737e-06, + "loss": 1.6828, + "step": 1491 + }, + { + "epoch": 2.210604375231739, + "grad_norm": 7.234874033723834, + "learning_rate": 1.9738922625749206e-06, + "loss": 1.693, + "step": 1492 + }, + { + "epoch": 2.212087504634779, + "grad_norm": 7.385561015186944, + "learning_rate": 1.9670337372490504e-06, + "loss": 1.7161, + "step": 1493 + }, + { + "epoch": 2.21357063403782, + "grad_norm": 7.351091877641467, + "learning_rate": 1.9601842290976343e-06, + "loss": 1.8896, + "step": 1494 + }, + { + "epoch": 2.21505376344086, + "grad_norm": 7.496137956791435, + "learning_rate": 1.9533437584846317e-06, + "loss": 1.8283, + "step": 1495 + }, + { + "epoch": 2.2165368928439007, + "grad_norm": 7.074034445979626, + "learning_rate": 1.9465123457471395e-06, + "loss": 1.8322, + "step": 1496 + }, + { + "epoch": 2.2180200222469413, + "grad_norm": 8.205304135924594, + "learning_rate": 1.939690011195319e-06, + "loss": 2.1031, + "step": 1497 + }, + { + "epoch": 2.2195031516499815, + "grad_norm": 7.416276349852843, + "learning_rate": 1.9328767751123452e-06, + "loss": 1.7589, + "step": 1498 + }, + { + "epoch": 2.2209862810530216, + "grad_norm": 7.210077120378627, + "learning_rate": 1.9260726577543378e-06, + "loss": 1.7023, + "step": 1499 + }, + { + "epoch": 2.2224694104560623, + "grad_norm": 7.787286605194539, + "learning_rate": 1.9192776793503134e-06, + "loss": 1.9261, + "step": 1500 + }, + { + "epoch": 2.223952539859103, + "grad_norm": 7.670024035170145, + "learning_rate": 1.9124918601021124e-06, + "loss": 1.7858, + "step": 1501 + }, + { + "epoch": 2.225435669262143, + "grad_norm": 8.168267178967204, + "learning_rate": 1.9057152201843472e-06, + "loss": 1.893, + "step": 1502 + }, + { + "epoch": 2.2269187986651837, + "grad_norm": 7.5782014322666305, + "learning_rate": 1.8989477797443363e-06, + "loss": 1.9052, + "step": 1503 + }, + { + "epoch": 2.228401928068224, + "grad_norm": 7.482203878102337, + "learning_rate": 1.8921895589020527e-06, + "loss": 1.8414, + "step": 1504 + }, + { + "epoch": 2.2298850574712645, + "grad_norm": 7.886796839130509, + "learning_rate": 1.8854405777500556e-06, + "loss": 1.6097, + "step": 1505 + }, + { + "epoch": 2.2313681868743047, + "grad_norm": 6.746760476574444, + "learning_rate": 1.8787008563534326e-06, + "loss": 1.7985, + "step": 1506 + }, + { + "epoch": 2.2328513162773453, + "grad_norm": 6.5836149911504664, + "learning_rate": 1.8719704147497492e-06, + "loss": 1.8414, + "step": 1507 + }, + { + "epoch": 2.2343344456803855, + "grad_norm": 7.439892318185957, + "learning_rate": 1.865249272948969e-06, + "loss": 1.9165, + "step": 1508 + }, + { + "epoch": 2.235817575083426, + "grad_norm": 7.893246016804852, + "learning_rate": 1.8585374509334193e-06, + "loss": 1.7786, + "step": 1509 + }, + { + "epoch": 2.2373007044864663, + "grad_norm": 7.355778406664435, + "learning_rate": 1.8518349686577113e-06, + "loss": 1.9079, + "step": 1510 + }, + { + "epoch": 2.238783833889507, + "grad_norm": 7.696219838179226, + "learning_rate": 1.845141846048691e-06, + "loss": 1.7974, + "step": 1511 + }, + { + "epoch": 2.240266963292547, + "grad_norm": 7.501200633024081, + "learning_rate": 1.8384581030053806e-06, + "loss": 1.8916, + "step": 1512 + }, + { + "epoch": 2.241750092695588, + "grad_norm": 7.129870008370687, + "learning_rate": 1.8317837593989085e-06, + "loss": 1.6172, + "step": 1513 + }, + { + "epoch": 2.243233222098628, + "grad_norm": 7.477793167974658, + "learning_rate": 1.8251188350724653e-06, + "loss": 1.8495, + "step": 1514 + }, + { + "epoch": 2.2447163515016686, + "grad_norm": 7.734170294828016, + "learning_rate": 1.8184633498412396e-06, + "loss": 1.8109, + "step": 1515 + }, + { + "epoch": 2.246199480904709, + "grad_norm": 7.8086618695270325, + "learning_rate": 1.8118173234923447e-06, + "loss": 1.6555, + "step": 1516 + }, + { + "epoch": 2.2476826103077494, + "grad_norm": 7.502865944165095, + "learning_rate": 1.8051807757847867e-06, + "loss": 2.0131, + "step": 1517 + }, + { + "epoch": 2.2491657397107896, + "grad_norm": 7.545094930292667, + "learning_rate": 1.7985537264493814e-06, + "loss": 1.7704, + "step": 1518 + }, + { + "epoch": 2.25064886911383, + "grad_norm": 7.567910566944684, + "learning_rate": 1.7919361951887082e-06, + "loss": 1.7974, + "step": 1519 + }, + { + "epoch": 2.2521319985168704, + "grad_norm": 7.763563385109024, + "learning_rate": 1.7853282016770536e-06, + "loss": 1.737, + "step": 1520 + }, + { + "epoch": 2.253615127919911, + "grad_norm": 7.4745278953739, + "learning_rate": 1.778729765560337e-06, + "loss": 1.8774, + "step": 1521 + }, + { + "epoch": 2.2550982573229517, + "grad_norm": 7.680911259348224, + "learning_rate": 1.7721409064560746e-06, + "loss": 1.8618, + "step": 1522 + }, + { + "epoch": 2.256581386725992, + "grad_norm": 6.650155719074415, + "learning_rate": 1.7655616439533025e-06, + "loss": 1.8822, + "step": 1523 + }, + { + "epoch": 2.258064516129032, + "grad_norm": 7.546597938065461, + "learning_rate": 1.7589919976125265e-06, + "loss": 1.7373, + "step": 1524 + }, + { + "epoch": 2.2595476455320727, + "grad_norm": 6.595003553464858, + "learning_rate": 1.7524319869656675e-06, + "loss": 1.7601, + "step": 1525 + }, + { + "epoch": 2.2610307749351133, + "grad_norm": 8.283283945218646, + "learning_rate": 1.7458816315159937e-06, + "loss": 1.7573, + "step": 1526 + }, + { + "epoch": 2.2625139043381535, + "grad_norm": 7.403488857900312, + "learning_rate": 1.7393409507380688e-06, + "loss": 1.8865, + "step": 1527 + }, + { + "epoch": 2.263997033741194, + "grad_norm": 8.352717525813851, + "learning_rate": 1.7328099640776984e-06, + "loss": 1.7389, + "step": 1528 + }, + { + "epoch": 2.2654801631442343, + "grad_norm": 7.591609692047956, + "learning_rate": 1.7262886909518566e-06, + "loss": 1.7393, + "step": 1529 + }, + { + "epoch": 2.266963292547275, + "grad_norm": 7.325571912507133, + "learning_rate": 1.7197771507486494e-06, + "loss": 1.9066, + "step": 1530 + }, + { + "epoch": 2.268446421950315, + "grad_norm": 8.009437533812182, + "learning_rate": 1.7132753628272403e-06, + "loss": 1.9498, + "step": 1531 + }, + { + "epoch": 2.2699295513533557, + "grad_norm": 7.766035589070678, + "learning_rate": 1.7067833465177991e-06, + "loss": 1.7263, + "step": 1532 + }, + { + "epoch": 2.271412680756396, + "grad_norm": 7.1001284902571555, + "learning_rate": 1.7003011211214471e-06, + "loss": 1.7217, + "step": 1533 + }, + { + "epoch": 2.2728958101594365, + "grad_norm": 6.993376386216502, + "learning_rate": 1.6938287059101937e-06, + "loss": 1.849, + "step": 1534 + }, + { + "epoch": 2.2743789395624767, + "grad_norm": 7.642951040680317, + "learning_rate": 1.6873661201268833e-06, + "loss": 1.7481, + "step": 1535 + }, + { + "epoch": 2.2758620689655173, + "grad_norm": 6.854564713594073, + "learning_rate": 1.6809133829851344e-06, + "loss": 1.846, + "step": 1536 + }, + { + "epoch": 2.2773451983685575, + "grad_norm": 6.942573718311924, + "learning_rate": 1.6744705136692906e-06, + "loss": 1.6236, + "step": 1537 + }, + { + "epoch": 2.278828327771598, + "grad_norm": 6.986156635390822, + "learning_rate": 1.6680375313343534e-06, + "loss": 1.6403, + "step": 1538 + }, + { + "epoch": 2.2803114571746383, + "grad_norm": 7.377838180280686, + "learning_rate": 1.6616144551059305e-06, + "loss": 1.9607, + "step": 1539 + }, + { + "epoch": 2.281794586577679, + "grad_norm": 8.010465381465961, + "learning_rate": 1.655201304080178e-06, + "loss": 1.8478, + "step": 1540 + }, + { + "epoch": 2.283277715980719, + "grad_norm": 7.4909751626706775, + "learning_rate": 1.6487980973237434e-06, + "loss": 1.9625, + "step": 1541 + }, + { + "epoch": 2.28476084538376, + "grad_norm": 7.270958482948883, + "learning_rate": 1.642404853873713e-06, + "loss": 1.6899, + "step": 1542 + }, + { + "epoch": 2.2862439747868, + "grad_norm": 8.022612064271264, + "learning_rate": 1.6360215927375483e-06, + "loss": 1.789, + "step": 1543 + }, + { + "epoch": 2.2877271041898406, + "grad_norm": 7.932163209503014, + "learning_rate": 1.6296483328930313e-06, + "loss": 1.7046, + "step": 1544 + }, + { + "epoch": 2.289210233592881, + "grad_norm": 7.490893715158511, + "learning_rate": 1.6232850932882155e-06, + "loss": 1.8168, + "step": 1545 + }, + { + "epoch": 2.2906933629959214, + "grad_norm": 7.2223599362713315, + "learning_rate": 1.6169318928413574e-06, + "loss": 1.8244, + "step": 1546 + }, + { + "epoch": 2.292176492398962, + "grad_norm": 6.868658008356775, + "learning_rate": 1.6105887504408679e-06, + "loss": 1.8236, + "step": 1547 + }, + { + "epoch": 2.2936596218020022, + "grad_norm": 7.629221691732576, + "learning_rate": 1.6042556849452617e-06, + "loss": 1.916, + "step": 1548 + }, + { + "epoch": 2.2951427512050424, + "grad_norm": 7.229791421940449, + "learning_rate": 1.5979327151830815e-06, + "loss": 1.862, + "step": 1549 + }, + { + "epoch": 2.296625880608083, + "grad_norm": 8.356260493723573, + "learning_rate": 1.5916198599528664e-06, + "loss": 1.6905, + "step": 1550 + }, + { + "epoch": 2.2981090100111237, + "grad_norm": 7.597061339692887, + "learning_rate": 1.5853171380230791e-06, + "loss": 1.7556, + "step": 1551 + }, + { + "epoch": 2.299592139414164, + "grad_norm": 7.75942220706132, + "learning_rate": 1.5790245681320544e-06, + "loss": 1.7287, + "step": 1552 + }, + { + "epoch": 2.3010752688172045, + "grad_norm": 8.086511625509713, + "learning_rate": 1.5727421689879518e-06, + "loss": 1.8427, + "step": 1553 + }, + { + "epoch": 2.3025583982202447, + "grad_norm": 7.543204086923311, + "learning_rate": 1.56646995926868e-06, + "loss": 1.662, + "step": 1554 + }, + { + "epoch": 2.3040415276232853, + "grad_norm": 7.373748993383038, + "learning_rate": 1.5602079576218665e-06, + "loss": 1.7661, + "step": 1555 + }, + { + "epoch": 2.3055246570263255, + "grad_norm": 7.450051937058273, + "learning_rate": 1.5539561826647832e-06, + "loss": 1.8091, + "step": 1556 + }, + { + "epoch": 2.307007786429366, + "grad_norm": 7.142595505464885, + "learning_rate": 1.5477146529842961e-06, + "loss": 1.7394, + "step": 1557 + }, + { + "epoch": 2.3084909158324063, + "grad_norm": 7.642464106067019, + "learning_rate": 1.5414833871368184e-06, + "loss": 1.9286, + "step": 1558 + }, + { + "epoch": 2.309974045235447, + "grad_norm": 7.142562679120177, + "learning_rate": 1.5352624036482422e-06, + "loss": 1.6119, + "step": 1559 + }, + { + "epoch": 2.311457174638487, + "grad_norm": 6.753907946164934, + "learning_rate": 1.5290517210138888e-06, + "loss": 1.8132, + "step": 1560 + }, + { + "epoch": 2.3129403040415277, + "grad_norm": 7.46894708304445, + "learning_rate": 1.5228513576984633e-06, + "loss": 1.7609, + "step": 1561 + }, + { + "epoch": 2.314423433444568, + "grad_norm": 7.009554615570211, + "learning_rate": 1.5166613321359775e-06, + "loss": 1.7603, + "step": 1562 + }, + { + "epoch": 2.3159065628476085, + "grad_norm": 8.114810481547268, + "learning_rate": 1.5104816627297203e-06, + "loss": 1.8633, + "step": 1563 + }, + { + "epoch": 2.3173896922506487, + "grad_norm": 7.9660372919442315, + "learning_rate": 1.5043123678521855e-06, + "loss": 1.9563, + "step": 1564 + }, + { + "epoch": 2.3188728216536894, + "grad_norm": 7.717339622971277, + "learning_rate": 1.498153465845022e-06, + "loss": 1.8439, + "step": 1565 + }, + { + "epoch": 2.3203559510567295, + "grad_norm": 8.497059220439455, + "learning_rate": 1.4920049750189852e-06, + "loss": 1.9284, + "step": 1566 + }, + { + "epoch": 2.32183908045977, + "grad_norm": 8.273920910497559, + "learning_rate": 1.4858669136538728e-06, + "loss": 1.9032, + "step": 1567 + }, + { + "epoch": 2.3233222098628104, + "grad_norm": 8.451422798555203, + "learning_rate": 1.4797392999984773e-06, + "loss": 1.9873, + "step": 1568 + }, + { + "epoch": 2.324805339265851, + "grad_norm": 7.823162357142252, + "learning_rate": 1.4736221522705263e-06, + "loss": 1.6446, + "step": 1569 + }, + { + "epoch": 2.326288468668891, + "grad_norm": 7.658656919075079, + "learning_rate": 1.4675154886566384e-06, + "loss": 1.8186, + "step": 1570 + }, + { + "epoch": 2.327771598071932, + "grad_norm": 7.996137783401482, + "learning_rate": 1.4614193273122562e-06, + "loss": 1.9701, + "step": 1571 + }, + { + "epoch": 2.3292547274749724, + "grad_norm": 7.7973663859135645, + "learning_rate": 1.4553336863616008e-06, + "loss": 1.9619, + "step": 1572 + }, + { + "epoch": 2.3307378568780126, + "grad_norm": 7.644787512007222, + "learning_rate": 1.4492585838976136e-06, + "loss": 1.7848, + "step": 1573 + }, + { + "epoch": 2.332220986281053, + "grad_norm": 7.165906574489695, + "learning_rate": 1.4431940379819093e-06, + "loss": 1.9521, + "step": 1574 + }, + { + "epoch": 2.3337041156840934, + "grad_norm": 7.982005389611068, + "learning_rate": 1.4371400666447128e-06, + "loss": 1.744, + "step": 1575 + }, + { + "epoch": 2.335187245087134, + "grad_norm": 7.421100922208488, + "learning_rate": 1.4310966878848116e-06, + "loss": 1.7256, + "step": 1576 + }, + { + "epoch": 2.3366703744901742, + "grad_norm": 7.386882759297873, + "learning_rate": 1.4250639196695e-06, + "loss": 1.7069, + "step": 1577 + }, + { + "epoch": 2.338153503893215, + "grad_norm": 7.234654202350879, + "learning_rate": 1.41904177993453e-06, + "loss": 2.0414, + "step": 1578 + }, + { + "epoch": 2.339636633296255, + "grad_norm": 7.959515148399449, + "learning_rate": 1.4130302865840517e-06, + "loss": 1.9048, + "step": 1579 + }, + { + "epoch": 2.3411197626992957, + "grad_norm": 7.560274281532841, + "learning_rate": 1.4070294574905607e-06, + "loss": 1.8902, + "step": 1580 + }, + { + "epoch": 2.342602892102336, + "grad_norm": 7.735479218221572, + "learning_rate": 1.401039310494855e-06, + "loss": 1.8767, + "step": 1581 + }, + { + "epoch": 2.3440860215053765, + "grad_norm": 8.521482325007177, + "learning_rate": 1.3950598634059625e-06, + "loss": 1.9517, + "step": 1582 + }, + { + "epoch": 2.3455691509084167, + "grad_norm": 8.136916760801805, + "learning_rate": 1.3890911340011115e-06, + "loss": 1.9137, + "step": 1583 + }, + { + "epoch": 2.3470522803114573, + "grad_norm": 7.418841374373365, + "learning_rate": 1.3831331400256582e-06, + "loss": 1.9625, + "step": 1584 + }, + { + "epoch": 2.3485354097144975, + "grad_norm": 7.4949665497537445, + "learning_rate": 1.3771858991930432e-06, + "loss": 1.9359, + "step": 1585 + }, + { + "epoch": 2.350018539117538, + "grad_norm": 7.494752419286705, + "learning_rate": 1.3712494291847416e-06, + "loss": 1.7808, + "step": 1586 + }, + { + "epoch": 2.3515016685205783, + "grad_norm": 8.594743789579656, + "learning_rate": 1.365323747650202e-06, + "loss": 1.8371, + "step": 1587 + }, + { + "epoch": 2.352984797923619, + "grad_norm": 7.593717797489108, + "learning_rate": 1.3594088722067972e-06, + "loss": 1.8027, + "step": 1588 + }, + { + "epoch": 2.354467927326659, + "grad_norm": 7.553562006289394, + "learning_rate": 1.3535048204397805e-06, + "loss": 1.8528, + "step": 1589 + }, + { + "epoch": 2.3559510567296997, + "grad_norm": 7.606691932759918, + "learning_rate": 1.3476116099022134e-06, + "loss": 1.6429, + "step": 1590 + }, + { + "epoch": 2.35743418613274, + "grad_norm": 7.378248439019642, + "learning_rate": 1.3417292581149388e-06, + "loss": 1.7404, + "step": 1591 + }, + { + "epoch": 2.3589173155357805, + "grad_norm": 7.491970793863409, + "learning_rate": 1.3358577825665077e-06, + "loss": 1.8601, + "step": 1592 + }, + { + "epoch": 2.3604004449388207, + "grad_norm": 7.159609029883109, + "learning_rate": 1.3299972007131357e-06, + "loss": 1.8186, + "step": 1593 + }, + { + "epoch": 2.3618835743418614, + "grad_norm": 7.461495199477711, + "learning_rate": 1.3241475299786582e-06, + "loss": 1.5465, + "step": 1594 + }, + { + "epoch": 2.3633667037449015, + "grad_norm": 7.963811770882842, + "learning_rate": 1.3183087877544604e-06, + "loss": 1.8744, + "step": 1595 + }, + { + "epoch": 2.364849833147942, + "grad_norm": 9.189277760508476, + "learning_rate": 1.3124809913994458e-06, + "loss": 2.1174, + "step": 1596 + }, + { + "epoch": 2.366332962550983, + "grad_norm": 7.919828207216285, + "learning_rate": 1.3066641582399696e-06, + "loss": 1.7882, + "step": 1597 + }, + { + "epoch": 2.367816091954023, + "grad_norm": 6.443159404293988, + "learning_rate": 1.3008583055697944e-06, + "loss": 1.9412, + "step": 1598 + }, + { + "epoch": 2.369299221357063, + "grad_norm": 7.855791141664783, + "learning_rate": 1.2950634506500392e-06, + "loss": 1.7652, + "step": 1599 + }, + { + "epoch": 2.370782350760104, + "grad_norm": 9.06191580227256, + "learning_rate": 1.2892796107091237e-06, + "loss": 1.8875, + "step": 1600 + }, + { + "epoch": 2.3722654801631444, + "grad_norm": 7.293382787008703, + "learning_rate": 1.2835068029427188e-06, + "loss": 1.673, + "step": 1601 + }, + { + "epoch": 2.3737486095661846, + "grad_norm": 7.777107248539495, + "learning_rate": 1.2777450445136996e-06, + "loss": 1.876, + "step": 1602 + }, + { + "epoch": 2.3752317389692252, + "grad_norm": 8.004462525974573, + "learning_rate": 1.2719943525520884e-06, + "loss": 1.868, + "step": 1603 + }, + { + "epoch": 2.3767148683722654, + "grad_norm": 7.569138899471005, + "learning_rate": 1.2662547441550056e-06, + "loss": 1.6634, + "step": 1604 + }, + { + "epoch": 2.378197997775306, + "grad_norm": 7.126775773532852, + "learning_rate": 1.2605262363866211e-06, + "loss": 1.7553, + "step": 1605 + }, + { + "epoch": 2.3796811271783462, + "grad_norm": 7.943447726949144, + "learning_rate": 1.2548088462781006e-06, + "loss": 1.8127, + "step": 1606 + }, + { + "epoch": 2.381164256581387, + "grad_norm": 7.331530692652386, + "learning_rate": 1.249102590827559e-06, + "loss": 1.7337, + "step": 1607 + }, + { + "epoch": 2.382647385984427, + "grad_norm": 7.521394892067586, + "learning_rate": 1.2434074870000057e-06, + "loss": 1.9856, + "step": 1608 + }, + { + "epoch": 2.3841305153874677, + "grad_norm": 8.288953582589755, + "learning_rate": 1.237723551727294e-06, + "loss": 1.8195, + "step": 1609 + }, + { + "epoch": 2.385613644790508, + "grad_norm": 7.90652948187501, + "learning_rate": 1.232050801908074e-06, + "loss": 1.8659, + "step": 1610 + }, + { + "epoch": 2.3870967741935485, + "grad_norm": 7.46380723919005, + "learning_rate": 1.2263892544077439e-06, + "loss": 1.8411, + "step": 1611 + }, + { + "epoch": 2.3885799035965887, + "grad_norm": 6.919606080445995, + "learning_rate": 1.2207389260583908e-06, + "loss": 1.7595, + "step": 1612 + }, + { + "epoch": 2.3900630329996293, + "grad_norm": 7.190748004509151, + "learning_rate": 1.21509983365875e-06, + "loss": 1.8659, + "step": 1613 + }, + { + "epoch": 2.3915461624026695, + "grad_norm": 7.7050512099370865, + "learning_rate": 1.2094719939741546e-06, + "loss": 1.6762, + "step": 1614 + }, + { + "epoch": 2.39302929180571, + "grad_norm": 8.61789468783933, + "learning_rate": 1.2038554237364742e-06, + "loss": 1.7959, + "step": 1615 + }, + { + "epoch": 2.3945124212087503, + "grad_norm": 8.008832608222502, + "learning_rate": 1.1982501396440831e-06, + "loss": 1.8197, + "step": 1616 + }, + { + "epoch": 2.395995550611791, + "grad_norm": 7.359156362060786, + "learning_rate": 1.1926561583617968e-06, + "loss": 1.7669, + "step": 1617 + }, + { + "epoch": 2.397478680014831, + "grad_norm": 8.08572770764978, + "learning_rate": 1.1870734965208247e-06, + "loss": 1.9435, + "step": 1618 + }, + { + "epoch": 2.3989618094178717, + "grad_norm": 7.290351695930762, + "learning_rate": 1.1815021707187285e-06, + "loss": 1.7018, + "step": 1619 + }, + { + "epoch": 2.400444938820912, + "grad_norm": 8.18476759135061, + "learning_rate": 1.1759421975193637e-06, + "loss": 1.9125, + "step": 1620 + }, + { + "epoch": 2.4019280682239526, + "grad_norm": 7.495825745047263, + "learning_rate": 1.1703935934528327e-06, + "loss": 1.826, + "step": 1621 + }, + { + "epoch": 2.403411197626993, + "grad_norm": 7.693281126738691, + "learning_rate": 1.1648563750154424e-06, + "loss": 1.8776, + "step": 1622 + }, + { + "epoch": 2.4048943270300334, + "grad_norm": 7.023752926107444, + "learning_rate": 1.1593305586696407e-06, + "loss": 1.8407, + "step": 1623 + }, + { + "epoch": 2.4063774564330735, + "grad_norm": 7.887667143367911, + "learning_rate": 1.1538161608439858e-06, + "loss": 1.9703, + "step": 1624 + }, + { + "epoch": 2.407860585836114, + "grad_norm": 8.132820981098869, + "learning_rate": 1.1483131979330826e-06, + "loss": 1.8709, + "step": 1625 + }, + { + "epoch": 2.409343715239155, + "grad_norm": 8.007481468053447, + "learning_rate": 1.1428216862975383e-06, + "loss": 1.9397, + "step": 1626 + }, + { + "epoch": 2.410826844642195, + "grad_norm": 7.329454115581193, + "learning_rate": 1.137341642263922e-06, + "loss": 1.9795, + "step": 1627 + }, + { + "epoch": 2.4123099740452356, + "grad_norm": 7.840752317679121, + "learning_rate": 1.1318730821246982e-06, + "loss": 1.7648, + "step": 1628 + }, + { + "epoch": 2.413793103448276, + "grad_norm": 7.314363932211987, + "learning_rate": 1.126416022138198e-06, + "loss": 1.8188, + "step": 1629 + }, + { + "epoch": 2.4152762328513164, + "grad_norm": 7.539112745442822, + "learning_rate": 1.1209704785285625e-06, + "loss": 2.0115, + "step": 1630 + }, + { + "epoch": 2.4167593622543566, + "grad_norm": 7.444482671881653, + "learning_rate": 1.1155364674856834e-06, + "loss": 1.7797, + "step": 1631 + }, + { + "epoch": 2.4182424916573972, + "grad_norm": 7.782380629241843, + "learning_rate": 1.1101140051651776e-06, + "loss": 1.8402, + "step": 1632 + }, + { + "epoch": 2.4197256210604374, + "grad_norm": 8.072119437899651, + "learning_rate": 1.1047031076883207e-06, + "loss": 2.1124, + "step": 1633 + }, + { + "epoch": 2.421208750463478, + "grad_norm": 8.453381631850354, + "learning_rate": 1.0993037911420045e-06, + "loss": 1.6704, + "step": 1634 + }, + { + "epoch": 2.4226918798665182, + "grad_norm": 7.5459428199944325, + "learning_rate": 1.0939160715786967e-06, + "loss": 1.764, + "step": 1635 + }, + { + "epoch": 2.424175009269559, + "grad_norm": 7.373023701672509, + "learning_rate": 1.088539965016377e-06, + "loss": 1.8382, + "step": 1636 + }, + { + "epoch": 2.425658138672599, + "grad_norm": 7.759197008985833, + "learning_rate": 1.083175487438507e-06, + "loss": 1.9079, + "step": 1637 + }, + { + "epoch": 2.4271412680756397, + "grad_norm": 7.068424783425758, + "learning_rate": 1.0778226547939713e-06, + "loss": 1.7736, + "step": 1638 + }, + { + "epoch": 2.42862439747868, + "grad_norm": 7.353210603967028, + "learning_rate": 1.072481482997032e-06, + "loss": 1.7747, + "step": 1639 + }, + { + "epoch": 2.4301075268817205, + "grad_norm": 8.061472548659207, + "learning_rate": 1.067151987927288e-06, + "loss": 1.9715, + "step": 1640 + }, + { + "epoch": 2.4315906562847607, + "grad_norm": 8.101456227962949, + "learning_rate": 1.0618341854296176e-06, + "loss": 1.7034, + "step": 1641 + }, + { + "epoch": 2.4330737856878013, + "grad_norm": 7.650172623638393, + "learning_rate": 1.0565280913141384e-06, + "loss": 1.7369, + "step": 1642 + }, + { + "epoch": 2.4345569150908415, + "grad_norm": 7.174054443390566, + "learning_rate": 1.0512337213561568e-06, + "loss": 1.8406, + "step": 1643 + }, + { + "epoch": 2.436040044493882, + "grad_norm": 7.255332860769045, + "learning_rate": 1.045951091296127e-06, + "loss": 1.7479, + "step": 1644 + }, + { + "epoch": 2.4375231738969223, + "grad_norm": 7.809521298762451, + "learning_rate": 1.0406802168395946e-06, + "loss": 2.084, + "step": 1645 + }, + { + "epoch": 2.439006303299963, + "grad_norm": 7.936196575720437, + "learning_rate": 1.0354211136571586e-06, + "loss": 1.9333, + "step": 1646 + }, + { + "epoch": 2.4404894327030036, + "grad_norm": 7.6930545506781, + "learning_rate": 1.0301737973844183e-06, + "loss": 1.9478, + "step": 1647 + }, + { + "epoch": 2.4419725621060437, + "grad_norm": 7.951709232254531, + "learning_rate": 1.0249382836219346e-06, + "loss": 1.8741, + "step": 1648 + }, + { + "epoch": 2.443455691509084, + "grad_norm": 7.606824933228972, + "learning_rate": 1.019714587935174e-06, + "loss": 1.9489, + "step": 1649 + }, + { + "epoch": 2.4449388209121246, + "grad_norm": 7.823006426642803, + "learning_rate": 1.0145027258544703e-06, + "loss": 1.8562, + "step": 1650 + }, + { + "epoch": 2.446421950315165, + "grad_norm": 7.493201160813176, + "learning_rate": 1.0093027128749722e-06, + "loss": 1.7315, + "step": 1651 + }, + { + "epoch": 2.4479050797182054, + "grad_norm": 6.689697055729612, + "learning_rate": 1.0041145644566053e-06, + "loss": 1.625, + "step": 1652 + }, + { + "epoch": 2.449388209121246, + "grad_norm": 7.664545117409576, + "learning_rate": 9.989382960240173e-07, + "loss": 1.9097, + "step": 1653 + }, + { + "epoch": 2.450871338524286, + "grad_norm": 7.953088805427431, + "learning_rate": 9.937739229665361e-07, + "loss": 1.7524, + "step": 1654 + }, + { + "epoch": 2.452354467927327, + "grad_norm": 7.685066506492676, + "learning_rate": 9.886214606381278e-07, + "loss": 1.7567, + "step": 1655 + }, + { + "epoch": 2.453837597330367, + "grad_norm": 7.531248781409602, + "learning_rate": 9.834809243573406e-07, + "loss": 1.8502, + "step": 1656 + }, + { + "epoch": 2.4553207267334076, + "grad_norm": 7.390705643321481, + "learning_rate": 9.783523294072727e-07, + "loss": 1.7105, + "step": 1657 + }, + { + "epoch": 2.456803856136448, + "grad_norm": 7.49911174039704, + "learning_rate": 9.732356910355168e-07, + "loss": 1.8683, + "step": 1658 + }, + { + "epoch": 2.4582869855394884, + "grad_norm": 7.199984418893678, + "learning_rate": 9.681310244541164e-07, + "loss": 1.7502, + "step": 1659 + }, + { + "epoch": 2.4597701149425286, + "grad_norm": 7.254624776899424, + "learning_rate": 9.630383448395269e-07, + "loss": 1.7277, + "step": 1660 + }, + { + "epoch": 2.4612532443455692, + "grad_norm": 7.635857246163762, + "learning_rate": 9.57957667332562e-07, + "loss": 1.8599, + "step": 1661 + }, + { + "epoch": 2.4627363737486094, + "grad_norm": 7.333100319547658, + "learning_rate": 9.528890070383523e-07, + "loss": 1.72, + "step": 1662 + }, + { + "epoch": 2.46421950315165, + "grad_norm": 7.522497036300992, + "learning_rate": 9.478323790263055e-07, + "loss": 1.9432, + "step": 1663 + }, + { + "epoch": 2.4657026325546902, + "grad_norm": 7.128780139411389, + "learning_rate": 9.427877983300482e-07, + "loss": 1.8829, + "step": 1664 + }, + { + "epoch": 2.467185761957731, + "grad_norm": 7.0253616949777395, + "learning_rate": 9.377552799473982e-07, + "loss": 1.5937, + "step": 1665 + }, + { + "epoch": 2.468668891360771, + "grad_norm": 7.541612374191999, + "learning_rate": 9.327348388403063e-07, + "loss": 1.9457, + "step": 1666 + }, + { + "epoch": 2.4701520207638117, + "grad_norm": 7.447788284417712, + "learning_rate": 9.277264899348171e-07, + "loss": 1.8667, + "step": 1667 + }, + { + "epoch": 2.471635150166852, + "grad_norm": 7.5466589547873255, + "learning_rate": 9.227302481210299e-07, + "loss": 1.974, + "step": 1668 + }, + { + "epoch": 2.4731182795698925, + "grad_norm": 7.6120454666133694, + "learning_rate": 9.177461282530387e-07, + "loss": 1.5813, + "step": 1669 + }, + { + "epoch": 2.4746014089729327, + "grad_norm": 6.648637980217076, + "learning_rate": 9.127741451489086e-07, + "loss": 1.8376, + "step": 1670 + }, + { + "epoch": 2.4760845383759733, + "grad_norm": 7.964361107938135, + "learning_rate": 9.078143135906154e-07, + "loss": 1.9018, + "step": 1671 + }, + { + "epoch": 2.477567667779014, + "grad_norm": 7.693313552006592, + "learning_rate": 9.02866648324009e-07, + "loss": 1.7909, + "step": 1672 + }, + { + "epoch": 2.479050797182054, + "grad_norm": 7.896142213902569, + "learning_rate": 8.979311640587701e-07, + "loss": 1.7168, + "step": 1673 + }, + { + "epoch": 2.4805339265850943, + "grad_norm": 7.685766252620565, + "learning_rate": 8.930078754683625e-07, + "loss": 1.7044, + "step": 1674 + }, + { + "epoch": 2.482017055988135, + "grad_norm": 7.95568522886892, + "learning_rate": 8.880967971899906e-07, + "loss": 2.0083, + "step": 1675 + }, + { + "epoch": 2.4835001853911756, + "grad_norm": 7.26779442978262, + "learning_rate": 8.831979438245619e-07, + "loss": 1.8096, + "step": 1676 + }, + { + "epoch": 2.4849833147942157, + "grad_norm": 7.167556839636992, + "learning_rate": 8.783113299366331e-07, + "loss": 1.9217, + "step": 1677 + }, + { + "epoch": 2.4864664441972564, + "grad_norm": 7.612987407759854, + "learning_rate": 8.734369700543744e-07, + "loss": 1.8365, + "step": 1678 + }, + { + "epoch": 2.4879495736002966, + "grad_norm": 7.30491092477839, + "learning_rate": 8.685748786695241e-07, + "loss": 1.646, + "step": 1679 + }, + { + "epoch": 2.489432703003337, + "grad_norm": 6.799898281186435, + "learning_rate": 8.637250702373445e-07, + "loss": 1.7823, + "step": 1680 + }, + { + "epoch": 2.4909158324063774, + "grad_norm": 7.272947610383027, + "learning_rate": 8.588875591765838e-07, + "loss": 1.9003, + "step": 1681 + }, + { + "epoch": 2.492398961809418, + "grad_norm": 7.330845247575539, + "learning_rate": 8.540623598694259e-07, + "loss": 1.8213, + "step": 1682 + }, + { + "epoch": 2.493882091212458, + "grad_norm": 7.68277639045822, + "learning_rate": 8.492494866614509e-07, + "loss": 1.7616, + "step": 1683 + }, + { + "epoch": 2.495365220615499, + "grad_norm": 6.770154703818226, + "learning_rate": 8.44448953861593e-07, + "loss": 1.8851, + "step": 1684 + }, + { + "epoch": 2.496848350018539, + "grad_norm": 7.1076681951671805, + "learning_rate": 8.396607757421015e-07, + "loss": 1.8213, + "step": 1685 + }, + { + "epoch": 2.4983314794215796, + "grad_norm": 7.516271486911377, + "learning_rate": 8.348849665384906e-07, + "loss": 1.7534, + "step": 1686 + }, + { + "epoch": 2.49981460882462, + "grad_norm": 7.906268169229095, + "learning_rate": 8.301215404495e-07, + "loss": 1.5992, + "step": 1687 + }, + { + "epoch": 2.5012977382276604, + "grad_norm": 7.692407920668091, + "learning_rate": 8.253705116370603e-07, + "loss": 1.8383, + "step": 1688 + }, + { + "epoch": 2.5027808676307006, + "grad_norm": 7.384949405571013, + "learning_rate": 8.206318942262337e-07, + "loss": 1.6428, + "step": 1689 + }, + { + "epoch": 2.5042639970337413, + "grad_norm": 7.078662730044259, + "learning_rate": 8.159057023051936e-07, + "loss": 1.6453, + "step": 1690 + }, + { + "epoch": 2.5057471264367814, + "grad_norm": 7.6242425495821635, + "learning_rate": 8.111919499251653e-07, + "loss": 1.8551, + "step": 1691 + }, + { + "epoch": 2.507230255839822, + "grad_norm": 7.5261343224154595, + "learning_rate": 8.064906511003912e-07, + "loss": 1.872, + "step": 1692 + }, + { + "epoch": 2.5087133852428627, + "grad_norm": 7.0934362039724, + "learning_rate": 8.018018198080913e-07, + "loss": 1.7309, + "step": 1693 + }, + { + "epoch": 2.510196514645903, + "grad_norm": 7.745344939307792, + "learning_rate": 7.971254699884167e-07, + "loss": 1.9057, + "step": 1694 + }, + { + "epoch": 2.511679644048943, + "grad_norm": 7.370858848350933, + "learning_rate": 7.924616155444098e-07, + "loss": 1.6363, + "step": 1695 + }, + { + "epoch": 2.5131627734519837, + "grad_norm": 6.38754262905574, + "learning_rate": 7.878102703419683e-07, + "loss": 1.9331, + "step": 1696 + }, + { + "epoch": 2.5146459028550243, + "grad_norm": 7.561956726311665, + "learning_rate": 7.831714482097907e-07, + "loss": 1.7556, + "step": 1697 + }, + { + "epoch": 2.5161290322580645, + "grad_norm": 7.422147018082433, + "learning_rate": 7.785451629393525e-07, + "loss": 1.7088, + "step": 1698 + }, + { + "epoch": 2.5176121616611047, + "grad_norm": 7.993909773969922, + "learning_rate": 7.739314282848509e-07, + "loss": 1.9854, + "step": 1699 + }, + { + "epoch": 2.5190952910641453, + "grad_norm": 7.136545706474316, + "learning_rate": 7.693302579631712e-07, + "loss": 1.8938, + "step": 1700 + }, + { + "epoch": 2.520578420467186, + "grad_norm": 7.368320407886016, + "learning_rate": 7.647416656538464e-07, + "loss": 1.641, + "step": 1701 + }, + { + "epoch": 2.522061549870226, + "grad_norm": 6.807587475445918, + "learning_rate": 7.601656649990086e-07, + "loss": 1.7472, + "step": 1702 + }, + { + "epoch": 2.5235446792732668, + "grad_norm": 7.241725327939498, + "learning_rate": 7.556022696033593e-07, + "loss": 1.7165, + "step": 1703 + }, + { + "epoch": 2.525027808676307, + "grad_norm": 7.849784084392076, + "learning_rate": 7.510514930341251e-07, + "loss": 1.8431, + "step": 1704 + }, + { + "epoch": 2.5265109380793476, + "grad_norm": 7.697288630431992, + "learning_rate": 7.465133488210091e-07, + "loss": 1.7551, + "step": 1705 + }, + { + "epoch": 2.5279940674823878, + "grad_norm": 7.504850001085853, + "learning_rate": 7.419878504561651e-07, + "loss": 1.8626, + "step": 1706 + }, + { + "epoch": 2.5294771968854284, + "grad_norm": 7.2603603865363775, + "learning_rate": 7.374750113941459e-07, + "loss": 1.8018, + "step": 1707 + }, + { + "epoch": 2.5309603262884686, + "grad_norm": 7.333876083213626, + "learning_rate": 7.329748450518676e-07, + "loss": 1.805, + "step": 1708 + }, + { + "epoch": 2.532443455691509, + "grad_norm": 7.759787671447962, + "learning_rate": 7.284873648085733e-07, + "loss": 1.5439, + "step": 1709 + }, + { + "epoch": 2.5339265850945494, + "grad_norm": 7.203600656563013, + "learning_rate": 7.24012584005786e-07, + "loss": 1.8333, + "step": 1710 + }, + { + "epoch": 2.53540971449759, + "grad_norm": 7.715535948791023, + "learning_rate": 7.195505159472726e-07, + "loss": 1.9194, + "step": 1711 + }, + { + "epoch": 2.53689284390063, + "grad_norm": 7.719501584148626, + "learning_rate": 7.15101173899006e-07, + "loss": 1.7519, + "step": 1712 + }, + { + "epoch": 2.538375973303671, + "grad_norm": 7.358631127503561, + "learning_rate": 7.106645710891219e-07, + "loss": 1.7941, + "step": 1713 + }, + { + "epoch": 2.539859102706711, + "grad_norm": 7.33345270973122, + "learning_rate": 7.062407207078853e-07, + "loss": 1.7225, + "step": 1714 + }, + { + "epoch": 2.5413422321097516, + "grad_norm": 8.230347195091776, + "learning_rate": 7.018296359076443e-07, + "loss": 1.8893, + "step": 1715 + }, + { + "epoch": 2.542825361512792, + "grad_norm": 7.894477290770661, + "learning_rate": 6.974313298027946e-07, + "loss": 1.8709, + "step": 1716 + }, + { + "epoch": 2.5443084909158324, + "grad_norm": 8.135382811793283, + "learning_rate": 6.930458154697401e-07, + "loss": 1.8261, + "step": 1717 + }, + { + "epoch": 2.545791620318873, + "grad_norm": 7.259668964588666, + "learning_rate": 6.88673105946856e-07, + "loss": 1.7134, + "step": 1718 + }, + { + "epoch": 2.5472747497219133, + "grad_norm": 7.35133499781611, + "learning_rate": 6.843132142344455e-07, + "loss": 1.7938, + "step": 1719 + }, + { + "epoch": 2.5487578791249534, + "grad_norm": 7.874299551335516, + "learning_rate": 6.79966153294705e-07, + "loss": 1.7664, + "step": 1720 + }, + { + "epoch": 2.550241008527994, + "grad_norm": 7.53148840993657, + "learning_rate": 6.756319360516856e-07, + "loss": 1.8027, + "step": 1721 + }, + { + "epoch": 2.5517241379310347, + "grad_norm": 7.848193954200176, + "learning_rate": 6.713105753912514e-07, + "loss": 1.7797, + "step": 1722 + }, + { + "epoch": 2.553207267334075, + "grad_norm": 7.519686402320239, + "learning_rate": 6.670020841610431e-07, + "loss": 1.8891, + "step": 1723 + }, + { + "epoch": 2.554690396737115, + "grad_norm": 7.616149899796188, + "learning_rate": 6.627064751704409e-07, + "loss": 1.795, + "step": 1724 + }, + { + "epoch": 2.5561735261401557, + "grad_norm": 8.348891497522507, + "learning_rate": 6.584237611905242e-07, + "loss": 2.0273, + "step": 1725 + }, + { + "epoch": 2.5576566555431963, + "grad_norm": 7.580347179392006, + "learning_rate": 6.541539549540383e-07, + "loss": 1.5452, + "step": 1726 + }, + { + "epoch": 2.5591397849462365, + "grad_norm": 6.943467266884024, + "learning_rate": 6.498970691553486e-07, + "loss": 1.8087, + "step": 1727 + }, + { + "epoch": 2.560622914349277, + "grad_norm": 6.808801340399502, + "learning_rate": 6.456531164504082e-07, + "loss": 1.8904, + "step": 1728 + }, + { + "epoch": 2.5621060437523173, + "grad_norm": 7.507924939179851, + "learning_rate": 6.414221094567241e-07, + "loss": 1.9365, + "step": 1729 + }, + { + "epoch": 2.563589173155358, + "grad_norm": 7.348394223560322, + "learning_rate": 6.372040607533064e-07, + "loss": 1.7767, + "step": 1730 + }, + { + "epoch": 2.565072302558398, + "grad_norm": 7.839303891824425, + "learning_rate": 6.329989828806482e-07, + "loss": 1.7398, + "step": 1731 + }, + { + "epoch": 2.5665554319614388, + "grad_norm": 7.775809886045296, + "learning_rate": 6.288068883406751e-07, + "loss": 1.7327, + "step": 1732 + }, + { + "epoch": 2.568038561364479, + "grad_norm": 7.534704121627996, + "learning_rate": 6.246277895967112e-07, + "loss": 1.9305, + "step": 1733 + }, + { + "epoch": 2.5695216907675196, + "grad_norm": 8.59567888285118, + "learning_rate": 6.204616990734486e-07, + "loss": 1.8804, + "step": 1734 + }, + { + "epoch": 2.5710048201705598, + "grad_norm": 7.700267920752455, + "learning_rate": 6.163086291568998e-07, + "loss": 1.821, + "step": 1735 + }, + { + "epoch": 2.5724879495736004, + "grad_norm": 8.540712618574124, + "learning_rate": 6.121685921943688e-07, + "loss": 1.8099, + "step": 1736 + }, + { + "epoch": 2.5739710789766406, + "grad_norm": 7.617534802178931, + "learning_rate": 6.080416004944145e-07, + "loss": 1.7449, + "step": 1737 + }, + { + "epoch": 2.575454208379681, + "grad_norm": 7.337741486389738, + "learning_rate": 6.039276663268045e-07, + "loss": 1.9023, + "step": 1738 + }, + { + "epoch": 2.5769373377827214, + "grad_norm": 7.300779068037262, + "learning_rate": 5.998268019224929e-07, + "loss": 1.7094, + "step": 1739 + }, + { + "epoch": 2.578420467185762, + "grad_norm": 7.42461991615112, + "learning_rate": 5.957390194735724e-07, + "loss": 1.853, + "step": 1740 + }, + { + "epoch": 2.579903596588802, + "grad_norm": 7.4595957297587585, + "learning_rate": 5.916643311332438e-07, + "loss": 1.6434, + "step": 1741 + }, + { + "epoch": 2.581386725991843, + "grad_norm": 7.012982206627093, + "learning_rate": 5.876027490157798e-07, + "loss": 1.8421, + "step": 1742 + }, + { + "epoch": 2.5828698553948835, + "grad_norm": 6.696528812657122, + "learning_rate": 5.835542851964826e-07, + "loss": 1.6085, + "step": 1743 + }, + { + "epoch": 2.5843529847979236, + "grad_norm": 7.729006966201663, + "learning_rate": 5.795189517116596e-07, + "loss": 1.9283, + "step": 1744 + }, + { + "epoch": 2.585836114200964, + "grad_norm": 8.716936764688281, + "learning_rate": 5.754967605585771e-07, + "loss": 1.7888, + "step": 1745 + }, + { + "epoch": 2.5873192436040044, + "grad_norm": 8.049388220349313, + "learning_rate": 5.71487723695427e-07, + "loss": 2.0286, + "step": 1746 + }, + { + "epoch": 2.588802373007045, + "grad_norm": 8.01567276400992, + "learning_rate": 5.674918530412982e-07, + "loss": 1.7637, + "step": 1747 + }, + { + "epoch": 2.5902855024100853, + "grad_norm": 7.499570268808479, + "learning_rate": 5.635091604761306e-07, + "loss": 1.6717, + "step": 1748 + }, + { + "epoch": 2.5917686318131254, + "grad_norm": 7.278902483004221, + "learning_rate": 5.595396578406864e-07, + "loss": 1.7152, + "step": 1749 + }, + { + "epoch": 2.593251761216166, + "grad_norm": 7.393783381006658, + "learning_rate": 5.555833569365159e-07, + "loss": 1.5395, + "step": 1750 + }, + { + "epoch": 2.5947348906192067, + "grad_norm": 7.656236016217915, + "learning_rate": 5.516402695259165e-07, + "loss": 1.7347, + "step": 1751 + }, + { + "epoch": 2.596218020022247, + "grad_norm": 7.173635447168359, + "learning_rate": 5.477104073319034e-07, + "loss": 1.8434, + "step": 1752 + }, + { + "epoch": 2.5977011494252875, + "grad_norm": 7.316417786438214, + "learning_rate": 5.437937820381706e-07, + "loss": 1.6769, + "step": 1753 + }, + { + "epoch": 2.5991842788283277, + "grad_norm": 7.634858259144561, + "learning_rate": 5.398904052890591e-07, + "loss": 1.8081, + "step": 1754 + }, + { + "epoch": 2.6006674082313683, + "grad_norm": 8.241890171185807, + "learning_rate": 5.360002886895227e-07, + "loss": 1.7308, + "step": 1755 + }, + { + "epoch": 2.6021505376344085, + "grad_norm": 7.054502595771792, + "learning_rate": 5.321234438050893e-07, + "loss": 1.8934, + "step": 1756 + }, + { + "epoch": 2.603633667037449, + "grad_norm": 7.375553667548612, + "learning_rate": 5.282598821618317e-07, + "loss": 1.6882, + "step": 1757 + }, + { + "epoch": 2.6051167964404893, + "grad_norm": 7.882681702010551, + "learning_rate": 5.244096152463279e-07, + "loss": 1.9019, + "step": 1758 + }, + { + "epoch": 2.60659992584353, + "grad_norm": 7.042304541063478, + "learning_rate": 5.205726545056345e-07, + "loss": 1.6289, + "step": 1759 + }, + { + "epoch": 2.60808305524657, + "grad_norm": 7.4831907461933085, + "learning_rate": 5.16749011347244e-07, + "loss": 1.7244, + "step": 1760 + }, + { + "epoch": 2.6095661846496108, + "grad_norm": 7.436192778740082, + "learning_rate": 5.12938697139056e-07, + "loss": 1.7581, + "step": 1761 + }, + { + "epoch": 2.611049314052651, + "grad_norm": 8.317030987593188, + "learning_rate": 5.091417232093437e-07, + "loss": 1.8295, + "step": 1762 + }, + { + "epoch": 2.6125324434556916, + "grad_norm": 7.48535495101994, + "learning_rate": 5.053581008467184e-07, + "loss": 1.8707, + "step": 1763 + }, + { + "epoch": 2.6140155728587318, + "grad_norm": 8.635412738095598, + "learning_rate": 5.015878413000958e-07, + "loss": 1.9938, + "step": 1764 + }, + { + "epoch": 2.6154987022617724, + "grad_norm": 7.915116720405985, + "learning_rate": 4.978309557786637e-07, + "loss": 1.8537, + "step": 1765 + }, + { + "epoch": 2.6169818316648126, + "grad_norm": 7.272654064346395, + "learning_rate": 4.940874554518465e-07, + "loss": 1.7373, + "step": 1766 + }, + { + "epoch": 2.618464961067853, + "grad_norm": 7.948665043813024, + "learning_rate": 4.903573514492777e-07, + "loss": 1.8418, + "step": 1767 + }, + { + "epoch": 2.619948090470894, + "grad_norm": 8.309558039864566, + "learning_rate": 4.866406548607588e-07, + "loss": 1.6951, + "step": 1768 + }, + { + "epoch": 2.621431219873934, + "grad_norm": 7.487538721413033, + "learning_rate": 4.829373767362316e-07, + "loss": 1.8462, + "step": 1769 + }, + { + "epoch": 2.622914349276974, + "grad_norm": 7.006246113254242, + "learning_rate": 4.792475280857473e-07, + "loss": 1.515, + "step": 1770 + }, + { + "epoch": 2.624397478680015, + "grad_norm": 6.942514559916767, + "learning_rate": 4.755711198794233e-07, + "loss": 1.9274, + "step": 1771 + }, + { + "epoch": 2.6258806080830555, + "grad_norm": 7.17091827514119, + "learning_rate": 4.719081630474248e-07, + "loss": 1.8533, + "step": 1772 + }, + { + "epoch": 2.6273637374860956, + "grad_norm": 7.632839315441306, + "learning_rate": 4.682586684799223e-07, + "loss": 1.8585, + "step": 1773 + }, + { + "epoch": 2.628846866889136, + "grad_norm": 7.4495801713297265, + "learning_rate": 4.646226470270615e-07, + "loss": 1.621, + "step": 1774 + }, + { + "epoch": 2.6303299962921765, + "grad_norm": 6.316728686667068, + "learning_rate": 4.610001094989358e-07, + "loss": 1.7141, + "step": 1775 + }, + { + "epoch": 2.631813125695217, + "grad_norm": 7.876900477005761, + "learning_rate": 4.573910666655429e-07, + "loss": 1.8391, + "step": 1776 + }, + { + "epoch": 2.6332962550982573, + "grad_norm": 7.556039755840808, + "learning_rate": 4.5379552925676696e-07, + "loss": 2.0305, + "step": 1777 + }, + { + "epoch": 2.634779384501298, + "grad_norm": 6.990923858140146, + "learning_rate": 4.502135079623382e-07, + "loss": 1.887, + "step": 1778 + }, + { + "epoch": 2.636262513904338, + "grad_norm": 7.412891551992874, + "learning_rate": 4.46645013431799e-07, + "loss": 1.9473, + "step": 1779 + }, + { + "epoch": 2.6377456433073787, + "grad_norm": 6.992052395706018, + "learning_rate": 4.430900562744805e-07, + "loss": 1.6594, + "step": 1780 + }, + { + "epoch": 2.639228772710419, + "grad_norm": 7.349278328064197, + "learning_rate": 4.395486470594645e-07, + "loss": 1.8304, + "step": 1781 + }, + { + "epoch": 2.6407119021134595, + "grad_norm": 6.8417840289052085, + "learning_rate": 4.3602079631555274e-07, + "loss": 1.683, + "step": 1782 + }, + { + "epoch": 2.6421950315164997, + "grad_norm": 7.4867748610797396, + "learning_rate": 4.3250651453124017e-07, + "loss": 1.8014, + "step": 1783 + }, + { + "epoch": 2.6436781609195403, + "grad_norm": 8.027739084545942, + "learning_rate": 4.2900581215467807e-07, + "loss": 1.7805, + "step": 1784 + }, + { + "epoch": 2.6451612903225805, + "grad_norm": 8.163467612363156, + "learning_rate": 4.25518699593645e-07, + "loss": 1.8333, + "step": 1785 + }, + { + "epoch": 2.646644419725621, + "grad_norm": 6.710945771898146, + "learning_rate": 4.220451872155179e-07, + "loss": 1.7532, + "step": 1786 + }, + { + "epoch": 2.6481275491286613, + "grad_norm": 8.649360929371207, + "learning_rate": 4.1858528534723776e-07, + "loss": 1.8203, + "step": 1787 + }, + { + "epoch": 2.649610678531702, + "grad_norm": 6.826593808172259, + "learning_rate": 4.15139004275284e-07, + "loss": 1.7161, + "step": 1788 + }, + { + "epoch": 2.651093807934742, + "grad_norm": 6.831878398514155, + "learning_rate": 4.117063542456373e-07, + "loss": 1.725, + "step": 1789 + }, + { + "epoch": 2.6525769373377828, + "grad_norm": 7.46628404412611, + "learning_rate": 4.0828734546375426e-07, + "loss": 1.8661, + "step": 1790 + }, + { + "epoch": 2.654060066740823, + "grad_norm": 7.355645951122674, + "learning_rate": 4.048819880945337e-07, + "loss": 1.8202, + "step": 1791 + }, + { + "epoch": 2.6555431961438636, + "grad_norm": 7.292049074749945, + "learning_rate": 4.01490292262291e-07, + "loss": 1.8401, + "step": 1792 + }, + { + "epoch": 2.657026325546904, + "grad_norm": 7.28968904814557, + "learning_rate": 3.981122680507216e-07, + "loss": 1.7249, + "step": 1793 + }, + { + "epoch": 2.6585094549499444, + "grad_norm": 7.44246093085141, + "learning_rate": 3.947479255028763e-07, + "loss": 1.7627, + "step": 1794 + }, + { + "epoch": 2.6599925843529846, + "grad_norm": 7.821552314392608, + "learning_rate": 3.913972746211309e-07, + "loss": 1.9942, + "step": 1795 + }, + { + "epoch": 2.661475713756025, + "grad_norm": 7.643633752765681, + "learning_rate": 3.880603253671522e-07, + "loss": 1.9648, + "step": 1796 + }, + { + "epoch": 2.662958843159066, + "grad_norm": 8.035112279114058, + "learning_rate": 3.8473708766187336e-07, + "loss": 1.7696, + "step": 1797 + }, + { + "epoch": 2.664441972562106, + "grad_norm": 7.568057869763805, + "learning_rate": 3.8142757138546095e-07, + "loss": 1.7636, + "step": 1798 + }, + { + "epoch": 2.665925101965146, + "grad_norm": 9.705613514040698, + "learning_rate": 3.7813178637728767e-07, + "loss": 2.2163, + "step": 1799 + }, + { + "epoch": 2.667408231368187, + "grad_norm": 7.446582442694902, + "learning_rate": 3.7484974243590376e-07, + "loss": 1.9815, + "step": 1800 + }, + { + "epoch": 2.6688913607712275, + "grad_norm": 8.074154156042136, + "learning_rate": 3.7158144931900395e-07, + "loss": 1.8175, + "step": 1801 + }, + { + "epoch": 2.6703744901742676, + "grad_norm": 7.89392833749364, + "learning_rate": 3.683269167434023e-07, + "loss": 1.8019, + "step": 1802 + }, + { + "epoch": 2.6718576195773083, + "grad_norm": 7.329416717374921, + "learning_rate": 3.650861543850043e-07, + "loss": 1.8623, + "step": 1803 + }, + { + "epoch": 2.6733407489803485, + "grad_norm": 6.762024461403772, + "learning_rate": 3.618591718787695e-07, + "loss": 1.6821, + "step": 1804 + }, + { + "epoch": 2.674823878383389, + "grad_norm": 7.975042155992588, + "learning_rate": 3.586459788186958e-07, + "loss": 1.8451, + "step": 1805 + }, + { + "epoch": 2.6763070077864293, + "grad_norm": 7.81084172745627, + "learning_rate": 3.5544658475778317e-07, + "loss": 1.9374, + "step": 1806 + }, + { + "epoch": 2.67779013718947, + "grad_norm": 8.005837838163798, + "learning_rate": 3.5226099920800115e-07, + "loss": 1.5219, + "step": 1807 + }, + { + "epoch": 2.67927326659251, + "grad_norm": 7.263103592296555, + "learning_rate": 3.490892316402733e-07, + "loss": 1.8367, + "step": 1808 + }, + { + "epoch": 2.6807563959955507, + "grad_norm": 7.835944403893469, + "learning_rate": 3.4593129148443593e-07, + "loss": 1.7866, + "step": 1809 + }, + { + "epoch": 2.682239525398591, + "grad_norm": 8.05567656558798, + "learning_rate": 3.4278718812921777e-07, + "loss": 1.9383, + "step": 1810 + }, + { + "epoch": 2.6837226548016315, + "grad_norm": 7.769489882652294, + "learning_rate": 3.396569309222114e-07, + "loss": 1.7839, + "step": 1811 + }, + { + "epoch": 2.6852057842046717, + "grad_norm": 7.3940068144820765, + "learning_rate": 3.3654052916984036e-07, + "loss": 1.8311, + "step": 1812 + }, + { + "epoch": 2.6866889136077123, + "grad_norm": 8.066235384152996, + "learning_rate": 3.334379921373393e-07, + "loss": 1.7173, + "step": 1813 + }, + { + "epoch": 2.688172043010753, + "grad_norm": 7.39105711975659, + "learning_rate": 3.303493290487192e-07, + "loss": 1.8303, + "step": 1814 + }, + { + "epoch": 2.689655172413793, + "grad_norm": 7.038456425142032, + "learning_rate": 3.2727454908674404e-07, + "loss": 1.67, + "step": 1815 + }, + { + "epoch": 2.6911383018168333, + "grad_norm": 6.965685693788715, + "learning_rate": 3.2421366139290423e-07, + "loss": 1.7262, + "step": 1816 + }, + { + "epoch": 2.692621431219874, + "grad_norm": 7.159802188341476, + "learning_rate": 3.211666750673853e-07, + "loss": 1.6492, + "step": 1817 + }, + { + "epoch": 2.6941045606229146, + "grad_norm": 8.098254454233935, + "learning_rate": 3.181335991690443e-07, + "loss": 1.8727, + "step": 1818 + }, + { + "epoch": 2.6955876900259548, + "grad_norm": 7.22013667939144, + "learning_rate": 3.151144427153807e-07, + "loss": 1.8152, + "step": 1819 + }, + { + "epoch": 2.697070819428995, + "grad_norm": 8.152568002249124, + "learning_rate": 3.1210921468251163e-07, + "loss": 1.8334, + "step": 1820 + }, + { + "epoch": 2.6985539488320356, + "grad_norm": 7.0618514193127995, + "learning_rate": 3.091179240051462e-07, + "loss": 1.7895, + "step": 1821 + }, + { + "epoch": 2.700037078235076, + "grad_norm": 7.994496957635029, + "learning_rate": 3.061405795765532e-07, + "loss": 2.098, + "step": 1822 + }, + { + "epoch": 2.7015202076381164, + "grad_norm": 8.206412384760503, + "learning_rate": 3.0317719024854087e-07, + "loss": 1.7475, + "step": 1823 + }, + { + "epoch": 2.7030033370411566, + "grad_norm": 7.378805871636309, + "learning_rate": 3.002277648314278e-07, + "loss": 1.9064, + "step": 1824 + }, + { + "epoch": 2.704486466444197, + "grad_norm": 7.754087949500135, + "learning_rate": 2.9729231209401743e-07, + "loss": 1.7768, + "step": 1825 + }, + { + "epoch": 2.705969595847238, + "grad_norm": 7.125276082880713, + "learning_rate": 2.943708407635704e-07, + "loss": 1.5996, + "step": 1826 + }, + { + "epoch": 2.707452725250278, + "grad_norm": 7.602647033412835, + "learning_rate": 2.9146335952577987e-07, + "loss": 1.8, + "step": 1827 + }, + { + "epoch": 2.7089358546533187, + "grad_norm": 7.730499907001404, + "learning_rate": 2.8856987702474805e-07, + "loss": 1.641, + "step": 1828 + }, + { + "epoch": 2.710418984056359, + "grad_norm": 7.285616470207864, + "learning_rate": 2.8569040186295636e-07, + "loss": 1.844, + "step": 1829 + }, + { + "epoch": 2.7119021134593995, + "grad_norm": 7.259269297467355, + "learning_rate": 2.8282494260124137e-07, + "loss": 1.8612, + "step": 1830 + }, + { + "epoch": 2.7133852428624397, + "grad_norm": 7.015865499397023, + "learning_rate": 2.799735077587695e-07, + "loss": 1.8748, + "step": 1831 + }, + { + "epoch": 2.7148683722654803, + "grad_norm": 8.202325498416695, + "learning_rate": 2.771361058130129e-07, + "loss": 1.8466, + "step": 1832 + }, + { + "epoch": 2.7163515016685205, + "grad_norm": 7.874057708969928, + "learning_rate": 2.7431274519972294e-07, + "loss": 1.863, + "step": 1833 + }, + { + "epoch": 2.717834631071561, + "grad_norm": 7.589294010626396, + "learning_rate": 2.715034343129047e-07, + "loss": 1.6156, + "step": 1834 + }, + { + "epoch": 2.7193177604746013, + "grad_norm": 7.256035519401119, + "learning_rate": 2.687081815047926e-07, + "loss": 2.0376, + "step": 1835 + }, + { + "epoch": 2.720800889877642, + "grad_norm": 7.465501262974547, + "learning_rate": 2.659269950858273e-07, + "loss": 1.9792, + "step": 1836 + }, + { + "epoch": 2.722284019280682, + "grad_norm": 7.767144003175195, + "learning_rate": 2.6315988332462684e-07, + "loss": 1.5798, + "step": 1837 + }, + { + "epoch": 2.7237671486837227, + "grad_norm": 7.571745876807592, + "learning_rate": 2.6040685444796545e-07, + "loss": 1.7242, + "step": 1838 + }, + { + "epoch": 2.7252502780867633, + "grad_norm": 7.02933749584506, + "learning_rate": 2.576679166407503e-07, + "loss": 1.6621, + "step": 1839 + }, + { + "epoch": 2.7267334074898035, + "grad_norm": 7.6677263897268695, + "learning_rate": 2.549430780459905e-07, + "loss": 1.9586, + "step": 1840 + }, + { + "epoch": 2.7282165368928437, + "grad_norm": 8.08712979705576, + "learning_rate": 2.5223234676478193e-07, + "loss": 1.8241, + "step": 1841 + }, + { + "epoch": 2.7296996662958843, + "grad_norm": 7.616069806095328, + "learning_rate": 2.495357308562768e-07, + "loss": 1.6729, + "step": 1842 + }, + { + "epoch": 2.731182795698925, + "grad_norm": 7.825150192905846, + "learning_rate": 2.468532383376604e-07, + "loss": 1.766, + "step": 1843 + }, + { + "epoch": 2.732665925101965, + "grad_norm": 7.139404248123828, + "learning_rate": 2.4418487718413153e-07, + "loss": 1.8605, + "step": 1844 + }, + { + "epoch": 2.7341490545050053, + "grad_norm": 7.685525826635041, + "learning_rate": 2.4153065532887157e-07, + "loss": 1.7974, + "step": 1845 + }, + { + "epoch": 2.735632183908046, + "grad_norm": 7.505962620241136, + "learning_rate": 2.3889058066302873e-07, + "loss": 1.8743, + "step": 1846 + }, + { + "epoch": 2.7371153133110866, + "grad_norm": 7.034589769920824, + "learning_rate": 2.3626466103568835e-07, + "loss": 1.8621, + "step": 1847 + }, + { + "epoch": 2.738598442714127, + "grad_norm": 7.438892230722868, + "learning_rate": 2.3365290425385268e-07, + "loss": 1.9443, + "step": 1848 + }, + { + "epoch": 2.740081572117167, + "grad_norm": 7.204076147833582, + "learning_rate": 2.310553180824193e-07, + "loss": 1.8736, + "step": 1849 + }, + { + "epoch": 2.7415647015202076, + "grad_norm": 7.075200394608157, + "learning_rate": 2.2847191024415182e-07, + "loss": 1.6794, + "step": 1850 + }, + { + "epoch": 2.743047830923248, + "grad_norm": 7.539342207158532, + "learning_rate": 2.2590268841966357e-07, + "loss": 2.0125, + "step": 1851 + }, + { + "epoch": 2.7445309603262884, + "grad_norm": 7.937776631094876, + "learning_rate": 2.2334766024739218e-07, + "loss": 2.08, + "step": 1852 + }, + { + "epoch": 2.746014089729329, + "grad_norm": 7.5586269410383204, + "learning_rate": 2.2080683332357466e-07, + "loss": 1.7081, + "step": 1853 + }, + { + "epoch": 2.747497219132369, + "grad_norm": 7.502244038094291, + "learning_rate": 2.1828021520222952e-07, + "loss": 1.6528, + "step": 1854 + }, + { + "epoch": 2.74898034853541, + "grad_norm": 7.000556976668607, + "learning_rate": 2.157678133951302e-07, + "loss": 1.8207, + "step": 1855 + }, + { + "epoch": 2.75046347793845, + "grad_norm": 7.234048574111117, + "learning_rate": 2.132696353717839e-07, + "loss": 1.8383, + "step": 1856 + }, + { + "epoch": 2.7519466073414907, + "grad_norm": 7.904054242959459, + "learning_rate": 2.1078568855941118e-07, + "loss": 1.7658, + "step": 1857 + }, + { + "epoch": 2.753429736744531, + "grad_norm": 7.202028876649991, + "learning_rate": 2.0831598034292188e-07, + "loss": 1.8585, + "step": 1858 + }, + { + "epoch": 2.7549128661475715, + "grad_norm": 8.07004915595189, + "learning_rate": 2.058605180648926e-07, + "loss": 1.9972, + "step": 1859 + }, + { + "epoch": 2.7563959955506117, + "grad_norm": 7.966066346925783, + "learning_rate": 2.0341930902554764e-07, + "loss": 1.7189, + "step": 1860 + }, + { + "epoch": 2.7578791249536523, + "grad_norm": 7.66245614721197, + "learning_rate": 2.0099236048273407e-07, + "loss": 1.9362, + "step": 1861 + }, + { + "epoch": 2.7593622543566925, + "grad_norm": 8.155020790540759, + "learning_rate": 1.985796796519035e-07, + "loss": 1.7699, + "step": 1862 + }, + { + "epoch": 2.760845383759733, + "grad_norm": 7.802453459524455, + "learning_rate": 1.9618127370608752e-07, + "loss": 1.8257, + "step": 1863 + }, + { + "epoch": 2.7623285131627737, + "grad_norm": 7.842541100862704, + "learning_rate": 1.9379714977587782e-07, + "loss": 2.0067, + "step": 1864 + }, + { + "epoch": 2.763811642565814, + "grad_norm": 8.372596575925028, + "learning_rate": 1.9142731494940502e-07, + "loss": 1.6728, + "step": 1865 + }, + { + "epoch": 2.765294771968854, + "grad_norm": 7.826102805167631, + "learning_rate": 1.890717762723182e-07, + "loss": 1.9885, + "step": 1866 + }, + { + "epoch": 2.7667779013718947, + "grad_norm": 7.187059665898244, + "learning_rate": 1.8673054074776265e-07, + "loss": 1.8142, + "step": 1867 + }, + { + "epoch": 2.7682610307749353, + "grad_norm": 7.900667617517708, + "learning_rate": 1.844036153363582e-07, + "loss": 1.7587, + "step": 1868 + }, + { + "epoch": 2.7697441601779755, + "grad_norm": 8.120202059084054, + "learning_rate": 1.820910069561832e-07, + "loss": 1.6593, + "step": 1869 + }, + { + "epoch": 2.7712272895810157, + "grad_norm": 7.209947968610418, + "learning_rate": 1.7979272248274726e-07, + "loss": 1.7132, + "step": 1870 + }, + { + "epoch": 2.7727104189840563, + "grad_norm": 7.563685767587828, + "learning_rate": 1.7750876874897627e-07, + "loss": 1.7448, + "step": 1871 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 7.564989617669713, + "learning_rate": 1.7523915254519018e-07, + "loss": 1.888, + "step": 1872 + }, + { + "epoch": 2.775676677790137, + "grad_norm": 8.248025302135217, + "learning_rate": 1.7298388061907978e-07, + "loss": 2.0232, + "step": 1873 + }, + { + "epoch": 2.7771598071931773, + "grad_norm": 7.2377014975464755, + "learning_rate": 1.7074295967569432e-07, + "loss": 1.9467, + "step": 1874 + }, + { + "epoch": 2.778642936596218, + "grad_norm": 9.256387262536395, + "learning_rate": 1.6851639637741335e-07, + "loss": 1.7822, + "step": 1875 + }, + { + "epoch": 2.7801260659992586, + "grad_norm": 7.6899781592287, + "learning_rate": 1.6630419734393e-07, + "loss": 1.9532, + "step": 1876 + }, + { + "epoch": 2.781609195402299, + "grad_norm": 7.815148212967362, + "learning_rate": 1.6410636915223545e-07, + "loss": 1.6175, + "step": 1877 + }, + { + "epoch": 2.7830923248053394, + "grad_norm": 7.421546446607623, + "learning_rate": 1.6192291833659057e-07, + "loss": 1.7696, + "step": 1878 + }, + { + "epoch": 2.7845754542083796, + "grad_norm": 7.204185616018578, + "learning_rate": 1.597538513885144e-07, + "loss": 1.9113, + "step": 1879 + }, + { + "epoch": 2.7860585836114202, + "grad_norm": 7.9015077318849904, + "learning_rate": 1.5759917475676401e-07, + "loss": 1.8221, + "step": 1880 + }, + { + "epoch": 2.7875417130144604, + "grad_norm": 7.5621123391866965, + "learning_rate": 1.554588948473068e-07, + "loss": 1.7613, + "step": 1881 + }, + { + "epoch": 2.789024842417501, + "grad_norm": 7.498124196231563, + "learning_rate": 1.53333018023315e-07, + "loss": 1.7723, + "step": 1882 + }, + { + "epoch": 2.790507971820541, + "grad_norm": 7.6966115564381, + "learning_rate": 1.512215506051351e-07, + "loss": 1.9463, + "step": 1883 + }, + { + "epoch": 2.791991101223582, + "grad_norm": 7.87064262359213, + "learning_rate": 1.4912449887027557e-07, + "loss": 1.9851, + "step": 1884 + }, + { + "epoch": 2.793474230626622, + "grad_norm": 7.787417450956274, + "learning_rate": 1.4704186905338648e-07, + "loss": 1.7963, + "step": 1885 + }, + { + "epoch": 2.7949573600296627, + "grad_norm": 7.686304593452225, + "learning_rate": 1.4497366734623874e-07, + "loss": 1.8767, + "step": 1886 + }, + { + "epoch": 2.796440489432703, + "grad_norm": 8.170512500949084, + "learning_rate": 1.4291989989770994e-07, + "loss": 1.9657, + "step": 1887 + }, + { + "epoch": 2.7979236188357435, + "grad_norm": 7.061042027839807, + "learning_rate": 1.4088057281376243e-07, + "loss": 1.7373, + "step": 1888 + }, + { + "epoch": 2.799406748238784, + "grad_norm": 7.8183830586395855, + "learning_rate": 1.3885569215742746e-07, + "loss": 1.7135, + "step": 1889 + }, + { + "epoch": 2.8008898776418243, + "grad_norm": 7.227332645793767, + "learning_rate": 1.3684526394878607e-07, + "loss": 1.7418, + "step": 1890 + }, + { + "epoch": 2.8023730070448645, + "grad_norm": 7.491482863797644, + "learning_rate": 1.3484929416495096e-07, + "loss": 1.8151, + "step": 1891 + }, + { + "epoch": 2.803856136447905, + "grad_norm": 7.495541292622709, + "learning_rate": 1.3286778874004924e-07, + "loss": 1.9468, + "step": 1892 + }, + { + "epoch": 2.8053392658509457, + "grad_norm": 7.873993656007427, + "learning_rate": 1.3090075356520515e-07, + "loss": 1.7891, + "step": 1893 + }, + { + "epoch": 2.806822395253986, + "grad_norm": 7.352312458864552, + "learning_rate": 1.2894819448852126e-07, + "loss": 1.9105, + "step": 1894 + }, + { + "epoch": 2.808305524657026, + "grad_norm": 8.048968452760224, + "learning_rate": 1.2701011731506285e-07, + "loss": 1.6548, + "step": 1895 + }, + { + "epoch": 2.8097886540600667, + "grad_norm": 7.62940702072218, + "learning_rate": 1.2508652780683916e-07, + "loss": 1.8156, + "step": 1896 + }, + { + "epoch": 2.8112717834631074, + "grad_norm": 7.6189263216982805, + "learning_rate": 1.2317743168278605e-07, + "loss": 1.7529, + "step": 1897 + }, + { + "epoch": 2.8127549128661475, + "grad_norm": 6.942853332675943, + "learning_rate": 1.212828346187528e-07, + "loss": 1.7228, + "step": 1898 + }, + { + "epoch": 2.8142380422691877, + "grad_norm": 7.585024742618108, + "learning_rate": 1.1940274224747806e-07, + "loss": 1.9149, + "step": 1899 + }, + { + "epoch": 2.8157211716722284, + "grad_norm": 8.56093868042262, + "learning_rate": 1.1753716015858008e-07, + "loss": 1.8231, + "step": 1900 + }, + { + "epoch": 2.817204301075269, + "grad_norm": 7.227704684245402, + "learning_rate": 1.1568609389853546e-07, + "loss": 1.7358, + "step": 1901 + }, + { + "epoch": 2.818687430478309, + "grad_norm": 7.11256165504275, + "learning_rate": 1.1384954897066702e-07, + "loss": 1.9125, + "step": 1902 + }, + { + "epoch": 2.82017055988135, + "grad_norm": 7.718318808641808, + "learning_rate": 1.1202753083512152e-07, + "loss": 1.7628, + "step": 1903 + }, + { + "epoch": 2.82165368928439, + "grad_norm": 7.908690460855367, + "learning_rate": 1.1022004490885974e-07, + "loss": 1.9716, + "step": 1904 + }, + { + "epoch": 2.8231368186874306, + "grad_norm": 7.568856204442426, + "learning_rate": 1.0842709656563533e-07, + "loss": 1.7783, + "step": 1905 + }, + { + "epoch": 2.824619948090471, + "grad_norm": 7.709341352964169, + "learning_rate": 1.0664869113598097e-07, + "loss": 1.7604, + "step": 1906 + }, + { + "epoch": 2.8261030774935114, + "grad_norm": 7.925993796085504, + "learning_rate": 1.0488483390719506e-07, + "loss": 1.7588, + "step": 1907 + }, + { + "epoch": 2.8275862068965516, + "grad_norm": 7.465447745034687, + "learning_rate": 1.0313553012332057e-07, + "loss": 1.7787, + "step": 1908 + }, + { + "epoch": 2.8290693362995922, + "grad_norm": 7.47809552427694, + "learning_rate": 1.0140078498513283e-07, + "loss": 1.8447, + "step": 1909 + }, + { + "epoch": 2.8305524657026324, + "grad_norm": 7.4237312432204385, + "learning_rate": 9.968060365012466e-08, + "loss": 1.6453, + "step": 1910 + }, + { + "epoch": 2.832035595105673, + "grad_norm": 7.221737951797601, + "learning_rate": 9.7974991232489e-08, + "loss": 1.7288, + "step": 1911 + }, + { + "epoch": 2.8335187245087132, + "grad_norm": 6.94025681924053, + "learning_rate": 9.628395280310398e-08, + "loss": 1.6444, + "step": 1912 + }, + { + "epoch": 2.835001853911754, + "grad_norm": 7.410226149748048, + "learning_rate": 9.46074933895208e-08, + "loss": 1.9055, + "step": 1913 + }, + { + "epoch": 2.8364849833147945, + "grad_norm": 7.414854551549083, + "learning_rate": 9.29456179759436e-08, + "loss": 1.8171, + "step": 1914 + }, + { + "epoch": 2.8379681127178347, + "grad_norm": 6.847339378476776, + "learning_rate": 9.129833150321955e-08, + "loss": 1.7431, + "step": 1915 + }, + { + "epoch": 2.839451242120875, + "grad_norm": 8.06457375636133, + "learning_rate": 8.966563886882107e-08, + "loss": 1.7467, + "step": 1916 + }, + { + "epoch": 2.8409343715239155, + "grad_norm": 7.351828665846771, + "learning_rate": 8.804754492683254e-08, + "loss": 1.876, + "step": 1917 + }, + { + "epoch": 2.842417500926956, + "grad_norm": 8.382781642040039, + "learning_rate": 8.644405448793636e-08, + "loss": 1.8675, + "step": 1918 + }, + { + "epoch": 2.8439006303299963, + "grad_norm": 7.206320573282631, + "learning_rate": 8.485517231939577e-08, + "loss": 1.815, + "step": 1919 + }, + { + "epoch": 2.8453837597330365, + "grad_norm": 8.298139638137458, + "learning_rate": 8.328090314504544e-08, + "loss": 1.945, + "step": 1920 + }, + { + "epoch": 2.846866889136077, + "grad_norm": 7.581991647652846, + "learning_rate": 8.172125164527312e-08, + "loss": 1.9445, + "step": 1921 + }, + { + "epoch": 2.8483500185391177, + "grad_norm": 8.155258938110869, + "learning_rate": 8.017622245700851e-08, + "loss": 1.728, + "step": 1922 + }, + { + "epoch": 2.849833147942158, + "grad_norm": 7.511251045074836, + "learning_rate": 7.864582017370725e-08, + "loss": 1.7455, + "step": 1923 + }, + { + "epoch": 2.851316277345198, + "grad_norm": 7.597583408898764, + "learning_rate": 7.713004934533974e-08, + "loss": 1.9524, + "step": 1924 + }, + { + "epoch": 2.8527994067482387, + "grad_norm": 7.31085055741953, + "learning_rate": 7.562891447837451e-08, + "loss": 1.846, + "step": 1925 + }, + { + "epoch": 2.8542825361512794, + "grad_norm": 7.647573940209493, + "learning_rate": 7.414242003576876e-08, + "loss": 1.7007, + "step": 1926 + }, + { + "epoch": 2.8557656655543195, + "grad_norm": 7.263350010161005, + "learning_rate": 7.267057043695014e-08, + "loss": 1.8343, + "step": 1927 + }, + { + "epoch": 2.85724879495736, + "grad_norm": 8.416359994358753, + "learning_rate": 7.121337005780937e-08, + "loss": 1.9053, + "step": 1928 + }, + { + "epoch": 2.8587319243604004, + "grad_norm": 8.24777358009068, + "learning_rate": 6.977082323068207e-08, + "loss": 1.7701, + "step": 1929 + }, + { + "epoch": 2.860215053763441, + "grad_norm": 6.970554039231589, + "learning_rate": 6.834293424433869e-08, + "loss": 1.7819, + "step": 1930 + }, + { + "epoch": 2.861698183166481, + "grad_norm": 7.657000263998588, + "learning_rate": 6.692970734397176e-08, + "loss": 1.9331, + "step": 1931 + }, + { + "epoch": 2.863181312569522, + "grad_norm": 7.110204471469501, + "learning_rate": 6.5531146731182e-08, + "loss": 1.6818, + "step": 1932 + }, + { + "epoch": 2.864664441972562, + "grad_norm": 8.193585227250194, + "learning_rate": 6.414725656396614e-08, + "loss": 1.7493, + "step": 1933 + }, + { + "epoch": 2.8661475713756026, + "grad_norm": 7.991523164099557, + "learning_rate": 6.277804095670526e-08, + "loss": 2.0307, + "step": 1934 + }, + { + "epoch": 2.867630700778643, + "grad_norm": 7.744085069214279, + "learning_rate": 6.142350398015306e-08, + "loss": 1.8346, + "step": 1935 + }, + { + "epoch": 2.8691138301816834, + "grad_norm": 7.735445658442029, + "learning_rate": 6.0083649661421e-08, + "loss": 1.7789, + "step": 1936 + }, + { + "epoch": 2.8705969595847236, + "grad_norm": 7.123017119268093, + "learning_rate": 5.875848198396927e-08, + "loss": 1.6579, + "step": 1937 + }, + { + "epoch": 2.8720800889877642, + "grad_norm": 7.662506222477719, + "learning_rate": 5.7448004887594166e-08, + "loss": 1.8558, + "step": 1938 + }, + { + "epoch": 2.873563218390805, + "grad_norm": 6.631615327198283, + "learning_rate": 5.615222226841632e-08, + "loss": 1.8453, + "step": 1939 + }, + { + "epoch": 2.875046347793845, + "grad_norm": 8.097463766013412, + "learning_rate": 5.48711379788669e-08, + "loss": 1.8389, + "step": 1940 + }, + { + "epoch": 2.8765294771968852, + "grad_norm": 7.521809847830521, + "learning_rate": 5.360475582768088e-08, + "loss": 1.8209, + "step": 1941 + }, + { + "epoch": 2.878012606599926, + "grad_norm": 7.744831167146529, + "learning_rate": 5.2353079579879895e-08, + "loss": 1.7404, + "step": 1942 + }, + { + "epoch": 2.8794957360029665, + "grad_norm": 8.099174544369992, + "learning_rate": 5.1116112956767775e-08, + "loss": 1.8459, + "step": 1943 + }, + { + "epoch": 2.8809788654060067, + "grad_norm": 7.211583413004427, + "learning_rate": 4.989385963591275e-08, + "loss": 1.8115, + "step": 1944 + }, + { + "epoch": 2.882461994809047, + "grad_norm": 7.857593403444777, + "learning_rate": 4.8686323251140865e-08, + "loss": 1.982, + "step": 1945 + }, + { + "epoch": 2.8839451242120875, + "grad_norm": 7.7468632183173955, + "learning_rate": 4.7493507392524226e-08, + "loss": 1.8296, + "step": 1946 + }, + { + "epoch": 2.885428253615128, + "grad_norm": 7.2816412071458085, + "learning_rate": 4.6315415606368875e-08, + "loss": 1.8832, + "step": 1947 + }, + { + "epoch": 2.8869113830181683, + "grad_norm": 8.18230104443639, + "learning_rate": 4.5152051395206395e-08, + "loss": 1.8266, + "step": 1948 + }, + { + "epoch": 2.8883945124212085, + "grad_norm": 7.887252636587693, + "learning_rate": 4.400341821778231e-08, + "loss": 1.9316, + "step": 1949 + }, + { + "epoch": 2.889877641824249, + "grad_norm": 8.3593890989353, + "learning_rate": 4.28695194890455e-08, + "loss": 1.8575, + "step": 1950 + }, + { + "epoch": 2.8913607712272897, + "grad_norm": 7.620175218744027, + "learning_rate": 4.175035858013987e-08, + "loss": 1.7466, + "step": 1951 + }, + { + "epoch": 2.89284390063033, + "grad_norm": 7.271579683415527, + "learning_rate": 4.064593881839052e-08, + "loss": 1.7785, + "step": 1952 + }, + { + "epoch": 2.8943270300333706, + "grad_norm": 7.836975200376659, + "learning_rate": 3.955626348729813e-08, + "loss": 1.8926, + "step": 1953 + }, + { + "epoch": 2.8958101594364107, + "grad_norm": 7.320518357035278, + "learning_rate": 3.8481335826528467e-08, + "loss": 1.8312, + "step": 1954 + }, + { + "epoch": 2.8972932888394514, + "grad_norm": 7.5530261568419474, + "learning_rate": 3.742115903189791e-08, + "loss": 1.6722, + "step": 1955 + }, + { + "epoch": 2.8987764182424915, + "grad_norm": 7.665086951914787, + "learning_rate": 3.637573625537183e-08, + "loss": 1.9643, + "step": 1956 + }, + { + "epoch": 2.900259547645532, + "grad_norm": 8.232109547654321, + "learning_rate": 3.534507060504844e-08, + "loss": 1.8344, + "step": 1957 + }, + { + "epoch": 2.9017426770485724, + "grad_norm": 7.443510427322418, + "learning_rate": 3.432916514515272e-08, + "loss": 1.9416, + "step": 1958 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 7.892859093404537, + "learning_rate": 3.33280228960281e-08, + "loss": 1.853, + "step": 1959 + }, + { + "epoch": 2.904708935854653, + "grad_norm": 7.5802738417652895, + "learning_rate": 3.234164683412477e-08, + "loss": 1.8243, + "step": 1960 + }, + { + "epoch": 2.906192065257694, + "grad_norm": 7.596236144508018, + "learning_rate": 3.13700398919925e-08, + "loss": 1.8637, + "step": 1961 + }, + { + "epoch": 2.907675194660734, + "grad_norm": 8.183452395046466, + "learning_rate": 3.0413204958271714e-08, + "loss": 1.7155, + "step": 1962 + }, + { + "epoch": 2.9091583240637746, + "grad_norm": 7.257978379233838, + "learning_rate": 2.947114487768521e-08, + "loss": 1.8484, + "step": 1963 + }, + { + "epoch": 2.9106414534668152, + "grad_norm": 7.88353359367157, + "learning_rate": 2.8543862451029248e-08, + "loss": 1.6617, + "step": 1964 + }, + { + "epoch": 2.9121245828698554, + "grad_norm": 6.990105881390767, + "learning_rate": 2.763136043516468e-08, + "loss": 1.7189, + "step": 1965 + }, + { + "epoch": 2.9136077122728956, + "grad_norm": 7.710213879189851, + "learning_rate": 2.673364154301028e-08, + "loss": 1.6848, + "step": 1966 + }, + { + "epoch": 2.9150908416759362, + "grad_norm": 7.300001298243483, + "learning_rate": 2.5850708443533322e-08, + "loss": 1.7734, + "step": 1967 + }, + { + "epoch": 2.916573971078977, + "grad_norm": 6.434365772708407, + "learning_rate": 2.498256376174235e-08, + "loss": 1.6618, + "step": 1968 + }, + { + "epoch": 2.918057100482017, + "grad_norm": 7.9437716818775685, + "learning_rate": 2.41292100786783e-08, + "loss": 1.8971, + "step": 1969 + }, + { + "epoch": 2.9195402298850572, + "grad_norm": 7.4818197712388415, + "learning_rate": 2.3290649931408947e-08, + "loss": 1.897, + "step": 1970 + }, + { + "epoch": 2.921023359288098, + "grad_norm": 7.242060056901663, + "learning_rate": 2.2466885813018925e-08, + "loss": 1.8547, + "step": 1971 + }, + { + "epoch": 2.9225064886911385, + "grad_norm": 7.923704418744815, + "learning_rate": 2.165792017260415e-08, + "loss": 1.7943, + "step": 1972 + }, + { + "epoch": 2.9239896180941787, + "grad_norm": 8.168784353522208, + "learning_rate": 2.0863755415264085e-08, + "loss": 1.6425, + "step": 1973 + }, + { + "epoch": 2.925472747497219, + "grad_norm": 8.70747884607125, + "learning_rate": 2.0084393902093936e-08, + "loss": 1.743, + "step": 1974 + }, + { + "epoch": 2.9269558769002595, + "grad_norm": 7.546908443945241, + "learning_rate": 1.931983795017689e-08, + "loss": 1.6411, + "step": 1975 + }, + { + "epoch": 2.9284390063033, + "grad_norm": 7.468924335389524, + "learning_rate": 1.857008983258135e-08, + "loss": 1.8199, + "step": 1976 + }, + { + "epoch": 2.9299221357063403, + "grad_norm": 7.543903821025496, + "learning_rate": 1.783515177834816e-08, + "loss": 1.7796, + "step": 1977 + }, + { + "epoch": 2.931405265109381, + "grad_norm": 7.872685320759852, + "learning_rate": 1.711502597248893e-08, + "loss": 1.9179, + "step": 1978 + }, + { + "epoch": 2.932888394512421, + "grad_norm": 7.933307665386082, + "learning_rate": 1.640971455597662e-08, + "loss": 1.7992, + "step": 1979 + }, + { + "epoch": 2.9343715239154617, + "grad_norm": 7.304948650657707, + "learning_rate": 1.571921962574052e-08, + "loss": 1.5831, + "step": 1980 + }, + { + "epoch": 2.935854653318502, + "grad_norm": 7.950861808179716, + "learning_rate": 1.504354323466073e-08, + "loss": 1.7974, + "step": 1981 + }, + { + "epoch": 2.9373377827215426, + "grad_norm": 7.430153178260773, + "learning_rate": 1.4382687391559236e-08, + "loss": 1.9368, + "step": 1982 + }, + { + "epoch": 2.9388209121245827, + "grad_norm": 7.8128391474098375, + "learning_rate": 1.373665406119662e-08, + "loss": 1.7874, + "step": 1983 + }, + { + "epoch": 2.9403040415276234, + "grad_norm": 7.862809245070661, + "learning_rate": 1.3105445164265374e-08, + "loss": 1.7658, + "step": 1984 + }, + { + "epoch": 2.9417871709306636, + "grad_norm": 7.076534733591031, + "learning_rate": 1.248906257738436e-08, + "loss": 1.7272, + "step": 1985 + }, + { + "epoch": 2.943270300333704, + "grad_norm": 7.309886890041798, + "learning_rate": 1.188750813309214e-08, + "loss": 1.8973, + "step": 1986 + }, + { + "epoch": 2.9447534297367444, + "grad_norm": 7.7127986894705725, + "learning_rate": 1.1300783619844214e-08, + "loss": 1.8818, + "step": 1987 + }, + { + "epoch": 2.946236559139785, + "grad_norm": 7.805337964124957, + "learning_rate": 1.0728890782002455e-08, + "loss": 1.7909, + "step": 1988 + }, + { + "epoch": 2.9477196885428256, + "grad_norm": 7.083194339805901, + "learning_rate": 1.0171831319837344e-08, + "loss": 1.7423, + "step": 1989 + }, + { + "epoch": 2.949202817945866, + "grad_norm": 7.824496508690463, + "learning_rate": 9.629606889516862e-09, + "loss": 1.8622, + "step": 1990 + }, + { + "epoch": 2.950685947348906, + "grad_norm": 8.226796444552352, + "learning_rate": 9.102219103103161e-09, + "loss": 2.0223, + "step": 1991 + }, + { + "epoch": 2.9521690767519466, + "grad_norm": 7.037718714308484, + "learning_rate": 8.589669528549227e-09, + "loss": 1.7657, + "step": 1992 + }, + { + "epoch": 2.9536522061549872, + "grad_norm": 7.589000290533016, + "learning_rate": 8.09195968969334e-09, + "loss": 1.8286, + "step": 1993 + }, + { + "epoch": 2.9551353355580274, + "grad_norm": 8.013770073495053, + "learning_rate": 7.609091066253516e-09, + "loss": 1.8628, + "step": 1994 + }, + { + "epoch": 2.9566184649610676, + "grad_norm": 7.794697778999841, + "learning_rate": 7.141065093824729e-09, + "loss": 1.8434, + "step": 1995 + }, + { + "epoch": 2.9581015943641082, + "grad_norm": 7.729837166213549, + "learning_rate": 6.687883163873921e-09, + "loss": 1.849, + "step": 1996 + }, + { + "epoch": 2.959584723767149, + "grad_norm": 8.134979121778606, + "learning_rate": 6.249546623735558e-09, + "loss": 1.9589, + "step": 1997 + }, + { + "epoch": 2.961067853170189, + "grad_norm": 8.285981294607476, + "learning_rate": 5.826056776608857e-09, + "loss": 1.7176, + "step": 1998 + }, + { + "epoch": 2.9625509825732292, + "grad_norm": 7.102393874491702, + "learning_rate": 5.41741488155112e-09, + "loss": 1.7811, + "step": 1999 + }, + { + "epoch": 2.96403411197627, + "grad_norm": 7.5726830741222315, + "learning_rate": 5.0236221534777365e-09, + "loss": 1.8315, + "step": 2000 + }, + { + "epoch": 2.9655172413793105, + "grad_norm": 7.768365513939915, + "learning_rate": 4.644679763155524e-09, + "loss": 1.9292, + "step": 2001 + }, + { + "epoch": 2.9670003707823507, + "grad_norm": 7.162400642719783, + "learning_rate": 4.280588837201616e-09, + "loss": 1.5983, + "step": 2002 + }, + { + "epoch": 2.9684835001853913, + "grad_norm": 7.174793370657929, + "learning_rate": 3.931350458077354e-09, + "loss": 1.7614, + "step": 2003 + }, + { + "epoch": 2.9699666295884315, + "grad_norm": 8.011554121470514, + "learning_rate": 3.596965664088847e-09, + "loss": 2.0045, + "step": 2004 + }, + { + "epoch": 2.971449758991472, + "grad_norm": 7.339124420429306, + "learning_rate": 3.277435449378641e-09, + "loss": 1.6385, + "step": 2005 + }, + { + "epoch": 2.9729328883945123, + "grad_norm": 7.31543807143947, + "learning_rate": 2.97276076392905e-09, + "loss": 1.6295, + "step": 2006 + }, + { + "epoch": 2.974416017797553, + "grad_norm": 6.526415074823961, + "learning_rate": 2.6829425135538323e-09, + "loss": 1.7676, + "step": 2007 + }, + { + "epoch": 2.975899147200593, + "grad_norm": 7.682677960993287, + "learning_rate": 2.407981559898187e-09, + "loss": 1.6973, + "step": 2008 + }, + { + "epoch": 2.9773822766036337, + "grad_norm": 7.895837359604418, + "learning_rate": 2.147878720437091e-09, + "loss": 1.76, + "step": 2009 + }, + { + "epoch": 2.978865406006674, + "grad_norm": 7.993814869286936, + "learning_rate": 1.9026347684697465e-09, + "loss": 1.8637, + "step": 2010 + }, + { + "epoch": 2.9803485354097146, + "grad_norm": 7.716265136729132, + "learning_rate": 1.6722504331195822e-09, + "loss": 1.7135, + "step": 2011 + }, + { + "epoch": 2.9818316648127547, + "grad_norm": 7.280682405460439, + "learning_rate": 1.4567263993325865e-09, + "loss": 1.874, + "step": 2012 + }, + { + "epoch": 2.9833147942157954, + "grad_norm": 7.181679006286015, + "learning_rate": 1.256063307872868e-09, + "loss": 1.7183, + "step": 2013 + }, + { + "epoch": 2.984797923618836, + "grad_norm": 6.959494611038903, + "learning_rate": 1.0702617553226547e-09, + "loss": 1.9374, + "step": 2014 + }, + { + "epoch": 2.986281053021876, + "grad_norm": 8.237382520815377, + "learning_rate": 8.99322294081184e-10, + "loss": 1.8754, + "step": 2015 + }, + { + "epoch": 2.9877641824249164, + "grad_norm": 7.611341104716217, + "learning_rate": 7.432454323597071e-10, + "loss": 1.927, + "step": 2016 + }, + { + "epoch": 2.989247311827957, + "grad_norm": 7.788688826653011, + "learning_rate": 6.020316341837085e-10, + "loss": 1.7551, + "step": 2017 + }, + { + "epoch": 2.9907304412309976, + "grad_norm": 8.015346314285019, + "learning_rate": 4.756813193890209e-10, + "loss": 1.8131, + "step": 2018 + }, + { + "epoch": 2.992213570634038, + "grad_norm": 8.032186652677478, + "learning_rate": 3.6419486362293533e-10, + "loss": 1.7488, + "step": 2019 + }, + { + "epoch": 2.993696700037078, + "grad_norm": 6.975345470178313, + "learning_rate": 2.6757259834031544e-10, + "loss": 1.7815, + "step": 2020 + }, + { + "epoch": 2.9951798294401186, + "grad_norm": 7.943415437394898, + "learning_rate": 1.8581481080415242e-10, + "loss": 1.9314, + "step": 2021 + }, + { + "epoch": 2.9966629588431593, + "grad_norm": 7.63286849536286, + "learning_rate": 1.18921744085565e-10, + "loss": 1.8379, + "step": 2022 + }, + { + "epoch": 2.9981460882461994, + "grad_norm": 8.152463708910538, + "learning_rate": 6.689359706046894e-11, + "loss": 1.9285, + "step": 2023 + }, + { + "epoch": 2.9996292176492396, + "grad_norm": 7.895575669712671, + "learning_rate": 2.9730524411797356e-11, + "loss": 1.821, + "step": 2024 + }, + { + "epoch": 3.0, + "grad_norm": 7.895575669712671, + "learning_rate": 7.432636627280332e-12, + "loss": 2.3473, + "step": 2025 + }, + { + "epoch": 3.0, + "step": 2025, + "total_flos": 7457904663552.0, + "train_loss": 2.6562096971935696, + "train_runtime": 8576.6517, + "train_samples_per_second": 15.094, + "train_steps_per_second": 0.236 + } + ], + "logging_steps": 1, + "max_steps": 2025, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7457904663552.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}