{ "best_metric": 2.2021772861480713, "best_model_checkpoint": "final_models/laft_lug_mpt/checkpoint-4530", "epoch": 6.0, "eval_steps": 500, "global_step": 9060, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006622516556291391, "grad_norm": 25.366834045369046, "learning_rate": 1.5e-07, "loss": 4.8438, "step": 1 }, { "epoch": 0.0013245033112582781, "grad_norm": 24.103572996230607, "learning_rate": 3e-07, "loss": 4.75, "step": 2 }, { "epoch": 0.001986754966887417, "grad_norm": 31.403042933752992, "learning_rate": 4.5e-07, "loss": 4.7812, "step": 3 }, { "epoch": 0.0026490066225165563, "grad_norm": 28.349556331713593, "learning_rate": 6e-07, "loss": 4.8438, "step": 4 }, { "epoch": 0.0033112582781456954, "grad_norm": 47.522014401524785, "learning_rate": 7.499999999999999e-07, "loss": 4.9062, "step": 5 }, { "epoch": 0.003973509933774834, "grad_norm": 33.42440788025143, "learning_rate": 9e-07, "loss": 4.875, "step": 6 }, { "epoch": 0.004635761589403974, "grad_norm": 37.37771966002289, "learning_rate": 1.05e-06, "loss": 4.75, "step": 7 }, { "epoch": 0.005298013245033113, "grad_norm": 28.49686603636974, "learning_rate": 1.2e-06, "loss": 4.625, "step": 8 }, { "epoch": 0.005960264900662252, "grad_norm": 38.58555547468358, "learning_rate": 1.3499999999999998e-06, "loss": 4.625, "step": 9 }, { "epoch": 0.006622516556291391, "grad_norm": 15.5788294149647, "learning_rate": 1.4999999999999998e-06, "loss": 4.3125, "step": 10 }, { "epoch": 0.00728476821192053, "grad_norm": 13.539642570093081, "learning_rate": 1.6499999999999997e-06, "loss": 4.2188, "step": 11 }, { "epoch": 0.007947019867549669, "grad_norm": 16.45997685007064, "learning_rate": 1.8e-06, "loss": 4.125, "step": 12 }, { "epoch": 0.008609271523178808, "grad_norm": 10.338870374620004, "learning_rate": 1.9499999999999995e-06, "loss": 4.0312, "step": 13 }, { "epoch": 0.009271523178807948, "grad_norm": 12.531078700007233, "learning_rate": 2.1e-06, "loss": 4.1562, "step": 14 }, { "epoch": 0.009933774834437087, "grad_norm": 12.085342249480263, "learning_rate": 2.2499999999999996e-06, "loss": 4.3125, "step": 15 }, { "epoch": 0.010596026490066225, "grad_norm": 15.697423559496999, "learning_rate": 2.4e-06, "loss": 4.25, "step": 16 }, { "epoch": 0.011258278145695364, "grad_norm": 11.03089343361638, "learning_rate": 2.55e-06, "loss": 4.0625, "step": 17 }, { "epoch": 0.011920529801324504, "grad_norm": 11.783898396824728, "learning_rate": 2.6999999999999996e-06, "loss": 4.1562, "step": 18 }, { "epoch": 0.012582781456953643, "grad_norm": 14.035038701029494, "learning_rate": 2.85e-06, "loss": 3.875, "step": 19 }, { "epoch": 0.013245033112582781, "grad_norm": 181.286768740991, "learning_rate": 2.9999999999999997e-06, "loss": 4.1562, "step": 20 }, { "epoch": 0.01390728476821192, "grad_norm": 28.84619958472304, "learning_rate": 3.15e-06, "loss": 4.0625, "step": 21 }, { "epoch": 0.01456953642384106, "grad_norm": 9.485673104253173, "learning_rate": 3.2999999999999993e-06, "loss": 4.0625, "step": 22 }, { "epoch": 0.015231788079470199, "grad_norm": 9.570221121660918, "learning_rate": 3.4499999999999996e-06, "loss": 3.9688, "step": 23 }, { "epoch": 0.015894039735099338, "grad_norm": 9.685835428032023, "learning_rate": 3.6e-06, "loss": 4.0, "step": 24 }, { "epoch": 0.016556291390728478, "grad_norm": 8.965141745792456, "learning_rate": 3.7499999999999997e-06, "loss": 4.0625, "step": 25 }, { "epoch": 0.017218543046357615, "grad_norm": 8.443756038943226, "learning_rate": 3.899999999999999e-06, "loss": 4.0312, "step": 26 }, { "epoch": 0.017880794701986755, "grad_norm": 9.00900750414927, "learning_rate": 4.049999999999999e-06, "loss": 3.9531, "step": 27 }, { "epoch": 0.018543046357615896, "grad_norm": 21.43724193727691, "learning_rate": 4.2e-06, "loss": 3.9375, "step": 28 }, { "epoch": 0.019205298013245033, "grad_norm": 8.00759282451024, "learning_rate": 4.35e-06, "loss": 4.0625, "step": 29 }, { "epoch": 0.019867549668874173, "grad_norm": 8.335236533716525, "learning_rate": 4.499999999999999e-06, "loss": 4.0312, "step": 30 }, { "epoch": 0.02052980132450331, "grad_norm": 8.523181382739859, "learning_rate": 4.6499999999999995e-06, "loss": 3.7344, "step": 31 }, { "epoch": 0.02119205298013245, "grad_norm": 7.335662695749709, "learning_rate": 4.8e-06, "loss": 3.9531, "step": 32 }, { "epoch": 0.02185430463576159, "grad_norm": 14.642056640956355, "learning_rate": 4.95e-06, "loss": 3.875, "step": 33 }, { "epoch": 0.022516556291390728, "grad_norm": 7.226751530548763, "learning_rate": 5.1e-06, "loss": 3.625, "step": 34 }, { "epoch": 0.023178807947019868, "grad_norm": 6.931934901699227, "learning_rate": 5.25e-06, "loss": 3.8125, "step": 35 }, { "epoch": 0.02384105960264901, "grad_norm": 6.961482163524123, "learning_rate": 5.399999999999999e-06, "loss": 3.7969, "step": 36 }, { "epoch": 0.024503311258278145, "grad_norm": 7.166952548626372, "learning_rate": 5.549999999999999e-06, "loss": 3.7344, "step": 37 }, { "epoch": 0.025165562913907286, "grad_norm": 7.5011039990677855, "learning_rate": 5.7e-06, "loss": 3.8594, "step": 38 }, { "epoch": 0.025827814569536423, "grad_norm": 6.979537062109198, "learning_rate": 5.85e-06, "loss": 3.625, "step": 39 }, { "epoch": 0.026490066225165563, "grad_norm": 7.501463225997184, "learning_rate": 5.999999999999999e-06, "loss": 3.7969, "step": 40 }, { "epoch": 0.027152317880794703, "grad_norm": 7.128116057728067, "learning_rate": 6.1499999999999996e-06, "loss": 3.875, "step": 41 }, { "epoch": 0.02781456953642384, "grad_norm": 7.450186004966472, "learning_rate": 6.3e-06, "loss": 3.3906, "step": 42 }, { "epoch": 0.02847682119205298, "grad_norm": 7.262599008700824, "learning_rate": 6.449999999999999e-06, "loss": 3.7344, "step": 43 }, { "epoch": 0.02913907284768212, "grad_norm": 7.157354371834633, "learning_rate": 6.599999999999999e-06, "loss": 3.5469, "step": 44 }, { "epoch": 0.029801324503311258, "grad_norm": 6.529833935351724, "learning_rate": 6.749999999999999e-06, "loss": 3.3594, "step": 45 }, { "epoch": 0.030463576158940398, "grad_norm": 7.9320289419469185, "learning_rate": 6.899999999999999e-06, "loss": 3.8594, "step": 46 }, { "epoch": 0.031125827814569535, "grad_norm": 7.060377804922717, "learning_rate": 7.049999999999999e-06, "loss": 3.625, "step": 47 }, { "epoch": 0.031788079470198675, "grad_norm": 6.891224904152876, "learning_rate": 7.2e-06, "loss": 3.7656, "step": 48 }, { "epoch": 0.03245033112582781, "grad_norm": 9.294009042390158, "learning_rate": 7.35e-06, "loss": 3.7344, "step": 49 }, { "epoch": 0.033112582781456956, "grad_norm": 7.084400003040845, "learning_rate": 7.499999999999999e-06, "loss": 3.625, "step": 50 }, { "epoch": 0.03377483443708609, "grad_norm": 6.4800219931444945, "learning_rate": 7.65e-06, "loss": 3.6562, "step": 51 }, { "epoch": 0.03443708609271523, "grad_norm": 7.035922127455148, "learning_rate": 7.799999999999998e-06, "loss": 3.7656, "step": 52 }, { "epoch": 0.035099337748344374, "grad_norm": 6.278399860408964, "learning_rate": 7.949999999999998e-06, "loss": 3.2031, "step": 53 }, { "epoch": 0.03576158940397351, "grad_norm": 6.355358791009081, "learning_rate": 8.099999999999999e-06, "loss": 3.4844, "step": 54 }, { "epoch": 0.03642384105960265, "grad_norm": 7.164881703796917, "learning_rate": 8.249999999999999e-06, "loss": 3.3281, "step": 55 }, { "epoch": 0.03708609271523179, "grad_norm": 47.03805644062284, "learning_rate": 8.4e-06, "loss": 3.5625, "step": 56 }, { "epoch": 0.03774834437086093, "grad_norm": 7.518813612641354, "learning_rate": 8.55e-06, "loss": 3.7188, "step": 57 }, { "epoch": 0.038410596026490065, "grad_norm": 6.307641293758115, "learning_rate": 8.7e-06, "loss": 3.0, "step": 58 }, { "epoch": 0.0390728476821192, "grad_norm": 6.101815811381738, "learning_rate": 8.849999999999998e-06, "loss": 3.5, "step": 59 }, { "epoch": 0.039735099337748346, "grad_norm": 6.121500036003077, "learning_rate": 8.999999999999999e-06, "loss": 3.1406, "step": 60 }, { "epoch": 0.04039735099337748, "grad_norm": 6.322691623459317, "learning_rate": 9.149999999999999e-06, "loss": 3.6562, "step": 61 }, { "epoch": 0.04105960264900662, "grad_norm": 8.550403796972356, "learning_rate": 9.299999999999999e-06, "loss": 3.6094, "step": 62 }, { "epoch": 0.041721854304635764, "grad_norm": 5.881021205952493, "learning_rate": 9.45e-06, "loss": 3.4531, "step": 63 }, { "epoch": 0.0423841059602649, "grad_norm": 6.343767768578617, "learning_rate": 9.6e-06, "loss": 3.4375, "step": 64 }, { "epoch": 0.04304635761589404, "grad_norm": 5.468997291403601, "learning_rate": 9.75e-06, "loss": 3.3594, "step": 65 }, { "epoch": 0.04370860927152318, "grad_norm": 13.21834323159589, "learning_rate": 9.9e-06, "loss": 3.6406, "step": 66 }, { "epoch": 0.04437086092715232, "grad_norm": 5.751675099558176, "learning_rate": 1.005e-05, "loss": 3.6875, "step": 67 }, { "epoch": 0.045033112582781455, "grad_norm": 5.789432849160812, "learning_rate": 1.02e-05, "loss": 3.5, "step": 68 }, { "epoch": 0.0456953642384106, "grad_norm": 9.22798294268364, "learning_rate": 1.035e-05, "loss": 3.5312, "step": 69 }, { "epoch": 0.046357615894039736, "grad_norm": 5.81339002256944, "learning_rate": 1.05e-05, "loss": 3.4062, "step": 70 }, { "epoch": 0.04701986754966887, "grad_norm": 6.483865471822502, "learning_rate": 1.0649999999999998e-05, "loss": 3.5469, "step": 71 }, { "epoch": 0.04768211920529802, "grad_norm": 6.681407312460688, "learning_rate": 1.0799999999999998e-05, "loss": 3.0469, "step": 72 }, { "epoch": 0.048344370860927154, "grad_norm": 5.900284275769466, "learning_rate": 1.0949999999999998e-05, "loss": 3.4062, "step": 73 }, { "epoch": 0.04900662251655629, "grad_norm": 6.565697552430453, "learning_rate": 1.1099999999999999e-05, "loss": 3.4219, "step": 74 }, { "epoch": 0.04966887417218543, "grad_norm": 6.479208793249415, "learning_rate": 1.1249999999999999e-05, "loss": 3.4375, "step": 75 }, { "epoch": 0.05033112582781457, "grad_norm": 10.119589497347254, "learning_rate": 1.14e-05, "loss": 3.4062, "step": 76 }, { "epoch": 0.05099337748344371, "grad_norm": 5.657504471843712, "learning_rate": 1.155e-05, "loss": 3.4375, "step": 77 }, { "epoch": 0.051655629139072845, "grad_norm": 5.762975347033508, "learning_rate": 1.17e-05, "loss": 3.3906, "step": 78 }, { "epoch": 0.05231788079470199, "grad_norm": 5.472175426409925, "learning_rate": 1.1849999999999998e-05, "loss": 3.3281, "step": 79 }, { "epoch": 0.052980132450331126, "grad_norm": 5.36080185986848, "learning_rate": 1.1999999999999999e-05, "loss": 3.3125, "step": 80 }, { "epoch": 0.05364238410596026, "grad_norm": 8.6638918869147, "learning_rate": 1.2149999999999999e-05, "loss": 3.3438, "step": 81 }, { "epoch": 0.054304635761589407, "grad_norm": 5.674651496967873, "learning_rate": 1.2299999999999999e-05, "loss": 3.4062, "step": 82 }, { "epoch": 0.05496688741721854, "grad_norm": 5.361382771034597, "learning_rate": 1.245e-05, "loss": 2.9844, "step": 83 }, { "epoch": 0.05562913907284768, "grad_norm": 6.09753258999747, "learning_rate": 1.26e-05, "loss": 3.0469, "step": 84 }, { "epoch": 0.056291390728476824, "grad_norm": 7.251556938686882, "learning_rate": 1.275e-05, "loss": 3.3906, "step": 85 }, { "epoch": 0.05695364238410596, "grad_norm": 8.686673351259, "learning_rate": 1.2899999999999998e-05, "loss": 3.4062, "step": 86 }, { "epoch": 0.0576158940397351, "grad_norm": 5.322342010285953, "learning_rate": 1.3049999999999999e-05, "loss": 3.2031, "step": 87 }, { "epoch": 0.05827814569536424, "grad_norm": 7.568736522500978, "learning_rate": 1.3199999999999997e-05, "loss": 3.2031, "step": 88 }, { "epoch": 0.05894039735099338, "grad_norm": 7.860436148140724, "learning_rate": 1.3349999999999998e-05, "loss": 3.25, "step": 89 }, { "epoch": 0.059602649006622516, "grad_norm": 5.184123992360204, "learning_rate": 1.3499999999999998e-05, "loss": 3.4219, "step": 90 }, { "epoch": 0.06026490066225165, "grad_norm": 5.15407267726703, "learning_rate": 1.3649999999999998e-05, "loss": 3.2969, "step": 91 }, { "epoch": 0.060927152317880796, "grad_norm": 6.113863867178894, "learning_rate": 1.3799999999999998e-05, "loss": 3.5469, "step": 92 }, { "epoch": 0.06158940397350993, "grad_norm": 5.501189495418315, "learning_rate": 1.3949999999999999e-05, "loss": 3.2031, "step": 93 }, { "epoch": 0.06225165562913907, "grad_norm": 5.771850788341689, "learning_rate": 1.4099999999999999e-05, "loss": 3.3281, "step": 94 }, { "epoch": 0.06291390728476821, "grad_norm": 5.807834746409673, "learning_rate": 1.4249999999999999e-05, "loss": 3.5625, "step": 95 }, { "epoch": 0.06357615894039735, "grad_norm": 63.21753084442461, "learning_rate": 1.44e-05, "loss": 3.4062, "step": 96 }, { "epoch": 0.06423841059602649, "grad_norm": 5.462087047107723, "learning_rate": 1.455e-05, "loss": 3.2969, "step": 97 }, { "epoch": 0.06490066225165562, "grad_norm": 6.078939244629908, "learning_rate": 1.47e-05, "loss": 3.7031, "step": 98 }, { "epoch": 0.06556291390728476, "grad_norm": 4.6847295880819075, "learning_rate": 1.485e-05, "loss": 3.1875, "step": 99 }, { "epoch": 0.06622516556291391, "grad_norm": 5.81891908564673, "learning_rate": 1.4999999999999999e-05, "loss": 2.9219, "step": 100 }, { "epoch": 0.06688741721854305, "grad_norm": 5.860334849479316, "learning_rate": 1.5149999999999999e-05, "loss": 3.4688, "step": 101 }, { "epoch": 0.06754966887417219, "grad_norm": 5.771205621871528, "learning_rate": 1.53e-05, "loss": 3.375, "step": 102 }, { "epoch": 0.06821192052980132, "grad_norm": 5.50689544288883, "learning_rate": 1.545e-05, "loss": 3.4219, "step": 103 }, { "epoch": 0.06887417218543046, "grad_norm": 8.700164391791958, "learning_rate": 1.5599999999999996e-05, "loss": 3.1562, "step": 104 }, { "epoch": 0.0695364238410596, "grad_norm": 5.26785768330116, "learning_rate": 1.5749999999999997e-05, "loss": 3.2656, "step": 105 }, { "epoch": 0.07019867549668875, "grad_norm": 5.1421669104430086, "learning_rate": 1.5899999999999997e-05, "loss": 3.25, "step": 106 }, { "epoch": 0.07086092715231788, "grad_norm": 5.3978457374794555, "learning_rate": 1.6049999999999997e-05, "loss": 3.2188, "step": 107 }, { "epoch": 0.07152317880794702, "grad_norm": 5.33549150599327, "learning_rate": 1.6199999999999997e-05, "loss": 3.3281, "step": 108 }, { "epoch": 0.07218543046357616, "grad_norm": 5.173401967196686, "learning_rate": 1.6349999999999998e-05, "loss": 3.3906, "step": 109 }, { "epoch": 0.0728476821192053, "grad_norm": 5.2978481071786145, "learning_rate": 1.6499999999999998e-05, "loss": 3.2656, "step": 110 }, { "epoch": 0.07350993377483443, "grad_norm": 9.97935657801475, "learning_rate": 1.6649999999999998e-05, "loss": 3.2812, "step": 111 }, { "epoch": 0.07417218543046358, "grad_norm": 4.6848286454403105, "learning_rate": 1.68e-05, "loss": 3.4688, "step": 112 }, { "epoch": 0.07483443708609272, "grad_norm": 4.918402389219796, "learning_rate": 1.695e-05, "loss": 3.3281, "step": 113 }, { "epoch": 0.07549668874172186, "grad_norm": 6.630124232917517, "learning_rate": 1.71e-05, "loss": 3.0312, "step": 114 }, { "epoch": 0.076158940397351, "grad_norm": 5.1924776683830105, "learning_rate": 1.725e-05, "loss": 3.0625, "step": 115 }, { "epoch": 0.07682119205298013, "grad_norm": 6.984685865380342, "learning_rate": 1.74e-05, "loss": 3.5312, "step": 116 }, { "epoch": 0.07748344370860927, "grad_norm": 4.67514968669566, "learning_rate": 1.755e-05, "loss": 3.1875, "step": 117 }, { "epoch": 0.0781456953642384, "grad_norm": 4.8054194464335325, "learning_rate": 1.7699999999999997e-05, "loss": 3.2969, "step": 118 }, { "epoch": 0.07880794701986756, "grad_norm": 4.989066147447696, "learning_rate": 1.7849999999999997e-05, "loss": 2.8438, "step": 119 }, { "epoch": 0.07947019867549669, "grad_norm": 4.434139647061637, "learning_rate": 1.7999999999999997e-05, "loss": 3.3281, "step": 120 }, { "epoch": 0.08013245033112583, "grad_norm": 4.848827523568246, "learning_rate": 1.8149999999999997e-05, "loss": 3.25, "step": 121 }, { "epoch": 0.08079470198675497, "grad_norm": 4.819274007507709, "learning_rate": 1.8299999999999998e-05, "loss": 3.375, "step": 122 }, { "epoch": 0.0814569536423841, "grad_norm": 4.6709028641541614, "learning_rate": 1.8449999999999998e-05, "loss": 3.125, "step": 123 }, { "epoch": 0.08211920529801324, "grad_norm": 5.111429794195006, "learning_rate": 1.8599999999999998e-05, "loss": 3.1094, "step": 124 }, { "epoch": 0.08278145695364239, "grad_norm": 5.096698395147178, "learning_rate": 1.875e-05, "loss": 3.2969, "step": 125 }, { "epoch": 0.08344370860927153, "grad_norm": 5.314646436061757, "learning_rate": 1.89e-05, "loss": 3.25, "step": 126 }, { "epoch": 0.08410596026490066, "grad_norm": 4.843553978032765, "learning_rate": 1.905e-05, "loss": 3.3594, "step": 127 }, { "epoch": 0.0847682119205298, "grad_norm": 4.240870302572376, "learning_rate": 1.92e-05, "loss": 3.2656, "step": 128 }, { "epoch": 0.08543046357615894, "grad_norm": 4.791622055588395, "learning_rate": 1.935e-05, "loss": 3.0156, "step": 129 }, { "epoch": 0.08609271523178808, "grad_norm": 4.485961677489531, "learning_rate": 1.95e-05, "loss": 3.2969, "step": 130 }, { "epoch": 0.08675496688741721, "grad_norm": 4.759869683803728, "learning_rate": 1.965e-05, "loss": 3.4219, "step": 131 }, { "epoch": 0.08741721854304636, "grad_norm": 4.391280244843061, "learning_rate": 1.98e-05, "loss": 3.1094, "step": 132 }, { "epoch": 0.0880794701986755, "grad_norm": 4.3469035092862045, "learning_rate": 1.995e-05, "loss": 3.1094, "step": 133 }, { "epoch": 0.08874172185430464, "grad_norm": 4.190341140841672, "learning_rate": 2.01e-05, "loss": 3.0469, "step": 134 }, { "epoch": 0.08940397350993377, "grad_norm": 4.49563004174637, "learning_rate": 2.025e-05, "loss": 2.9844, "step": 135 }, { "epoch": 0.09006622516556291, "grad_norm": 5.209854057537542, "learning_rate": 2.04e-05, "loss": 3.6094, "step": 136 }, { "epoch": 0.09072847682119205, "grad_norm": 4.470904397461312, "learning_rate": 2.055e-05, "loss": 3.3594, "step": 137 }, { "epoch": 0.0913907284768212, "grad_norm": 4.404925028420012, "learning_rate": 2.07e-05, "loss": 3.3438, "step": 138 }, { "epoch": 0.09205298013245033, "grad_norm": 4.235668053153122, "learning_rate": 2.085e-05, "loss": 3.2031, "step": 139 }, { "epoch": 0.09271523178807947, "grad_norm": 5.1489886719249585, "learning_rate": 2.1e-05, "loss": 3.2656, "step": 140 }, { "epoch": 0.09337748344370861, "grad_norm": 4.367981545093435, "learning_rate": 2.1149999999999996e-05, "loss": 3.1719, "step": 141 }, { "epoch": 0.09403973509933775, "grad_norm": 4.2364539730829245, "learning_rate": 2.1299999999999996e-05, "loss": 3.25, "step": 142 }, { "epoch": 0.09470198675496688, "grad_norm": 4.5575893875267015, "learning_rate": 2.1449999999999996e-05, "loss": 3.2969, "step": 143 }, { "epoch": 0.09536423841059603, "grad_norm": 3.949076483181059, "learning_rate": 2.1599999999999996e-05, "loss": 3.1406, "step": 144 }, { "epoch": 0.09602649006622517, "grad_norm": 4.244144790740239, "learning_rate": 2.1749999999999997e-05, "loss": 3.1094, "step": 145 }, { "epoch": 0.09668874172185431, "grad_norm": 4.294050854634997, "learning_rate": 2.1899999999999997e-05, "loss": 3.1875, "step": 146 }, { "epoch": 0.09735099337748344, "grad_norm": 4.239134750651539, "learning_rate": 2.2049999999999997e-05, "loss": 2.8594, "step": 147 }, { "epoch": 0.09801324503311258, "grad_norm": 4.212574580317083, "learning_rate": 2.2199999999999998e-05, "loss": 3.2344, "step": 148 }, { "epoch": 0.09867549668874172, "grad_norm": 4.092774522252905, "learning_rate": 2.2349999999999998e-05, "loss": 3.1562, "step": 149 }, { "epoch": 0.09933774834437085, "grad_norm": 4.153421618460364, "learning_rate": 2.2499999999999998e-05, "loss": 2.8281, "step": 150 }, { "epoch": 0.1, "grad_norm": 4.827904818874844, "learning_rate": 2.2649999999999998e-05, "loss": 3.3594, "step": 151 }, { "epoch": 0.10066225165562914, "grad_norm": 4.652360213901008, "learning_rate": 2.28e-05, "loss": 3.2188, "step": 152 }, { "epoch": 0.10132450331125828, "grad_norm": 4.267631099265926, "learning_rate": 2.295e-05, "loss": 3.125, "step": 153 }, { "epoch": 0.10198675496688742, "grad_norm": 4.052690594441697, "learning_rate": 2.31e-05, "loss": 3.3906, "step": 154 }, { "epoch": 0.10264900662251655, "grad_norm": 4.021559285610502, "learning_rate": 2.325e-05, "loss": 3.1562, "step": 155 }, { "epoch": 0.10331125827814569, "grad_norm": 4.222179144888645, "learning_rate": 2.34e-05, "loss": 3.1719, "step": 156 }, { "epoch": 0.10397350993377484, "grad_norm": 4.1681408751929006, "learning_rate": 2.3549999999999996e-05, "loss": 3.1719, "step": 157 }, { "epoch": 0.10463576158940398, "grad_norm": 4.1208863311935735, "learning_rate": 2.3699999999999997e-05, "loss": 3.2344, "step": 158 }, { "epoch": 0.10529801324503311, "grad_norm": 4.15281753214379, "learning_rate": 2.3849999999999997e-05, "loss": 3.2031, "step": 159 }, { "epoch": 0.10596026490066225, "grad_norm": 3.7206428623442744, "learning_rate": 2.3999999999999997e-05, "loss": 2.875, "step": 160 }, { "epoch": 0.10662251655629139, "grad_norm": 4.043442327454647, "learning_rate": 2.4149999999999997e-05, "loss": 3.1875, "step": 161 }, { "epoch": 0.10728476821192053, "grad_norm": 3.8834759506069174, "learning_rate": 2.4299999999999998e-05, "loss": 2.7812, "step": 162 }, { "epoch": 0.10794701986754966, "grad_norm": 3.6602152758977318, "learning_rate": 2.4449999999999998e-05, "loss": 3.1875, "step": 163 }, { "epoch": 0.10860927152317881, "grad_norm": 4.2287619105009675, "learning_rate": 2.4599999999999998e-05, "loss": 2.75, "step": 164 }, { "epoch": 0.10927152317880795, "grad_norm": 110.11644419668347, "learning_rate": 2.475e-05, "loss": 3.1875, "step": 165 }, { "epoch": 0.10993377483443709, "grad_norm": 4.548875327770721, "learning_rate": 2.49e-05, "loss": 3.1719, "step": 166 }, { "epoch": 0.11059602649006622, "grad_norm": 4.2580577760972425, "learning_rate": 2.505e-05, "loss": 3.3594, "step": 167 }, { "epoch": 0.11125827814569536, "grad_norm": 4.897121920365212, "learning_rate": 2.52e-05, "loss": 3.0625, "step": 168 }, { "epoch": 0.1119205298013245, "grad_norm": 4.201568083604385, "learning_rate": 2.535e-05, "loss": 3.1875, "step": 169 }, { "epoch": 0.11258278145695365, "grad_norm": 4.529239410849345, "learning_rate": 2.55e-05, "loss": 2.875, "step": 170 }, { "epoch": 0.11324503311258279, "grad_norm": 5.123301636622721, "learning_rate": 2.565e-05, "loss": 3.3906, "step": 171 }, { "epoch": 0.11390728476821192, "grad_norm": 4.49654856501852, "learning_rate": 2.5799999999999997e-05, "loss": 3.1406, "step": 172 }, { "epoch": 0.11456953642384106, "grad_norm": 4.492921901944353, "learning_rate": 2.5949999999999997e-05, "loss": 3.0938, "step": 173 }, { "epoch": 0.1152317880794702, "grad_norm": 4.923309698447017, "learning_rate": 2.6099999999999997e-05, "loss": 3.0, "step": 174 }, { "epoch": 0.11589403973509933, "grad_norm": 4.690247596431798, "learning_rate": 2.6249999999999998e-05, "loss": 3.4688, "step": 175 }, { "epoch": 0.11655629139072848, "grad_norm": 4.272596871015295, "learning_rate": 2.6399999999999995e-05, "loss": 3.2969, "step": 176 }, { "epoch": 0.11721854304635762, "grad_norm": 4.732381577853729, "learning_rate": 2.6549999999999995e-05, "loss": 3.0469, "step": 177 }, { "epoch": 0.11788079470198676, "grad_norm": 4.501743902648583, "learning_rate": 2.6699999999999995e-05, "loss": 3.25, "step": 178 }, { "epoch": 0.1185430463576159, "grad_norm": 4.179809817716381, "learning_rate": 2.6849999999999995e-05, "loss": 3.25, "step": 179 }, { "epoch": 0.11920529801324503, "grad_norm": 3.8587769390686555, "learning_rate": 2.6999999999999996e-05, "loss": 2.9375, "step": 180 }, { "epoch": 0.11986754966887417, "grad_norm": 4.092849684724, "learning_rate": 2.7149999999999996e-05, "loss": 3.1406, "step": 181 }, { "epoch": 0.1205298013245033, "grad_norm": 4.053902606102931, "learning_rate": 2.7299999999999996e-05, "loss": 2.8594, "step": 182 }, { "epoch": 0.12119205298013246, "grad_norm": 3.9965128349894683, "learning_rate": 2.7449999999999996e-05, "loss": 3.2344, "step": 183 }, { "epoch": 0.12185430463576159, "grad_norm": 4.196957126428847, "learning_rate": 2.7599999999999997e-05, "loss": 3.3125, "step": 184 }, { "epoch": 0.12251655629139073, "grad_norm": 3.924473440394618, "learning_rate": 2.7749999999999997e-05, "loss": 3.25, "step": 185 }, { "epoch": 0.12317880794701987, "grad_norm": 3.9970385700387805, "learning_rate": 2.7899999999999997e-05, "loss": 2.9844, "step": 186 }, { "epoch": 0.123841059602649, "grad_norm": 3.947088309652612, "learning_rate": 2.8049999999999997e-05, "loss": 3.1562, "step": 187 }, { "epoch": 0.12450331125827814, "grad_norm": 4.382439862482131, "learning_rate": 2.8199999999999998e-05, "loss": 2.8438, "step": 188 }, { "epoch": 0.1251655629139073, "grad_norm": 4.385501978447136, "learning_rate": 2.8349999999999998e-05, "loss": 2.9062, "step": 189 }, { "epoch": 0.12582781456953643, "grad_norm": 3.9039332679765724, "learning_rate": 2.8499999999999998e-05, "loss": 3.0312, "step": 190 }, { "epoch": 0.12649006622516556, "grad_norm": 3.991429810304012, "learning_rate": 2.865e-05, "loss": 3.25, "step": 191 }, { "epoch": 0.1271523178807947, "grad_norm": 3.7001444238201446, "learning_rate": 2.88e-05, "loss": 3.2031, "step": 192 }, { "epoch": 0.12781456953642384, "grad_norm": 3.7322489527420672, "learning_rate": 2.895e-05, "loss": 3.3906, "step": 193 }, { "epoch": 0.12847682119205298, "grad_norm": 3.623251477088876, "learning_rate": 2.91e-05, "loss": 3.3281, "step": 194 }, { "epoch": 0.1291390728476821, "grad_norm": 3.867124928209203, "learning_rate": 2.925e-05, "loss": 3.2188, "step": 195 }, { "epoch": 0.12980132450331125, "grad_norm": 3.8359874276884285, "learning_rate": 2.94e-05, "loss": 3.0938, "step": 196 }, { "epoch": 0.1304635761589404, "grad_norm": 4.080413503782875, "learning_rate": 2.955e-05, "loss": 2.6875, "step": 197 }, { "epoch": 0.13112582781456952, "grad_norm": 4.601332797904551, "learning_rate": 2.97e-05, "loss": 3.2344, "step": 198 }, { "epoch": 0.1317880794701987, "grad_norm": 4.127875712856974, "learning_rate": 2.985e-05, "loss": 3.0938, "step": 199 }, { "epoch": 0.13245033112582782, "grad_norm": 3.675504261209941, "learning_rate": 2.9999999999999997e-05, "loss": 3.0938, "step": 200 }, { "epoch": 0.13311258278145696, "grad_norm": 3.84472757645993, "learning_rate": 3.0149999999999998e-05, "loss": 3.2656, "step": 201 }, { "epoch": 0.1337748344370861, "grad_norm": 3.6825027293286814, "learning_rate": 3.0299999999999998e-05, "loss": 3.0469, "step": 202 }, { "epoch": 0.13443708609271524, "grad_norm": 3.503359203782447, "learning_rate": 3.0449999999999998e-05, "loss": 3.1562, "step": 203 }, { "epoch": 0.13509933774834437, "grad_norm": 4.067763833084508, "learning_rate": 3.06e-05, "loss": 3.3281, "step": 204 }, { "epoch": 0.1357615894039735, "grad_norm": 3.5888029350269224, "learning_rate": 3.0749999999999995e-05, "loss": 3.0625, "step": 205 }, { "epoch": 0.13642384105960265, "grad_norm": 4.0400209174875155, "learning_rate": 3.09e-05, "loss": 3.3281, "step": 206 }, { "epoch": 0.13708609271523178, "grad_norm": 3.5904414498861272, "learning_rate": 3.1049999999999996e-05, "loss": 3.0938, "step": 207 }, { "epoch": 0.13774834437086092, "grad_norm": 3.625287833426859, "learning_rate": 3.119999999999999e-05, "loss": 3.0312, "step": 208 }, { "epoch": 0.13841059602649006, "grad_norm": 3.608411288215931, "learning_rate": 3.1349999999999996e-05, "loss": 2.9531, "step": 209 }, { "epoch": 0.1390728476821192, "grad_norm": 3.5996876944737, "learning_rate": 3.149999999999999e-05, "loss": 3.25, "step": 210 }, { "epoch": 0.13973509933774833, "grad_norm": 3.3429836589835347, "learning_rate": 3.165e-05, "loss": 2.8594, "step": 211 }, { "epoch": 0.1403973509933775, "grad_norm": 3.6784230937164337, "learning_rate": 3.1799999999999994e-05, "loss": 2.75, "step": 212 }, { "epoch": 0.14105960264900663, "grad_norm": 3.6646729961201854, "learning_rate": 3.195e-05, "loss": 2.9844, "step": 213 }, { "epoch": 0.14172185430463577, "grad_norm": 3.61158194251105, "learning_rate": 3.2099999999999994e-05, "loss": 3.2031, "step": 214 }, { "epoch": 0.1423841059602649, "grad_norm": 3.732622006010356, "learning_rate": 3.225e-05, "loss": 2.9219, "step": 215 }, { "epoch": 0.14304635761589404, "grad_norm": 3.5848477173929734, "learning_rate": 3.2399999999999995e-05, "loss": 2.8594, "step": 216 }, { "epoch": 0.14370860927152318, "grad_norm": 3.511184340284042, "learning_rate": 3.255e-05, "loss": 2.9688, "step": 217 }, { "epoch": 0.14437086092715232, "grad_norm": 4.194066170987906, "learning_rate": 3.2699999999999995e-05, "loss": 3.0781, "step": 218 }, { "epoch": 0.14503311258278145, "grad_norm": 3.280374157945727, "learning_rate": 3.285e-05, "loss": 2.9062, "step": 219 }, { "epoch": 0.1456953642384106, "grad_norm": 3.3743497592004257, "learning_rate": 3.2999999999999996e-05, "loss": 2.9219, "step": 220 }, { "epoch": 0.14635761589403973, "grad_norm": 3.362357268675812, "learning_rate": 3.315e-05, "loss": 3.375, "step": 221 }, { "epoch": 0.14701986754966886, "grad_norm": 3.5982062729311957, "learning_rate": 3.3299999999999996e-05, "loss": 3.2344, "step": 222 }, { "epoch": 0.147682119205298, "grad_norm": 3.4681319551500676, "learning_rate": 3.345e-05, "loss": 3.0, "step": 223 }, { "epoch": 0.14834437086092717, "grad_norm": 3.3635239042832974, "learning_rate": 3.36e-05, "loss": 3.2969, "step": 224 }, { "epoch": 0.1490066225165563, "grad_norm": 3.2019600675360316, "learning_rate": 3.375e-05, "loss": 3.0469, "step": 225 }, { "epoch": 0.14966887417218544, "grad_norm": 3.494575607343542, "learning_rate": 3.39e-05, "loss": 2.9375, "step": 226 }, { "epoch": 0.15033112582781458, "grad_norm": 3.4057386810355155, "learning_rate": 3.405e-05, "loss": 3.3594, "step": 227 }, { "epoch": 0.1509933774834437, "grad_norm": 3.3601993218846906, "learning_rate": 3.42e-05, "loss": 3.1406, "step": 228 }, { "epoch": 0.15165562913907285, "grad_norm": 3.3991399692466477, "learning_rate": 3.435e-05, "loss": 2.9062, "step": 229 }, { "epoch": 0.152317880794702, "grad_norm": 3.294297373515897, "learning_rate": 3.45e-05, "loss": 2.6719, "step": 230 }, { "epoch": 0.15298013245033112, "grad_norm": 3.987609355582827, "learning_rate": 3.465e-05, "loss": 3.0781, "step": 231 }, { "epoch": 0.15364238410596026, "grad_norm": 3.9084259587624213, "learning_rate": 3.48e-05, "loss": 3.1406, "step": 232 }, { "epoch": 0.1543046357615894, "grad_norm": 3.3596587106716904, "learning_rate": 3.4949999999999996e-05, "loss": 3.0469, "step": 233 }, { "epoch": 0.15496688741721854, "grad_norm": 3.459787228106157, "learning_rate": 3.51e-05, "loss": 3.1406, "step": 234 }, { "epoch": 0.15562913907284767, "grad_norm": 3.2562410822583665, "learning_rate": 3.5249999999999996e-05, "loss": 3.0312, "step": 235 }, { "epoch": 0.1562913907284768, "grad_norm": 3.474251339129339, "learning_rate": 3.539999999999999e-05, "loss": 3.1094, "step": 236 }, { "epoch": 0.15695364238410597, "grad_norm": 3.2986637399227536, "learning_rate": 3.555e-05, "loss": 3.0625, "step": 237 }, { "epoch": 0.1576158940397351, "grad_norm": 3.2746009385243386, "learning_rate": 3.5699999999999994e-05, "loss": 3.25, "step": 238 }, { "epoch": 0.15827814569536425, "grad_norm": 3.0918839052299756, "learning_rate": 3.585e-05, "loss": 3.0781, "step": 239 }, { "epoch": 0.15894039735099338, "grad_norm": 3.2220388664461024, "learning_rate": 3.5999999999999994e-05, "loss": 2.6875, "step": 240 }, { "epoch": 0.15960264900662252, "grad_norm": 3.1368033793353365, "learning_rate": 3.615e-05, "loss": 2.9219, "step": 241 }, { "epoch": 0.16026490066225166, "grad_norm": 3.33429128151937, "learning_rate": 3.6299999999999995e-05, "loss": 3.2188, "step": 242 }, { "epoch": 0.1609271523178808, "grad_norm": 3.127316735595342, "learning_rate": 3.645e-05, "loss": 3.1562, "step": 243 }, { "epoch": 0.16158940397350993, "grad_norm": 3.2386107224953395, "learning_rate": 3.6599999999999995e-05, "loss": 3.1719, "step": 244 }, { "epoch": 0.16225165562913907, "grad_norm": 3.058043859797535, "learning_rate": 3.675e-05, "loss": 3.0625, "step": 245 }, { "epoch": 0.1629139072847682, "grad_norm": 3.157439443754408, "learning_rate": 3.6899999999999996e-05, "loss": 2.875, "step": 246 }, { "epoch": 0.16357615894039734, "grad_norm": 3.2561897128261603, "learning_rate": 3.705e-05, "loss": 3.25, "step": 247 }, { "epoch": 0.16423841059602648, "grad_norm": 3.179349634953305, "learning_rate": 3.7199999999999996e-05, "loss": 3.1406, "step": 248 }, { "epoch": 0.16490066225165562, "grad_norm": 3.242858743296943, "learning_rate": 3.735e-05, "loss": 3.1562, "step": 249 }, { "epoch": 0.16556291390728478, "grad_norm": 2.8791647506080795, "learning_rate": 3.75e-05, "loss": 2.9219, "step": 250 }, { "epoch": 0.16622516556291392, "grad_norm": 3.076401994057578, "learning_rate": 3.7649999999999994e-05, "loss": 2.8594, "step": 251 }, { "epoch": 0.16688741721854305, "grad_norm": 3.0618051560972988, "learning_rate": 3.78e-05, "loss": 3.2344, "step": 252 }, { "epoch": 0.1675496688741722, "grad_norm": 3.0401093762867117, "learning_rate": 3.7949999999999994e-05, "loss": 3.0312, "step": 253 }, { "epoch": 0.16821192052980133, "grad_norm": 3.0317580316936565, "learning_rate": 3.81e-05, "loss": 2.5938, "step": 254 }, { "epoch": 0.16887417218543047, "grad_norm": 3.4131359537704227, "learning_rate": 3.8249999999999995e-05, "loss": 2.9531, "step": 255 }, { "epoch": 0.1695364238410596, "grad_norm": 2.824527994440723, "learning_rate": 3.84e-05, "loss": 2.8906, "step": 256 }, { "epoch": 0.17019867549668874, "grad_norm": 2.830675833417707, "learning_rate": 3.8549999999999995e-05, "loss": 2.5781, "step": 257 }, { "epoch": 0.17086092715231788, "grad_norm": 3.041340070819647, "learning_rate": 3.87e-05, "loss": 2.9688, "step": 258 }, { "epoch": 0.171523178807947, "grad_norm": 3.035707324733149, "learning_rate": 3.8849999999999996e-05, "loss": 3.0, "step": 259 }, { "epoch": 0.17218543046357615, "grad_norm": 3.297057110848455, "learning_rate": 3.9e-05, "loss": 3.0781, "step": 260 }, { "epoch": 0.1728476821192053, "grad_norm": 3.0752860411867613, "learning_rate": 3.9149999999999996e-05, "loss": 3.125, "step": 261 }, { "epoch": 0.17350993377483442, "grad_norm": 2.9046723573594737, "learning_rate": 3.93e-05, "loss": 2.8438, "step": 262 }, { "epoch": 0.1741721854304636, "grad_norm": 3.171530410999741, "learning_rate": 3.945e-05, "loss": 3.4062, "step": 263 }, { "epoch": 0.17483443708609273, "grad_norm": 2.8562672189292933, "learning_rate": 3.96e-05, "loss": 3.0312, "step": 264 }, { "epoch": 0.17549668874172186, "grad_norm": 3.0497161696094186, "learning_rate": 3.975e-05, "loss": 3.1562, "step": 265 }, { "epoch": 0.176158940397351, "grad_norm": 3.0277077489573587, "learning_rate": 3.99e-05, "loss": 3.1562, "step": 266 }, { "epoch": 0.17682119205298014, "grad_norm": 3.023428348791391, "learning_rate": 4.005e-05, "loss": 3.0781, "step": 267 }, { "epoch": 0.17748344370860927, "grad_norm": 2.7763290247551726, "learning_rate": 4.02e-05, "loss": 3.0469, "step": 268 }, { "epoch": 0.1781456953642384, "grad_norm": 2.933416617657179, "learning_rate": 4.035e-05, "loss": 2.7188, "step": 269 }, { "epoch": 0.17880794701986755, "grad_norm": 2.9765295399280896, "learning_rate": 4.05e-05, "loss": 2.9844, "step": 270 }, { "epoch": 0.17947019867549668, "grad_norm": 3.0030915229628175, "learning_rate": 4.065e-05, "loss": 3.0156, "step": 271 }, { "epoch": 0.18013245033112582, "grad_norm": 2.979687167041924, "learning_rate": 4.08e-05, "loss": 3.1094, "step": 272 }, { "epoch": 0.18079470198675496, "grad_norm": 2.9898594996365406, "learning_rate": 4.095e-05, "loss": 2.8594, "step": 273 }, { "epoch": 0.1814569536423841, "grad_norm": 3.063881338434763, "learning_rate": 4.11e-05, "loss": 3.0781, "step": 274 }, { "epoch": 0.18211920529801323, "grad_norm": 2.827998356797006, "learning_rate": 4.125e-05, "loss": 3.0, "step": 275 }, { "epoch": 0.1827814569536424, "grad_norm": 2.830631906167569, "learning_rate": 4.14e-05, "loss": 2.9844, "step": 276 }, { "epoch": 0.18344370860927153, "grad_norm": 2.671075701611255, "learning_rate": 4.155e-05, "loss": 3.1562, "step": 277 }, { "epoch": 0.18410596026490067, "grad_norm": 2.9432234846518672, "learning_rate": 4.17e-05, "loss": 2.875, "step": 278 }, { "epoch": 0.1847682119205298, "grad_norm": 2.8007775083308384, "learning_rate": 4.185e-05, "loss": 2.9219, "step": 279 }, { "epoch": 0.18543046357615894, "grad_norm": 2.827153897323655, "learning_rate": 4.2e-05, "loss": 2.8594, "step": 280 }, { "epoch": 0.18609271523178808, "grad_norm": 3.092860587637192, "learning_rate": 4.215e-05, "loss": 2.625, "step": 281 }, { "epoch": 0.18675496688741722, "grad_norm": 3.3331355753296332, "learning_rate": 4.229999999999999e-05, "loss": 3.1094, "step": 282 }, { "epoch": 0.18741721854304635, "grad_norm": 2.8022509072008037, "learning_rate": 4.2449999999999995e-05, "loss": 2.8281, "step": 283 }, { "epoch": 0.1880794701986755, "grad_norm": 2.8557193406310604, "learning_rate": 4.259999999999999e-05, "loss": 2.9688, "step": 284 }, { "epoch": 0.18874172185430463, "grad_norm": 3.149206840898262, "learning_rate": 4.2749999999999996e-05, "loss": 3.0, "step": 285 }, { "epoch": 0.18940397350993377, "grad_norm": 2.9594738453379015, "learning_rate": 4.289999999999999e-05, "loss": 3.2031, "step": 286 }, { "epoch": 0.1900662251655629, "grad_norm": 2.7987124337866636, "learning_rate": 4.3049999999999996e-05, "loss": 2.9375, "step": 287 }, { "epoch": 0.19072847682119207, "grad_norm": 2.9720094900470366, "learning_rate": 4.319999999999999e-05, "loss": 3.1875, "step": 288 }, { "epoch": 0.1913907284768212, "grad_norm": 2.871756222471837, "learning_rate": 4.334999999999999e-05, "loss": 2.9219, "step": 289 }, { "epoch": 0.19205298013245034, "grad_norm": 3.0726921131145146, "learning_rate": 4.3499999999999993e-05, "loss": 3.2031, "step": 290 }, { "epoch": 0.19271523178807948, "grad_norm": 2.796357850006255, "learning_rate": 4.364999999999999e-05, "loss": 2.75, "step": 291 }, { "epoch": 0.19337748344370861, "grad_norm": 2.6140692779608314, "learning_rate": 4.3799999999999994e-05, "loss": 2.7812, "step": 292 }, { "epoch": 0.19403973509933775, "grad_norm": 2.784207431419087, "learning_rate": 4.394999999999999e-05, "loss": 2.9375, "step": 293 }, { "epoch": 0.1947019867549669, "grad_norm": 2.8355379111399164, "learning_rate": 4.4099999999999995e-05, "loss": 3.0, "step": 294 }, { "epoch": 0.19536423841059603, "grad_norm": 3.2318235743742516, "learning_rate": 4.424999999999999e-05, "loss": 3.0156, "step": 295 }, { "epoch": 0.19602649006622516, "grad_norm": 2.9764608799525303, "learning_rate": 4.4399999999999995e-05, "loss": 2.8906, "step": 296 }, { "epoch": 0.1966887417218543, "grad_norm": 3.050950613384662, "learning_rate": 4.454999999999999e-05, "loss": 3.0938, "step": 297 }, { "epoch": 0.19735099337748344, "grad_norm": 2.8236597243290387, "learning_rate": 4.4699999999999996e-05, "loss": 3.0625, "step": 298 }, { "epoch": 0.19801324503311257, "grad_norm": 3.07416263735879, "learning_rate": 4.484999999999999e-05, "loss": 3.0, "step": 299 }, { "epoch": 0.1986754966887417, "grad_norm": 2.8291727163734564, "learning_rate": 4.4999999999999996e-05, "loss": 2.8281, "step": 300 }, { "epoch": 0.19933774834437087, "grad_norm": 2.7160819305649793, "learning_rate": 4.514999999999999e-05, "loss": 2.9062, "step": 301 }, { "epoch": 0.2, "grad_norm": 2.8935347954748423, "learning_rate": 4.5299999999999997e-05, "loss": 2.9531, "step": 302 }, { "epoch": 0.20066225165562915, "grad_norm": 2.746357879541038, "learning_rate": 4.5449999999999993e-05, "loss": 2.625, "step": 303 }, { "epoch": 0.20132450331125828, "grad_norm": 2.864849257272957, "learning_rate": 4.56e-05, "loss": 2.4531, "step": 304 }, { "epoch": 0.20198675496688742, "grad_norm": 3.4970751732130183, "learning_rate": 4.5749999999999994e-05, "loss": 2.9219, "step": 305 }, { "epoch": 0.20264900662251656, "grad_norm": 2.8068190998074867, "learning_rate": 4.59e-05, "loss": 2.8438, "step": 306 }, { "epoch": 0.2033112582781457, "grad_norm": 2.7541039381240644, "learning_rate": 4.6049999999999994e-05, "loss": 2.9844, "step": 307 }, { "epoch": 0.20397350993377483, "grad_norm": 2.8593958986526617, "learning_rate": 4.62e-05, "loss": 2.5938, "step": 308 }, { "epoch": 0.20463576158940397, "grad_norm": 2.5503599862388855, "learning_rate": 4.6349999999999995e-05, "loss": 2.9219, "step": 309 }, { "epoch": 0.2052980132450331, "grad_norm": 2.805329586203324, "learning_rate": 4.65e-05, "loss": 2.9688, "step": 310 }, { "epoch": 0.20596026490066224, "grad_norm": 2.6603779276962194, "learning_rate": 4.6649999999999996e-05, "loss": 2.9375, "step": 311 }, { "epoch": 0.20662251655629138, "grad_norm": 2.7850339145149885, "learning_rate": 4.68e-05, "loss": 2.8438, "step": 312 }, { "epoch": 0.20728476821192052, "grad_norm": 2.6835506268112455, "learning_rate": 4.6949999999999996e-05, "loss": 3.1094, "step": 313 }, { "epoch": 0.20794701986754968, "grad_norm": 2.8575973590020025, "learning_rate": 4.709999999999999e-05, "loss": 3.2344, "step": 314 }, { "epoch": 0.20860927152317882, "grad_norm": 2.8124172645280714, "learning_rate": 4.7249999999999997e-05, "loss": 3.0469, "step": 315 }, { "epoch": 0.20927152317880796, "grad_norm": 2.6673390249864735, "learning_rate": 4.7399999999999993e-05, "loss": 3.0156, "step": 316 }, { "epoch": 0.2099337748344371, "grad_norm": 2.864915607284846, "learning_rate": 4.755e-05, "loss": 3.0312, "step": 317 }, { "epoch": 0.21059602649006623, "grad_norm": 2.8974326959697847, "learning_rate": 4.7699999999999994e-05, "loss": 2.375, "step": 318 }, { "epoch": 0.21125827814569537, "grad_norm": 2.486815622189848, "learning_rate": 4.785e-05, "loss": 2.9531, "step": 319 }, { "epoch": 0.2119205298013245, "grad_norm": 3.1389257973535845, "learning_rate": 4.7999999999999994e-05, "loss": 3.0781, "step": 320 }, { "epoch": 0.21258278145695364, "grad_norm": 2.855097913494004, "learning_rate": 4.815e-05, "loss": 3.2812, "step": 321 }, { "epoch": 0.21324503311258278, "grad_norm": 2.5855596268711416, "learning_rate": 4.8299999999999995e-05, "loss": 2.8594, "step": 322 }, { "epoch": 0.2139072847682119, "grad_norm": 2.7674153641588086, "learning_rate": 4.845e-05, "loss": 2.9062, "step": 323 }, { "epoch": 0.21456953642384105, "grad_norm": 2.796227189614921, "learning_rate": 4.8599999999999995e-05, "loss": 3.0938, "step": 324 }, { "epoch": 0.2152317880794702, "grad_norm": 2.4121401508696283, "learning_rate": 4.875e-05, "loss": 2.9219, "step": 325 }, { "epoch": 0.21589403973509932, "grad_norm": 2.6208351101059097, "learning_rate": 4.8899999999999996e-05, "loss": 2.4062, "step": 326 }, { "epoch": 0.2165562913907285, "grad_norm": 2.90840734668696, "learning_rate": 4.905e-05, "loss": 2.9375, "step": 327 }, { "epoch": 0.21721854304635763, "grad_norm": 2.9364709830554445, "learning_rate": 4.9199999999999997e-05, "loss": 3.1406, "step": 328 }, { "epoch": 0.21788079470198676, "grad_norm": 2.5824857389931966, "learning_rate": 4.935e-05, "loss": 2.9844, "step": 329 }, { "epoch": 0.2185430463576159, "grad_norm": 2.6891028958819483, "learning_rate": 4.95e-05, "loss": 3.0312, "step": 330 }, { "epoch": 0.21920529801324504, "grad_norm": 2.7301777533336073, "learning_rate": 4.965e-05, "loss": 3.0781, "step": 331 }, { "epoch": 0.21986754966887417, "grad_norm": 2.6687642863091665, "learning_rate": 4.98e-05, "loss": 3.1875, "step": 332 }, { "epoch": 0.2205298013245033, "grad_norm": 2.7919695483649307, "learning_rate": 4.995e-05, "loss": 2.9531, "step": 333 }, { "epoch": 0.22119205298013245, "grad_norm": 2.7075124944529314, "learning_rate": 5.01e-05, "loss": 3.0625, "step": 334 }, { "epoch": 0.22185430463576158, "grad_norm": 2.670072571378032, "learning_rate": 5.025e-05, "loss": 2.5781, "step": 335 }, { "epoch": 0.22251655629139072, "grad_norm": 2.9293439496870524, "learning_rate": 5.04e-05, "loss": 2.8594, "step": 336 }, { "epoch": 0.22317880794701986, "grad_norm": 2.5723702112330353, "learning_rate": 5.055e-05, "loss": 2.8594, "step": 337 }, { "epoch": 0.223841059602649, "grad_norm": 2.683002407769579, "learning_rate": 5.07e-05, "loss": 2.9219, "step": 338 }, { "epoch": 0.22450331125827813, "grad_norm": 2.520196330168102, "learning_rate": 5.0849999999999996e-05, "loss": 2.7656, "step": 339 }, { "epoch": 0.2251655629139073, "grad_norm": 2.617867096499689, "learning_rate": 5.1e-05, "loss": 2.9844, "step": 340 }, { "epoch": 0.22582781456953643, "grad_norm": 2.878837165143681, "learning_rate": 5.1149999999999996e-05, "loss": 3.2188, "step": 341 }, { "epoch": 0.22649006622516557, "grad_norm": 2.4772843115116374, "learning_rate": 5.13e-05, "loss": 2.7812, "step": 342 }, { "epoch": 0.2271523178807947, "grad_norm": 2.744876601078272, "learning_rate": 5.145e-05, "loss": 2.5469, "step": 343 }, { "epoch": 0.22781456953642384, "grad_norm": 2.7275454126235656, "learning_rate": 5.1599999999999994e-05, "loss": 2.8594, "step": 344 }, { "epoch": 0.22847682119205298, "grad_norm": 2.6875250538505524, "learning_rate": 5.174999999999999e-05, "loss": 2.8281, "step": 345 }, { "epoch": 0.22913907284768212, "grad_norm": 2.9809323072875547, "learning_rate": 5.1899999999999994e-05, "loss": 3.0938, "step": 346 }, { "epoch": 0.22980132450331126, "grad_norm": 2.810630277845135, "learning_rate": 5.204999999999999e-05, "loss": 3.2969, "step": 347 }, { "epoch": 0.2304635761589404, "grad_norm": 2.7421934741225105, "learning_rate": 5.2199999999999995e-05, "loss": 3.1406, "step": 348 }, { "epoch": 0.23112582781456953, "grad_norm": 2.651253979198602, "learning_rate": 5.234999999999999e-05, "loss": 2.8594, "step": 349 }, { "epoch": 0.23178807947019867, "grad_norm": 2.695943182177301, "learning_rate": 5.2499999999999995e-05, "loss": 2.9375, "step": 350 }, { "epoch": 0.2324503311258278, "grad_norm": 3.0932176133525915, "learning_rate": 5.264999999999999e-05, "loss": 2.9844, "step": 351 }, { "epoch": 0.23311258278145697, "grad_norm": 2.8776369927511847, "learning_rate": 5.279999999999999e-05, "loss": 3.0938, "step": 352 }, { "epoch": 0.2337748344370861, "grad_norm": 2.9391023537205987, "learning_rate": 5.294999999999999e-05, "loss": 3.0781, "step": 353 }, { "epoch": 0.23443708609271524, "grad_norm": 2.8236525368121788, "learning_rate": 5.309999999999999e-05, "loss": 3.0625, "step": 354 }, { "epoch": 0.23509933774834438, "grad_norm": 2.5022637128814464, "learning_rate": 5.324999999999999e-05, "loss": 2.9688, "step": 355 }, { "epoch": 0.23576158940397351, "grad_norm": 2.595956447908909, "learning_rate": 5.339999999999999e-05, "loss": 2.75, "step": 356 }, { "epoch": 0.23642384105960265, "grad_norm": 2.5911712210683966, "learning_rate": 5.3549999999999994e-05, "loss": 2.9375, "step": 357 }, { "epoch": 0.2370860927152318, "grad_norm": 2.5410609072424797, "learning_rate": 5.369999999999999e-05, "loss": 2.75, "step": 358 }, { "epoch": 0.23774834437086093, "grad_norm": 2.5520508356259692, "learning_rate": 5.3849999999999994e-05, "loss": 2.9375, "step": 359 }, { "epoch": 0.23841059602649006, "grad_norm": 2.81119890515144, "learning_rate": 5.399999999999999e-05, "loss": 3.0312, "step": 360 }, { "epoch": 0.2390728476821192, "grad_norm": 2.3977756283972234, "learning_rate": 5.4149999999999995e-05, "loss": 2.5781, "step": 361 }, { "epoch": 0.23973509933774834, "grad_norm": 2.439623785981216, "learning_rate": 5.429999999999999e-05, "loss": 2.8594, "step": 362 }, { "epoch": 0.24039735099337747, "grad_norm": 2.5215218942203017, "learning_rate": 5.4449999999999995e-05, "loss": 2.8281, "step": 363 }, { "epoch": 0.2410596026490066, "grad_norm": 2.4832011076478246, "learning_rate": 5.459999999999999e-05, "loss": 2.6719, "step": 364 }, { "epoch": 0.24172185430463577, "grad_norm": 2.6187938721609467, "learning_rate": 5.4749999999999996e-05, "loss": 2.9688, "step": 365 }, { "epoch": 0.2423841059602649, "grad_norm": 2.440141000241782, "learning_rate": 5.489999999999999e-05, "loss": 2.9062, "step": 366 }, { "epoch": 0.24304635761589405, "grad_norm": 2.3704319515535897, "learning_rate": 5.5049999999999996e-05, "loss": 2.8594, "step": 367 }, { "epoch": 0.24370860927152319, "grad_norm": 2.4838728753813903, "learning_rate": 5.519999999999999e-05, "loss": 2.9844, "step": 368 }, { "epoch": 0.24437086092715232, "grad_norm": 2.629678501417108, "learning_rate": 5.535e-05, "loss": 2.9375, "step": 369 }, { "epoch": 0.24503311258278146, "grad_norm": 2.3583989808916725, "learning_rate": 5.5499999999999994e-05, "loss": 2.8125, "step": 370 }, { "epoch": 0.2456953642384106, "grad_norm": 2.7176064872795704, "learning_rate": 5.565e-05, "loss": 2.9688, "step": 371 }, { "epoch": 0.24635761589403973, "grad_norm": 2.515644935973342, "learning_rate": 5.5799999999999994e-05, "loss": 3.0781, "step": 372 }, { "epoch": 0.24701986754966887, "grad_norm": 2.785493564497169, "learning_rate": 5.595e-05, "loss": 2.9531, "step": 373 }, { "epoch": 0.247682119205298, "grad_norm": 2.5610668919549835, "learning_rate": 5.6099999999999995e-05, "loss": 2.875, "step": 374 }, { "epoch": 0.24834437086092714, "grad_norm": 2.3132216863861528, "learning_rate": 5.625e-05, "loss": 2.5469, "step": 375 }, { "epoch": 0.24900662251655628, "grad_norm": 2.490089384614338, "learning_rate": 5.6399999999999995e-05, "loss": 2.625, "step": 376 }, { "epoch": 0.24966887417218542, "grad_norm": 2.732437840492482, "learning_rate": 5.654999999999999e-05, "loss": 3.1406, "step": 377 }, { "epoch": 0.2503311258278146, "grad_norm": 2.4087218496183747, "learning_rate": 5.6699999999999996e-05, "loss": 2.8438, "step": 378 }, { "epoch": 0.2509933774834437, "grad_norm": 2.643332835501332, "learning_rate": 5.684999999999999e-05, "loss": 3.0156, "step": 379 }, { "epoch": 0.25165562913907286, "grad_norm": 2.555866937201364, "learning_rate": 5.6999999999999996e-05, "loss": 3.0781, "step": 380 }, { "epoch": 0.25231788079470197, "grad_norm": 2.505409022852811, "learning_rate": 5.714999999999999e-05, "loss": 2.8906, "step": 381 }, { "epoch": 0.25298013245033113, "grad_norm": 2.5153381688711134, "learning_rate": 5.73e-05, "loss": 2.75, "step": 382 }, { "epoch": 0.25364238410596024, "grad_norm": 2.4545178566050176, "learning_rate": 5.7449999999999994e-05, "loss": 2.8594, "step": 383 }, { "epoch": 0.2543046357615894, "grad_norm": 2.7056541046271767, "learning_rate": 5.76e-05, "loss": 2.6875, "step": 384 }, { "epoch": 0.25496688741721857, "grad_norm": 2.404529635277738, "learning_rate": 5.7749999999999994e-05, "loss": 2.875, "step": 385 }, { "epoch": 0.2556291390728477, "grad_norm": 2.583284337641058, "learning_rate": 5.79e-05, "loss": 2.8906, "step": 386 }, { "epoch": 0.25629139072847684, "grad_norm": 2.708736300949351, "learning_rate": 5.8049999999999995e-05, "loss": 3.0312, "step": 387 }, { "epoch": 0.25695364238410595, "grad_norm": 2.6099765912735973, "learning_rate": 5.82e-05, "loss": 2.4844, "step": 388 }, { "epoch": 0.2576158940397351, "grad_norm": 2.36515435616245, "learning_rate": 5.8349999999999995e-05, "loss": 2.7656, "step": 389 }, { "epoch": 0.2582781456953642, "grad_norm": 2.509318371395626, "learning_rate": 5.85e-05, "loss": 2.9219, "step": 390 }, { "epoch": 0.2589403973509934, "grad_norm": 2.6817505049098, "learning_rate": 5.8649999999999996e-05, "loss": 2.8438, "step": 391 }, { "epoch": 0.2596026490066225, "grad_norm": 2.3377568467031247, "learning_rate": 5.88e-05, "loss": 2.8281, "step": 392 }, { "epoch": 0.26026490066225166, "grad_norm": 2.668249930337885, "learning_rate": 5.8949999999999996e-05, "loss": 3.125, "step": 393 }, { "epoch": 0.2609271523178808, "grad_norm": 2.514411634505482, "learning_rate": 5.91e-05, "loss": 2.9375, "step": 394 }, { "epoch": 0.26158940397350994, "grad_norm": 2.359331343947323, "learning_rate": 5.925e-05, "loss": 2.9375, "step": 395 }, { "epoch": 0.26225165562913905, "grad_norm": 2.487143418586103, "learning_rate": 5.94e-05, "loss": 2.875, "step": 396 }, { "epoch": 0.2629139072847682, "grad_norm": 2.407214953122946, "learning_rate": 5.955e-05, "loss": 2.9531, "step": 397 }, { "epoch": 0.2635761589403974, "grad_norm": 2.5547282744195883, "learning_rate": 5.97e-05, "loss": 3.0156, "step": 398 }, { "epoch": 0.2642384105960265, "grad_norm": 2.3445270528639246, "learning_rate": 5.985e-05, "loss": 2.8281, "step": 399 }, { "epoch": 0.26490066225165565, "grad_norm": 2.2435710612762785, "learning_rate": 5.9999999999999995e-05, "loss": 2.8438, "step": 400 }, { "epoch": 0.26556291390728476, "grad_norm": 2.411873316003341, "learning_rate": 6.015e-05, "loss": 2.7969, "step": 401 }, { "epoch": 0.2662251655629139, "grad_norm": 2.262658715287052, "learning_rate": 6.0299999999999995e-05, "loss": 3.0, "step": 402 }, { "epoch": 0.26688741721854303, "grad_norm": 2.320253164823349, "learning_rate": 6.045e-05, "loss": 2.9219, "step": 403 }, { "epoch": 0.2675496688741722, "grad_norm": 2.42375862691307, "learning_rate": 6.0599999999999996e-05, "loss": 2.9844, "step": 404 }, { "epoch": 0.2682119205298013, "grad_norm": 2.403873454275729, "learning_rate": 6.075e-05, "loss": 3.0312, "step": 405 }, { "epoch": 0.26887417218543047, "grad_norm": 2.463958278863841, "learning_rate": 6.0899999999999996e-05, "loss": 2.5, "step": 406 }, { "epoch": 0.2695364238410596, "grad_norm": 2.5748882636255037, "learning_rate": 6.104999999999999e-05, "loss": 2.7812, "step": 407 }, { "epoch": 0.27019867549668874, "grad_norm": 2.847270850293623, "learning_rate": 6.12e-05, "loss": 3.125, "step": 408 }, { "epoch": 0.27086092715231785, "grad_norm": 2.424187088690875, "learning_rate": 6.134999999999999e-05, "loss": 2.5938, "step": 409 }, { "epoch": 0.271523178807947, "grad_norm": 2.3012265403934054, "learning_rate": 6.149999999999999e-05, "loss": 3.0156, "step": 410 }, { "epoch": 0.2721854304635762, "grad_norm": 2.4615639529933193, "learning_rate": 6.165e-05, "loss": 2.7969, "step": 411 }, { "epoch": 0.2728476821192053, "grad_norm": 2.288145587908721, "learning_rate": 6.18e-05, "loss": 2.875, "step": 412 }, { "epoch": 0.27350993377483446, "grad_norm": 2.3886795870975153, "learning_rate": 6.194999999999999e-05, "loss": 2.8438, "step": 413 }, { "epoch": 0.27417218543046357, "grad_norm": 2.2443562936132815, "learning_rate": 6.209999999999999e-05, "loss": 2.4219, "step": 414 }, { "epoch": 0.27483443708609273, "grad_norm": 2.332050495769141, "learning_rate": 6.225e-05, "loss": 2.9688, "step": 415 }, { "epoch": 0.27549668874172184, "grad_norm": 2.40828791389515, "learning_rate": 6.239999999999999e-05, "loss": 2.4375, "step": 416 }, { "epoch": 0.276158940397351, "grad_norm": 2.3452239662126595, "learning_rate": 6.254999999999999e-05, "loss": 3.0, "step": 417 }, { "epoch": 0.2768211920529801, "grad_norm": 2.3501313210712977, "learning_rate": 6.269999999999999e-05, "loss": 2.9375, "step": 418 }, { "epoch": 0.2774834437086093, "grad_norm": 2.4994458958900267, "learning_rate": 6.285e-05, "loss": 2.9219, "step": 419 }, { "epoch": 0.2781456953642384, "grad_norm": 2.3765244843110973, "learning_rate": 6.299999999999999e-05, "loss": 2.9219, "step": 420 }, { "epoch": 0.27880794701986755, "grad_norm": 2.551917170430421, "learning_rate": 6.314999999999999e-05, "loss": 2.7344, "step": 421 }, { "epoch": 0.27947019867549666, "grad_norm": 2.3140198073766283, "learning_rate": 6.33e-05, "loss": 2.9531, "step": 422 }, { "epoch": 0.2801324503311258, "grad_norm": 2.327598517584333, "learning_rate": 6.345e-05, "loss": 2.6094, "step": 423 }, { "epoch": 0.280794701986755, "grad_norm": 2.447317426935946, "learning_rate": 6.359999999999999e-05, "loss": 2.8281, "step": 424 }, { "epoch": 0.2814569536423841, "grad_norm": 2.1877708252048818, "learning_rate": 6.374999999999999e-05, "loss": 2.8906, "step": 425 }, { "epoch": 0.28211920529801326, "grad_norm": 2.3467691452449744, "learning_rate": 6.39e-05, "loss": 2.7031, "step": 426 }, { "epoch": 0.2827814569536424, "grad_norm": 2.3848096996238035, "learning_rate": 6.405e-05, "loss": 3.0625, "step": 427 }, { "epoch": 0.28344370860927154, "grad_norm": 2.5789295552069023, "learning_rate": 6.419999999999999e-05, "loss": 3.0469, "step": 428 }, { "epoch": 0.28410596026490065, "grad_norm": 2.4515640818058957, "learning_rate": 6.434999999999999e-05, "loss": 2.9844, "step": 429 }, { "epoch": 0.2847682119205298, "grad_norm": 2.394402167936939, "learning_rate": 6.45e-05, "loss": 2.8281, "step": 430 }, { "epoch": 0.2854304635761589, "grad_norm": 2.194167494650524, "learning_rate": 6.465e-05, "loss": 2.9375, "step": 431 }, { "epoch": 0.2860927152317881, "grad_norm": 2.359277627825071, "learning_rate": 6.479999999999999e-05, "loss": 2.9375, "step": 432 }, { "epoch": 0.2867549668874172, "grad_norm": 2.247107450667477, "learning_rate": 6.494999999999999e-05, "loss": 3.0, "step": 433 }, { "epoch": 0.28741721854304636, "grad_norm": 2.2533850982167114, "learning_rate": 6.51e-05, "loss": 2.6719, "step": 434 }, { "epoch": 0.28807947019867547, "grad_norm": 2.6216883376256006, "learning_rate": 6.525e-05, "loss": 2.9531, "step": 435 }, { "epoch": 0.28874172185430463, "grad_norm": 2.3376819223729175, "learning_rate": 6.539999999999999e-05, "loss": 2.8281, "step": 436 }, { "epoch": 0.2894039735099338, "grad_norm": 2.518200368497074, "learning_rate": 6.555e-05, "loss": 2.9062, "step": 437 }, { "epoch": 0.2900662251655629, "grad_norm": 2.4745855317949434, "learning_rate": 6.57e-05, "loss": 2.9062, "step": 438 }, { "epoch": 0.29072847682119207, "grad_norm": 2.4856170019322077, "learning_rate": 6.584999999999999e-05, "loss": 2.875, "step": 439 }, { "epoch": 0.2913907284768212, "grad_norm": 2.4507140192218078, "learning_rate": 6.599999999999999e-05, "loss": 3.1875, "step": 440 }, { "epoch": 0.29205298013245035, "grad_norm": 2.241419553933538, "learning_rate": 6.615e-05, "loss": 2.6875, "step": 441 }, { "epoch": 0.29271523178807946, "grad_norm": 2.194376983134961, "learning_rate": 6.63e-05, "loss": 2.7812, "step": 442 }, { "epoch": 0.2933774834437086, "grad_norm": 2.1216090492917696, "learning_rate": 6.644999999999999e-05, "loss": 2.8438, "step": 443 }, { "epoch": 0.29403973509933773, "grad_norm": 2.411637276719395, "learning_rate": 6.659999999999999e-05, "loss": 2.5469, "step": 444 }, { "epoch": 0.2947019867549669, "grad_norm": 2.178699410808115, "learning_rate": 6.675e-05, "loss": 2.8594, "step": 445 }, { "epoch": 0.295364238410596, "grad_norm": 2.295636232653236, "learning_rate": 6.69e-05, "loss": 2.9375, "step": 446 }, { "epoch": 0.29602649006622517, "grad_norm": 2.3053502822067045, "learning_rate": 6.704999999999999e-05, "loss": 2.9844, "step": 447 }, { "epoch": 0.29668874172185433, "grad_norm": 2.3678044953235395, "learning_rate": 6.72e-05, "loss": 2.9062, "step": 448 }, { "epoch": 0.29735099337748344, "grad_norm": 2.2617717075948303, "learning_rate": 6.735e-05, "loss": 2.4219, "step": 449 }, { "epoch": 0.2980132450331126, "grad_norm": 2.269047766376487, "learning_rate": 6.75e-05, "loss": 3.0156, "step": 450 }, { "epoch": 0.2986754966887417, "grad_norm": 2.2524481369608935, "learning_rate": 6.764999999999999e-05, "loss": 2.875, "step": 451 }, { "epoch": 0.2993377483443709, "grad_norm": 2.4434987542996986, "learning_rate": 6.78e-05, "loss": 2.9688, "step": 452 }, { "epoch": 0.3, "grad_norm": 2.3192386902318263, "learning_rate": 6.795e-05, "loss": 2.8594, "step": 453 }, { "epoch": 0.30066225165562915, "grad_norm": 2.48875337721921, "learning_rate": 6.81e-05, "loss": 2.7969, "step": 454 }, { "epoch": 0.30132450331125826, "grad_norm": 2.362226242785205, "learning_rate": 6.824999999999999e-05, "loss": 2.9844, "step": 455 }, { "epoch": 0.3019867549668874, "grad_norm": 2.2670149552987824, "learning_rate": 6.84e-05, "loss": 2.7188, "step": 456 }, { "epoch": 0.30264900662251654, "grad_norm": 2.8193079158488255, "learning_rate": 6.855e-05, "loss": 3.1406, "step": 457 }, { "epoch": 0.3033112582781457, "grad_norm": 2.2646496189624985, "learning_rate": 6.87e-05, "loss": 2.5156, "step": 458 }, { "epoch": 0.3039735099337748, "grad_norm": 2.179387864370677, "learning_rate": 6.884999999999999e-05, "loss": 2.5156, "step": 459 }, { "epoch": 0.304635761589404, "grad_norm": 2.165880982658605, "learning_rate": 6.9e-05, "loss": 2.75, "step": 460 }, { "epoch": 0.30529801324503314, "grad_norm": 2.332414974818671, "learning_rate": 6.915e-05, "loss": 2.9375, "step": 461 }, { "epoch": 0.30596026490066225, "grad_norm": 2.1893904582741524, "learning_rate": 6.93e-05, "loss": 2.4688, "step": 462 }, { "epoch": 0.3066225165562914, "grad_norm": 2.221098659049963, "learning_rate": 6.945e-05, "loss": 2.8438, "step": 463 }, { "epoch": 0.3072847682119205, "grad_norm": 2.407181623611898, "learning_rate": 6.96e-05, "loss": 2.9531, "step": 464 }, { "epoch": 0.3079470198675497, "grad_norm": 2.5102658872763457, "learning_rate": 6.975e-05, "loss": 2.8281, "step": 465 }, { "epoch": 0.3086092715231788, "grad_norm": 2.169804910319373, "learning_rate": 6.989999999999999e-05, "loss": 2.7812, "step": 466 }, { "epoch": 0.30927152317880796, "grad_norm": 2.2954365732232294, "learning_rate": 7.005e-05, "loss": 2.7188, "step": 467 }, { "epoch": 0.30993377483443707, "grad_norm": 2.1603254104763354, "learning_rate": 7.02e-05, "loss": 2.625, "step": 468 }, { "epoch": 0.31059602649006623, "grad_norm": 2.132158855452012, "learning_rate": 7.034999999999999e-05, "loss": 2.3125, "step": 469 }, { "epoch": 0.31125827814569534, "grad_norm": 2.36494053243161, "learning_rate": 7.049999999999999e-05, "loss": 2.6719, "step": 470 }, { "epoch": 0.3119205298013245, "grad_norm": 2.6763288687775164, "learning_rate": 7.065e-05, "loss": 2.9844, "step": 471 }, { "epoch": 0.3125827814569536, "grad_norm": 2.0562046255180033, "learning_rate": 7.079999999999999e-05, "loss": 2.3125, "step": 472 }, { "epoch": 0.3132450331125828, "grad_norm": 2.597697997536415, "learning_rate": 7.094999999999999e-05, "loss": 3.0625, "step": 473 }, { "epoch": 0.31390728476821195, "grad_norm": 2.6390175110906844, "learning_rate": 7.11e-05, "loss": 2.9219, "step": 474 }, { "epoch": 0.31456953642384106, "grad_norm": 2.2129669633402806, "learning_rate": 7.125e-05, "loss": 2.875, "step": 475 }, { "epoch": 0.3152317880794702, "grad_norm": 2.1135558621950414, "learning_rate": 7.139999999999999e-05, "loss": 2.375, "step": 476 }, { "epoch": 0.31589403973509933, "grad_norm": 2.482459306148282, "learning_rate": 7.154999999999999e-05, "loss": 2.8906, "step": 477 }, { "epoch": 0.3165562913907285, "grad_norm": 2.336344546182186, "learning_rate": 7.17e-05, "loss": 2.8906, "step": 478 }, { "epoch": 0.3172185430463576, "grad_norm": 2.3879979269272202, "learning_rate": 7.184999999999998e-05, "loss": 3.0938, "step": 479 }, { "epoch": 0.31788079470198677, "grad_norm": 2.4547318137649614, "learning_rate": 7.199999999999999e-05, "loss": 2.9062, "step": 480 }, { "epoch": 0.3185430463576159, "grad_norm": 2.084216959471573, "learning_rate": 7.214999999999999e-05, "loss": 2.8125, "step": 481 }, { "epoch": 0.31920529801324504, "grad_norm": 2.0090281660434, "learning_rate": 7.23e-05, "loss": 2.4844, "step": 482 }, { "epoch": 0.31986754966887415, "grad_norm": 2.367565101489103, "learning_rate": 7.244999999999999e-05, "loss": 3.0781, "step": 483 }, { "epoch": 0.3205298013245033, "grad_norm": 2.0099732200872413, "learning_rate": 7.259999999999999e-05, "loss": 2.8281, "step": 484 }, { "epoch": 0.3211920529801324, "grad_norm": 2.0290961181524034, "learning_rate": 7.274999999999999e-05, "loss": 2.7969, "step": 485 }, { "epoch": 0.3218543046357616, "grad_norm": 2.136064762662133, "learning_rate": 7.29e-05, "loss": 2.6875, "step": 486 }, { "epoch": 0.32251655629139075, "grad_norm": 2.139831585913877, "learning_rate": 7.304999999999999e-05, "loss": 2.75, "step": 487 }, { "epoch": 0.32317880794701986, "grad_norm": 2.1142817397937264, "learning_rate": 7.319999999999999e-05, "loss": 2.7188, "step": 488 }, { "epoch": 0.32384105960264903, "grad_norm": 2.2499956028241455, "learning_rate": 7.335e-05, "loss": 2.8594, "step": 489 }, { "epoch": 0.32450331125827814, "grad_norm": 2.3426591385386315, "learning_rate": 7.35e-05, "loss": 2.8281, "step": 490 }, { "epoch": 0.3251655629139073, "grad_norm": 2.0769202460708986, "learning_rate": 7.364999999999999e-05, "loss": 2.7969, "step": 491 }, { "epoch": 0.3258278145695364, "grad_norm": 2.418156136483668, "learning_rate": 7.379999999999999e-05, "loss": 3.125, "step": 492 }, { "epoch": 0.3264900662251656, "grad_norm": 2.1335328743175723, "learning_rate": 7.395e-05, "loss": 2.25, "step": 493 }, { "epoch": 0.3271523178807947, "grad_norm": 2.0070473888666807, "learning_rate": 7.41e-05, "loss": 2.7812, "step": 494 }, { "epoch": 0.32781456953642385, "grad_norm": 2.359416477826231, "learning_rate": 7.424999999999999e-05, "loss": 2.8906, "step": 495 }, { "epoch": 0.32847682119205296, "grad_norm": 2.21706296665862, "learning_rate": 7.439999999999999e-05, "loss": 2.8125, "step": 496 }, { "epoch": 0.3291390728476821, "grad_norm": 2.4261006835004166, "learning_rate": 7.455e-05, "loss": 2.8906, "step": 497 }, { "epoch": 0.32980132450331123, "grad_norm": 1.9729049797261617, "learning_rate": 7.47e-05, "loss": 2.5312, "step": 498 }, { "epoch": 0.3304635761589404, "grad_norm": 2.150616973668717, "learning_rate": 7.484999999999999e-05, "loss": 2.8594, "step": 499 }, { "epoch": 0.33112582781456956, "grad_norm": 2.0215825279015225, "learning_rate": 7.5e-05, "loss": 2.4062, "step": 500 }, { "epoch": 0.33178807947019867, "grad_norm": 2.144181930845469, "learning_rate": 7.515e-05, "loss": 2.8125, "step": 501 }, { "epoch": 0.33245033112582784, "grad_norm": 2.2225352152596427, "learning_rate": 7.529999999999999e-05, "loss": 2.7812, "step": 502 }, { "epoch": 0.33311258278145695, "grad_norm": 2.1853496502619825, "learning_rate": 7.544999999999999e-05, "loss": 2.9062, "step": 503 }, { "epoch": 0.3337748344370861, "grad_norm": 2.057238568866444, "learning_rate": 7.56e-05, "loss": 2.875, "step": 504 }, { "epoch": 0.3344370860927152, "grad_norm": 2.3800183087172115, "learning_rate": 7.575e-05, "loss": 2.7344, "step": 505 }, { "epoch": 0.3350993377483444, "grad_norm": 2.1238206723295345, "learning_rate": 7.589999999999999e-05, "loss": 3.0781, "step": 506 }, { "epoch": 0.3357615894039735, "grad_norm": 2.4227058139679745, "learning_rate": 7.604999999999999e-05, "loss": 2.4844, "step": 507 }, { "epoch": 0.33642384105960266, "grad_norm": 2.4387307938851692, "learning_rate": 7.62e-05, "loss": 2.9531, "step": 508 }, { "epoch": 0.33708609271523177, "grad_norm": 2.1937530823153413, "learning_rate": 7.635e-05, "loss": 2.7031, "step": 509 }, { "epoch": 0.33774834437086093, "grad_norm": 2.4708036750827462, "learning_rate": 7.649999999999999e-05, "loss": 3.1094, "step": 510 }, { "epoch": 0.33841059602649004, "grad_norm": 2.290066860117554, "learning_rate": 7.664999999999999e-05, "loss": 3.0312, "step": 511 }, { "epoch": 0.3390728476821192, "grad_norm": 2.188103534587859, "learning_rate": 7.68e-05, "loss": 2.75, "step": 512 }, { "epoch": 0.33973509933774837, "grad_norm": 2.025423983041444, "learning_rate": 7.695e-05, "loss": 2.6562, "step": 513 }, { "epoch": 0.3403973509933775, "grad_norm": 2.0366991940625905, "learning_rate": 7.709999999999999e-05, "loss": 2.9219, "step": 514 }, { "epoch": 0.34105960264900664, "grad_norm": 2.2736736955011247, "learning_rate": 7.725e-05, "loss": 2.9062, "step": 515 }, { "epoch": 0.34172185430463575, "grad_norm": 2.052252999434754, "learning_rate": 7.74e-05, "loss": 2.75, "step": 516 }, { "epoch": 0.3423841059602649, "grad_norm": 2.1958190937817452, "learning_rate": 7.755e-05, "loss": 2.25, "step": 517 }, { "epoch": 0.343046357615894, "grad_norm": 2.40519867353714, "learning_rate": 7.769999999999999e-05, "loss": 2.8906, "step": 518 }, { "epoch": 0.3437086092715232, "grad_norm": 2.1762621592660647, "learning_rate": 7.785e-05, "loss": 2.75, "step": 519 }, { "epoch": 0.3443708609271523, "grad_norm": 2.118278409709848, "learning_rate": 7.8e-05, "loss": 2.4688, "step": 520 }, { "epoch": 0.34503311258278146, "grad_norm": 2.546749299367953, "learning_rate": 7.815e-05, "loss": 2.9844, "step": 521 }, { "epoch": 0.3456953642384106, "grad_norm": 1.938743046755281, "learning_rate": 7.829999999999999e-05, "loss": 2.7031, "step": 522 }, { "epoch": 0.34635761589403974, "grad_norm": 2.42018310611329, "learning_rate": 7.845e-05, "loss": 2.9062, "step": 523 }, { "epoch": 0.34701986754966885, "grad_norm": 2.1081235927722166, "learning_rate": 7.86e-05, "loss": 2.9375, "step": 524 }, { "epoch": 0.347682119205298, "grad_norm": 2.228552809018641, "learning_rate": 7.874999999999999e-05, "loss": 2.9844, "step": 525 }, { "epoch": 0.3483443708609272, "grad_norm": 2.1421400778159914, "learning_rate": 7.89e-05, "loss": 2.4375, "step": 526 }, { "epoch": 0.3490066225165563, "grad_norm": 2.0156998292333856, "learning_rate": 7.905e-05, "loss": 2.2812, "step": 527 }, { "epoch": 0.34966887417218545, "grad_norm": 2.033857679663199, "learning_rate": 7.92e-05, "loss": 2.625, "step": 528 }, { "epoch": 0.35033112582781456, "grad_norm": 2.3222847795850883, "learning_rate": 7.934999999999999e-05, "loss": 2.9688, "step": 529 }, { "epoch": 0.3509933774834437, "grad_norm": 2.2444800210312166, "learning_rate": 7.95e-05, "loss": 2.9531, "step": 530 }, { "epoch": 0.35165562913907283, "grad_norm": 2.098809012772908, "learning_rate": 7.965e-05, "loss": 2.9062, "step": 531 }, { "epoch": 0.352317880794702, "grad_norm": 2.286347529024447, "learning_rate": 7.98e-05, "loss": 3.1875, "step": 532 }, { "epoch": 0.3529801324503311, "grad_norm": 2.1173668810192936, "learning_rate": 7.994999999999999e-05, "loss": 2.5781, "step": 533 }, { "epoch": 0.3536423841059603, "grad_norm": 2.0782603257194676, "learning_rate": 8.01e-05, "loss": 2.3438, "step": 534 }, { "epoch": 0.3543046357615894, "grad_norm": 2.201978072412506, "learning_rate": 8.025e-05, "loss": 2.8281, "step": 535 }, { "epoch": 0.35496688741721855, "grad_norm": 2.3946762770655514, "learning_rate": 8.04e-05, "loss": 2.7812, "step": 536 }, { "epoch": 0.35562913907284766, "grad_norm": 2.001994300905327, "learning_rate": 8.054999999999999e-05, "loss": 2.7656, "step": 537 }, { "epoch": 0.3562913907284768, "grad_norm": 2.1168591409651856, "learning_rate": 8.07e-05, "loss": 2.9531, "step": 538 }, { "epoch": 0.356953642384106, "grad_norm": 2.0907962944096132, "learning_rate": 8.085e-05, "loss": 2.9375, "step": 539 }, { "epoch": 0.3576158940397351, "grad_norm": 2.177488970869635, "learning_rate": 8.1e-05, "loss": 2.7031, "step": 540 }, { "epoch": 0.35827814569536426, "grad_norm": 2.0479111615480243, "learning_rate": 8.115e-05, "loss": 2.7812, "step": 541 }, { "epoch": 0.35894039735099337, "grad_norm": 2.1091162204223797, "learning_rate": 8.13e-05, "loss": 2.8906, "step": 542 }, { "epoch": 0.35960264900662253, "grad_norm": 2.2231684884671528, "learning_rate": 8.145e-05, "loss": 2.8594, "step": 543 }, { "epoch": 0.36026490066225164, "grad_norm": 2.0836665707454096, "learning_rate": 8.16e-05, "loss": 2.4688, "step": 544 }, { "epoch": 0.3609271523178808, "grad_norm": 2.0626081607228595, "learning_rate": 8.175e-05, "loss": 2.5938, "step": 545 }, { "epoch": 0.3615894039735099, "grad_norm": 2.4295458049423004, "learning_rate": 8.19e-05, "loss": 3.0625, "step": 546 }, { "epoch": 0.3622516556291391, "grad_norm": 2.1222122289891088, "learning_rate": 8.205e-05, "loss": 2.4531, "step": 547 }, { "epoch": 0.3629139072847682, "grad_norm": 2.307170306861792, "learning_rate": 8.22e-05, "loss": 2.8906, "step": 548 }, { "epoch": 0.36357615894039735, "grad_norm": 1.9782636664950333, "learning_rate": 8.235e-05, "loss": 2.8281, "step": 549 }, { "epoch": 0.36423841059602646, "grad_norm": 2.1683571917876088, "learning_rate": 8.25e-05, "loss": 2.375, "step": 550 }, { "epoch": 0.3649006622516556, "grad_norm": 1.9704901363214848, "learning_rate": 8.265e-05, "loss": 2.7188, "step": 551 }, { "epoch": 0.3655629139072848, "grad_norm": 2.040234027674569, "learning_rate": 8.28e-05, "loss": 2.7344, "step": 552 }, { "epoch": 0.3662251655629139, "grad_norm": 1.998458962114073, "learning_rate": 8.295e-05, "loss": 2.8594, "step": 553 }, { "epoch": 0.36688741721854307, "grad_norm": 2.168751581349317, "learning_rate": 8.31e-05, "loss": 2.7969, "step": 554 }, { "epoch": 0.3675496688741722, "grad_norm": 2.1855271225047814, "learning_rate": 8.325e-05, "loss": 2.9219, "step": 555 }, { "epoch": 0.36821192052980134, "grad_norm": 1.9517021263998031, "learning_rate": 8.34e-05, "loss": 2.3594, "step": 556 }, { "epoch": 0.36887417218543045, "grad_norm": 2.215857471172879, "learning_rate": 8.355e-05, "loss": 2.7812, "step": 557 }, { "epoch": 0.3695364238410596, "grad_norm": 2.0802274038028807, "learning_rate": 8.37e-05, "loss": 2.2656, "step": 558 }, { "epoch": 0.3701986754966887, "grad_norm": 2.360471645672153, "learning_rate": 8.385e-05, "loss": 2.8281, "step": 559 }, { "epoch": 0.3708609271523179, "grad_norm": 2.129907727372624, "learning_rate": 8.4e-05, "loss": 2.3906, "step": 560 }, { "epoch": 0.371523178807947, "grad_norm": 1.9548570896462385, "learning_rate": 8.415e-05, "loss": 2.6875, "step": 561 }, { "epoch": 0.37218543046357616, "grad_norm": 1.9443511712863943, "learning_rate": 8.43e-05, "loss": 2.7031, "step": 562 }, { "epoch": 0.37284768211920527, "grad_norm": 1.9921331008283054, "learning_rate": 8.444999999999998e-05, "loss": 2.625, "step": 563 }, { "epoch": 0.37350993377483444, "grad_norm": 2.2038111441940798, "learning_rate": 8.459999999999998e-05, "loss": 2.8281, "step": 564 }, { "epoch": 0.3741721854304636, "grad_norm": 2.061644948976771, "learning_rate": 8.474999999999999e-05, "loss": 2.8125, "step": 565 }, { "epoch": 0.3748344370860927, "grad_norm": 2.035210471472948, "learning_rate": 8.489999999999999e-05, "loss": 2.7031, "step": 566 }, { "epoch": 0.3754966887417219, "grad_norm": 2.1134537403824862, "learning_rate": 8.504999999999998e-05, "loss": 2.7812, "step": 567 }, { "epoch": 0.376158940397351, "grad_norm": 1.8678257670327116, "learning_rate": 8.519999999999998e-05, "loss": 2.375, "step": 568 }, { "epoch": 0.37682119205298015, "grad_norm": 2.114658535914786, "learning_rate": 8.534999999999999e-05, "loss": 2.8438, "step": 569 }, { "epoch": 0.37748344370860926, "grad_norm": 2.159830785368374, "learning_rate": 8.549999999999999e-05, "loss": 2.5938, "step": 570 }, { "epoch": 0.3781456953642384, "grad_norm": 2.034177069544218, "learning_rate": 8.564999999999998e-05, "loss": 2.7031, "step": 571 }, { "epoch": 0.37880794701986753, "grad_norm": 2.073626405444586, "learning_rate": 8.579999999999998e-05, "loss": 2.7656, "step": 572 }, { "epoch": 0.3794701986754967, "grad_norm": 2.1168575441688104, "learning_rate": 8.594999999999999e-05, "loss": 2.5781, "step": 573 }, { "epoch": 0.3801324503311258, "grad_norm": 2.169006369823301, "learning_rate": 8.609999999999999e-05, "loss": 3.0, "step": 574 }, { "epoch": 0.38079470198675497, "grad_norm": 2.3854472275404786, "learning_rate": 8.624999999999998e-05, "loss": 2.7656, "step": 575 }, { "epoch": 0.38145695364238413, "grad_norm": 2.0804248612595466, "learning_rate": 8.639999999999999e-05, "loss": 2.7344, "step": 576 }, { "epoch": 0.38211920529801324, "grad_norm": 2.152225440715124, "learning_rate": 8.654999999999999e-05, "loss": 2.6719, "step": 577 }, { "epoch": 0.3827814569536424, "grad_norm": 1.9819913756203338, "learning_rate": 8.669999999999998e-05, "loss": 2.7188, "step": 578 }, { "epoch": 0.3834437086092715, "grad_norm": 2.139813687539947, "learning_rate": 8.684999999999998e-05, "loss": 3.125, "step": 579 }, { "epoch": 0.3841059602649007, "grad_norm": 2.166576351014731, "learning_rate": 8.699999999999999e-05, "loss": 2.75, "step": 580 }, { "epoch": 0.3847682119205298, "grad_norm": 4.829014486210198, "learning_rate": 8.714999999999999e-05, "loss": 3.0469, "step": 581 }, { "epoch": 0.38543046357615895, "grad_norm": 2.2544202346196096, "learning_rate": 8.729999999999998e-05, "loss": 2.6406, "step": 582 }, { "epoch": 0.38609271523178806, "grad_norm": 2.446261154077621, "learning_rate": 8.744999999999998e-05, "loss": 2.9688, "step": 583 }, { "epoch": 0.38675496688741723, "grad_norm": 1.9489873398139181, "learning_rate": 8.759999999999999e-05, "loss": 2.5938, "step": 584 }, { "epoch": 0.38741721854304634, "grad_norm": 2.024691193681043, "learning_rate": 8.774999999999999e-05, "loss": 2.875, "step": 585 }, { "epoch": 0.3880794701986755, "grad_norm": 2.389301092223556, "learning_rate": 8.789999999999998e-05, "loss": 2.8125, "step": 586 }, { "epoch": 0.3887417218543046, "grad_norm": 2.049220232738323, "learning_rate": 8.804999999999999e-05, "loss": 3.0, "step": 587 }, { "epoch": 0.3894039735099338, "grad_norm": 2.0842758833328094, "learning_rate": 8.819999999999999e-05, "loss": 2.6719, "step": 588 }, { "epoch": 0.39006622516556294, "grad_norm": 2.054243717019012, "learning_rate": 8.834999999999999e-05, "loss": 2.375, "step": 589 }, { "epoch": 0.39072847682119205, "grad_norm": 2.5472372555582523, "learning_rate": 8.849999999999998e-05, "loss": 2.9219, "step": 590 }, { "epoch": 0.3913907284768212, "grad_norm": 2.0970125602736798, "learning_rate": 8.864999999999999e-05, "loss": 2.9844, "step": 591 }, { "epoch": 0.3920529801324503, "grad_norm": 2.208078333809169, "learning_rate": 8.879999999999999e-05, "loss": 2.8281, "step": 592 }, { "epoch": 0.3927152317880795, "grad_norm": 2.1727502142386035, "learning_rate": 8.895e-05, "loss": 2.7344, "step": 593 }, { "epoch": 0.3933774834437086, "grad_norm": 2.092224867746833, "learning_rate": 8.909999999999998e-05, "loss": 2.875, "step": 594 }, { "epoch": 0.39403973509933776, "grad_norm": 2.0422559691902524, "learning_rate": 8.924999999999999e-05, "loss": 2.7656, "step": 595 }, { "epoch": 0.39470198675496687, "grad_norm": 2.0152291133598395, "learning_rate": 8.939999999999999e-05, "loss": 2.8594, "step": 596 }, { "epoch": 0.39536423841059604, "grad_norm": 2.145609256213379, "learning_rate": 8.955e-05, "loss": 2.9375, "step": 597 }, { "epoch": 0.39602649006622515, "grad_norm": 2.0574829298359547, "learning_rate": 8.969999999999998e-05, "loss": 3.0, "step": 598 }, { "epoch": 0.3966887417218543, "grad_norm": 2.263714036168474, "learning_rate": 8.984999999999999e-05, "loss": 2.8125, "step": 599 }, { "epoch": 0.3973509933774834, "grad_norm": 1.9919958102962712, "learning_rate": 8.999999999999999e-05, "loss": 2.6875, "step": 600 }, { "epoch": 0.3980132450331126, "grad_norm": 2.2260974867140146, "learning_rate": 9.014999999999998e-05, "loss": 2.6719, "step": 601 }, { "epoch": 0.39867549668874175, "grad_norm": 2.18945850177853, "learning_rate": 9.029999999999999e-05, "loss": 2.7656, "step": 602 }, { "epoch": 0.39933774834437086, "grad_norm": 2.069689174843609, "learning_rate": 9.044999999999999e-05, "loss": 3.0156, "step": 603 }, { "epoch": 0.4, "grad_norm": 1.9069673794761737, "learning_rate": 9.059999999999999e-05, "loss": 2.625, "step": 604 }, { "epoch": 0.40066225165562913, "grad_norm": 1.9436850056906902, "learning_rate": 9.074999999999998e-05, "loss": 2.8281, "step": 605 }, { "epoch": 0.4013245033112583, "grad_norm": 2.358587861874932, "learning_rate": 9.089999999999999e-05, "loss": 3.0938, "step": 606 }, { "epoch": 0.4019867549668874, "grad_norm": 2.1850120974104845, "learning_rate": 9.104999999999999e-05, "loss": 2.7656, "step": 607 }, { "epoch": 0.40264900662251657, "grad_norm": 2.0374017039032486, "learning_rate": 9.12e-05, "loss": 2.8281, "step": 608 }, { "epoch": 0.4033112582781457, "grad_norm": 1.9800040999475284, "learning_rate": 9.134999999999998e-05, "loss": 2.7969, "step": 609 }, { "epoch": 0.40397350993377484, "grad_norm": 2.0900735210016017, "learning_rate": 9.149999999999999e-05, "loss": 2.9062, "step": 610 }, { "epoch": 0.40463576158940395, "grad_norm": 2.2748193817236704, "learning_rate": 9.164999999999999e-05, "loss": 2.7812, "step": 611 }, { "epoch": 0.4052980132450331, "grad_norm": 2.132043060040571, "learning_rate": 9.18e-05, "loss": 2.6094, "step": 612 }, { "epoch": 0.4059602649006622, "grad_norm": 2.130604006164947, "learning_rate": 9.194999999999999e-05, "loss": 2.4531, "step": 613 }, { "epoch": 0.4066225165562914, "grad_norm": 2.019423383713658, "learning_rate": 9.209999999999999e-05, "loss": 2.875, "step": 614 }, { "epoch": 0.40728476821192056, "grad_norm": 2.4919837549111192, "learning_rate": 9.224999999999999e-05, "loss": 2.7344, "step": 615 }, { "epoch": 0.40794701986754967, "grad_norm": 2.2673189963571776, "learning_rate": 9.24e-05, "loss": 2.8438, "step": 616 }, { "epoch": 0.40860927152317883, "grad_norm": 1.975442301045306, "learning_rate": 9.254999999999999e-05, "loss": 2.2031, "step": 617 }, { "epoch": 0.40927152317880794, "grad_norm": 2.052400141176238, "learning_rate": 9.269999999999999e-05, "loss": 2.7188, "step": 618 }, { "epoch": 0.4099337748344371, "grad_norm": 1.945795507673163, "learning_rate": 9.285e-05, "loss": 2.7188, "step": 619 }, { "epoch": 0.4105960264900662, "grad_norm": 2.0151199729887894, "learning_rate": 9.3e-05, "loss": 2.6719, "step": 620 }, { "epoch": 0.4112582781456954, "grad_norm": 2.4567728770974457, "learning_rate": 9.314999999999999e-05, "loss": 2.8906, "step": 621 }, { "epoch": 0.4119205298013245, "grad_norm": 2.342716049898175, "learning_rate": 9.329999999999999e-05, "loss": 3.0156, "step": 622 }, { "epoch": 0.41258278145695365, "grad_norm": 2.2088217534027397, "learning_rate": 9.345e-05, "loss": 2.4219, "step": 623 }, { "epoch": 0.41324503311258276, "grad_norm": 2.1911966560071474, "learning_rate": 9.36e-05, "loss": 2.7969, "step": 624 }, { "epoch": 0.4139072847682119, "grad_norm": 2.03171666106166, "learning_rate": 9.374999999999999e-05, "loss": 3.0312, "step": 625 }, { "epoch": 0.41456953642384103, "grad_norm": 1.8861248268857203, "learning_rate": 9.389999999999999e-05, "loss": 2.7188, "step": 626 }, { "epoch": 0.4152317880794702, "grad_norm": 1.8014354436451963, "learning_rate": 9.405e-05, "loss": 2.2812, "step": 627 }, { "epoch": 0.41589403973509936, "grad_norm": 2.0901665087319063, "learning_rate": 9.419999999999999e-05, "loss": 2.9844, "step": 628 }, { "epoch": 0.4165562913907285, "grad_norm": 1.9522955103133746, "learning_rate": 9.434999999999999e-05, "loss": 2.9219, "step": 629 }, { "epoch": 0.41721854304635764, "grad_norm": 2.1130816889847677, "learning_rate": 9.449999999999999e-05, "loss": 2.9219, "step": 630 }, { "epoch": 0.41788079470198675, "grad_norm": 2.010240010726036, "learning_rate": 9.465e-05, "loss": 2.7344, "step": 631 }, { "epoch": 0.4185430463576159, "grad_norm": 2.3610163372817152, "learning_rate": 9.479999999999999e-05, "loss": 2.7656, "step": 632 }, { "epoch": 0.419205298013245, "grad_norm": 2.315609038462489, "learning_rate": 9.494999999999999e-05, "loss": 3.0469, "step": 633 }, { "epoch": 0.4198675496688742, "grad_norm": 1.9516267011309307, "learning_rate": 9.51e-05, "loss": 2.7969, "step": 634 }, { "epoch": 0.4205298013245033, "grad_norm": 1.936401379573993, "learning_rate": 9.525e-05, "loss": 2.5312, "step": 635 }, { "epoch": 0.42119205298013246, "grad_norm": 2.164936526837924, "learning_rate": 9.539999999999999e-05, "loss": 2.8438, "step": 636 }, { "epoch": 0.42185430463576157, "grad_norm": 1.811260752899351, "learning_rate": 9.554999999999999e-05, "loss": 2.6562, "step": 637 }, { "epoch": 0.42251655629139073, "grad_norm": 1.9666396458689808, "learning_rate": 9.57e-05, "loss": 2.7031, "step": 638 }, { "epoch": 0.42317880794701984, "grad_norm": 2.0875182774650343, "learning_rate": 9.585e-05, "loss": 2.9375, "step": 639 }, { "epoch": 0.423841059602649, "grad_norm": 2.0909587175752393, "learning_rate": 9.599999999999999e-05, "loss": 2.75, "step": 640 }, { "epoch": 0.42450331125827817, "grad_norm": 2.0494336173868803, "learning_rate": 9.614999999999999e-05, "loss": 2.4375, "step": 641 }, { "epoch": 0.4251655629139073, "grad_norm": 2.2941128104491284, "learning_rate": 9.63e-05, "loss": 2.6406, "step": 642 }, { "epoch": 0.42582781456953644, "grad_norm": 1.8406419411542303, "learning_rate": 9.645e-05, "loss": 2.7188, "step": 643 }, { "epoch": 0.42649006622516555, "grad_norm": 3.072637208363104, "learning_rate": 9.659999999999999e-05, "loss": 2.2656, "step": 644 }, { "epoch": 0.4271523178807947, "grad_norm": 1.9011114934831905, "learning_rate": 9.675e-05, "loss": 2.5625, "step": 645 }, { "epoch": 0.4278145695364238, "grad_norm": 2.162374522265662, "learning_rate": 9.69e-05, "loss": 2.8125, "step": 646 }, { "epoch": 0.428476821192053, "grad_norm": 2.293962730094481, "learning_rate": 9.705e-05, "loss": 3.0938, "step": 647 }, { "epoch": 0.4291390728476821, "grad_norm": 1.980727209921109, "learning_rate": 9.719999999999999e-05, "loss": 2.6406, "step": 648 }, { "epoch": 0.42980132450331127, "grad_norm": 2.0854019802407695, "learning_rate": 9.735e-05, "loss": 2.875, "step": 649 }, { "epoch": 0.4304635761589404, "grad_norm": 2.019699534933072, "learning_rate": 9.75e-05, "loss": 2.7344, "step": 650 }, { "epoch": 0.43112582781456954, "grad_norm": 2.027642675309942, "learning_rate": 9.764999999999999e-05, "loss": 2.6562, "step": 651 }, { "epoch": 0.43178807947019865, "grad_norm": 2.0938822188759825, "learning_rate": 9.779999999999999e-05, "loss": 2.5312, "step": 652 }, { "epoch": 0.4324503311258278, "grad_norm": 1.935918085413364, "learning_rate": 9.795e-05, "loss": 2.5312, "step": 653 }, { "epoch": 0.433112582781457, "grad_norm": 1.9155078370573648, "learning_rate": 9.81e-05, "loss": 2.5938, "step": 654 }, { "epoch": 0.4337748344370861, "grad_norm": 2.026061939320236, "learning_rate": 9.824999999999999e-05, "loss": 2.5156, "step": 655 }, { "epoch": 0.43443708609271525, "grad_norm": 1.8698782681785027, "learning_rate": 9.839999999999999e-05, "loss": 2.7031, "step": 656 }, { "epoch": 0.43509933774834436, "grad_norm": 2.025533528818808, "learning_rate": 9.855e-05, "loss": 2.8438, "step": 657 }, { "epoch": 0.4357615894039735, "grad_norm": 1.9182580412716894, "learning_rate": 9.87e-05, "loss": 2.8281, "step": 658 }, { "epoch": 0.43642384105960264, "grad_norm": 1.7999357845154618, "learning_rate": 9.884999999999999e-05, "loss": 2.6562, "step": 659 }, { "epoch": 0.4370860927152318, "grad_norm": 1.8457615345099856, "learning_rate": 9.9e-05, "loss": 2.6406, "step": 660 }, { "epoch": 0.4377483443708609, "grad_norm": 2.07931221344651, "learning_rate": 9.915e-05, "loss": 3.0469, "step": 661 }, { "epoch": 0.4384105960264901, "grad_norm": 1.8892852393077035, "learning_rate": 9.93e-05, "loss": 2.5312, "step": 662 }, { "epoch": 0.4390728476821192, "grad_norm": 1.8394517245732003, "learning_rate": 9.944999999999999e-05, "loss": 2.4062, "step": 663 }, { "epoch": 0.43973509933774835, "grad_norm": 2.0161302979519653, "learning_rate": 9.96e-05, "loss": 2.7656, "step": 664 }, { "epoch": 0.44039735099337746, "grad_norm": 1.919299333070575, "learning_rate": 9.975e-05, "loss": 2.7969, "step": 665 }, { "epoch": 0.4410596026490066, "grad_norm": 2.0292755557135096, "learning_rate": 9.99e-05, "loss": 2.8438, "step": 666 }, { "epoch": 0.4417218543046358, "grad_norm": 2.1344458842194016, "learning_rate": 0.00010004999999999999, "loss": 2.8281, "step": 667 }, { "epoch": 0.4423841059602649, "grad_norm": 2.117217670341517, "learning_rate": 0.0001002, "loss": 2.8125, "step": 668 }, { "epoch": 0.44304635761589406, "grad_norm": 1.8164444075974442, "learning_rate": 0.00010035, "loss": 2.5156, "step": 669 }, { "epoch": 0.44370860927152317, "grad_norm": 2.0998039410610176, "learning_rate": 0.0001005, "loss": 2.7656, "step": 670 }, { "epoch": 0.44437086092715233, "grad_norm": 2.0638105437895673, "learning_rate": 0.00010065, "loss": 2.8125, "step": 671 }, { "epoch": 0.44503311258278144, "grad_norm": 2.3107447633676332, "learning_rate": 0.0001008, "loss": 2.3438, "step": 672 }, { "epoch": 0.4456953642384106, "grad_norm": 2.1500168990952657, "learning_rate": 0.00010095, "loss": 2.9531, "step": 673 }, { "epoch": 0.4463576158940397, "grad_norm": 2.1030641234995624, "learning_rate": 0.0001011, "loss": 2.8594, "step": 674 }, { "epoch": 0.4470198675496689, "grad_norm": 2.0355334057389696, "learning_rate": 0.00010125, "loss": 2.7812, "step": 675 }, { "epoch": 0.447682119205298, "grad_norm": 1.984890442020896, "learning_rate": 0.0001014, "loss": 2.7812, "step": 676 }, { "epoch": 0.44834437086092715, "grad_norm": 1.9466000003504182, "learning_rate": 0.00010155, "loss": 2.9062, "step": 677 }, { "epoch": 0.44900662251655626, "grad_norm": 2.0935410302089568, "learning_rate": 0.00010169999999999999, "loss": 2.6406, "step": 678 }, { "epoch": 0.44966887417218543, "grad_norm": 1.8879317401494364, "learning_rate": 0.00010185, "loss": 2.8594, "step": 679 }, { "epoch": 0.4503311258278146, "grad_norm": 2.055970696021107, "learning_rate": 0.000102, "loss": 2.7188, "step": 680 }, { "epoch": 0.4509933774834437, "grad_norm": 2.185238399914902, "learning_rate": 0.00010215, "loss": 2.875, "step": 681 }, { "epoch": 0.45165562913907287, "grad_norm": 2.028180092120401, "learning_rate": 0.00010229999999999999, "loss": 2.5781, "step": 682 }, { "epoch": 0.452317880794702, "grad_norm": 1.9564392740966563, "learning_rate": 0.00010245, "loss": 2.7812, "step": 683 }, { "epoch": 0.45298013245033114, "grad_norm": 1.8469818602858226, "learning_rate": 0.0001026, "loss": 2.8125, "step": 684 }, { "epoch": 0.45364238410596025, "grad_norm": 1.7789251265103483, "learning_rate": 0.00010275, "loss": 2.7969, "step": 685 }, { "epoch": 0.4543046357615894, "grad_norm": 1.9462696541428774, "learning_rate": 0.0001029, "loss": 2.7969, "step": 686 }, { "epoch": 0.4549668874172185, "grad_norm": 1.9375468631358366, "learning_rate": 0.00010305, "loss": 2.2344, "step": 687 }, { "epoch": 0.4556291390728477, "grad_norm": 1.8878963337194594, "learning_rate": 0.00010319999999999999, "loss": 2.8281, "step": 688 }, { "epoch": 0.4562913907284768, "grad_norm": 1.856681928092609, "learning_rate": 0.00010334999999999998, "loss": 2.7656, "step": 689 }, { "epoch": 0.45695364238410596, "grad_norm": 1.921732450139847, "learning_rate": 0.00010349999999999998, "loss": 2.7969, "step": 690 }, { "epoch": 0.45761589403973507, "grad_norm": 1.8701944887818123, "learning_rate": 0.00010364999999999999, "loss": 2.7188, "step": 691 }, { "epoch": 0.45827814569536424, "grad_norm": 4.169335389273231, "learning_rate": 0.00010379999999999999, "loss": 2.9688, "step": 692 }, { "epoch": 0.4589403973509934, "grad_norm": 2.167018530169718, "learning_rate": 0.00010394999999999998, "loss": 2.9062, "step": 693 }, { "epoch": 0.4596026490066225, "grad_norm": 1.960256407129528, "learning_rate": 0.00010409999999999998, "loss": 2.75, "step": 694 }, { "epoch": 0.4602649006622517, "grad_norm": 2.1755736510939907, "learning_rate": 0.00010424999999999999, "loss": 2.7969, "step": 695 }, { "epoch": 0.4609271523178808, "grad_norm": 2.0297946354927285, "learning_rate": 0.00010439999999999999, "loss": 2.8594, "step": 696 }, { "epoch": 0.46158940397350995, "grad_norm": 2.0238840685003683, "learning_rate": 0.00010454999999999998, "loss": 2.9062, "step": 697 }, { "epoch": 0.46225165562913906, "grad_norm": 1.9731908343869506, "learning_rate": 0.00010469999999999998, "loss": 2.5, "step": 698 }, { "epoch": 0.4629139072847682, "grad_norm": 1.871549365036509, "learning_rate": 0.00010484999999999999, "loss": 2.8125, "step": 699 }, { "epoch": 0.46357615894039733, "grad_norm": 1.9245222022391868, "learning_rate": 0.00010499999999999999, "loss": 2.7656, "step": 700 }, { "epoch": 0.4642384105960265, "grad_norm": 1.9720270702151717, "learning_rate": 0.00010514999999999998, "loss": 2.7031, "step": 701 }, { "epoch": 0.4649006622516556, "grad_norm": 1.9171855121684747, "learning_rate": 0.00010529999999999998, "loss": 2.75, "step": 702 }, { "epoch": 0.46556291390728477, "grad_norm": 2.124760553049768, "learning_rate": 0.00010544999999999999, "loss": 2.7031, "step": 703 }, { "epoch": 0.46622516556291393, "grad_norm": 2.005358457356863, "learning_rate": 0.00010559999999999998, "loss": 2.8594, "step": 704 }, { "epoch": 0.46688741721854304, "grad_norm": 2.0586727803774836, "learning_rate": 0.00010574999999999998, "loss": 2.6719, "step": 705 }, { "epoch": 0.4675496688741722, "grad_norm": 2.3268192813796182, "learning_rate": 0.00010589999999999999, "loss": 2.7812, "step": 706 }, { "epoch": 0.4682119205298013, "grad_norm": 1.7245416728998577, "learning_rate": 0.00010604999999999999, "loss": 2.6094, "step": 707 }, { "epoch": 0.4688741721854305, "grad_norm": 1.8464381653716269, "learning_rate": 0.00010619999999999998, "loss": 2.5625, "step": 708 }, { "epoch": 0.4695364238410596, "grad_norm": 1.8839914065997356, "learning_rate": 0.00010634999999999998, "loss": 2.6719, "step": 709 }, { "epoch": 0.47019867549668876, "grad_norm": 2.0549100826173086, "learning_rate": 0.00010649999999999999, "loss": 2.7188, "step": 710 }, { "epoch": 0.47086092715231787, "grad_norm": 2.1669033195048932, "learning_rate": 0.00010664999999999999, "loss": 2.9062, "step": 711 }, { "epoch": 0.47152317880794703, "grad_norm": 4.242307772503889, "learning_rate": 0.00010679999999999998, "loss": 2.3438, "step": 712 }, { "epoch": 0.47218543046357614, "grad_norm": 2.230049474588384, "learning_rate": 0.00010694999999999998, "loss": 2.5, "step": 713 }, { "epoch": 0.4728476821192053, "grad_norm": 2.5508420933245186, "learning_rate": 0.00010709999999999999, "loss": 2.9375, "step": 714 }, { "epoch": 0.4735099337748344, "grad_norm": 1.9773917686858677, "learning_rate": 0.00010724999999999999, "loss": 2.625, "step": 715 }, { "epoch": 0.4741721854304636, "grad_norm": 2.3073193812785617, "learning_rate": 0.00010739999999999998, "loss": 2.5781, "step": 716 }, { "epoch": 0.47483443708609274, "grad_norm": 2.1683524322665675, "learning_rate": 0.00010754999999999999, "loss": 3.0, "step": 717 }, { "epoch": 0.47549668874172185, "grad_norm": 1.7891675793673438, "learning_rate": 0.00010769999999999999, "loss": 2.6875, "step": 718 }, { "epoch": 0.476158940397351, "grad_norm": 1.963404117289492, "learning_rate": 0.00010784999999999999, "loss": 2.6094, "step": 719 }, { "epoch": 0.4768211920529801, "grad_norm": 2.102721907639093, "learning_rate": 0.00010799999999999998, "loss": 2.7969, "step": 720 }, { "epoch": 0.4774834437086093, "grad_norm": 2.1921804967485627, "learning_rate": 0.00010814999999999999, "loss": 2.6875, "step": 721 }, { "epoch": 0.4781456953642384, "grad_norm": 2.1426187353732526, "learning_rate": 0.00010829999999999999, "loss": 2.7188, "step": 722 }, { "epoch": 0.47880794701986756, "grad_norm": 2.0710641618379264, "learning_rate": 0.00010845, "loss": 2.8125, "step": 723 }, { "epoch": 0.4794701986754967, "grad_norm": 2.0973632106241973, "learning_rate": 0.00010859999999999998, "loss": 2.8594, "step": 724 }, { "epoch": 0.48013245033112584, "grad_norm": 2.0027135007023915, "learning_rate": 0.00010874999999999999, "loss": 2.5625, "step": 725 }, { "epoch": 0.48079470198675495, "grad_norm": 1.7345598374122004, "learning_rate": 0.00010889999999999999, "loss": 2.5312, "step": 726 }, { "epoch": 0.4814569536423841, "grad_norm": 1.8985617274201236, "learning_rate": 0.00010904999999999998, "loss": 2.6094, "step": 727 }, { "epoch": 0.4821192052980132, "grad_norm": 1.8312647294044948, "learning_rate": 0.00010919999999999998, "loss": 2.6406, "step": 728 }, { "epoch": 0.4827814569536424, "grad_norm": 1.9805082205584479, "learning_rate": 0.00010934999999999999, "loss": 2.5781, "step": 729 }, { "epoch": 0.48344370860927155, "grad_norm": 2.071072549754882, "learning_rate": 0.00010949999999999999, "loss": 2.6719, "step": 730 }, { "epoch": 0.48410596026490066, "grad_norm": 1.9270290991948522, "learning_rate": 0.00010964999999999998, "loss": 2.75, "step": 731 }, { "epoch": 0.4847682119205298, "grad_norm": 1.956164832463769, "learning_rate": 0.00010979999999999999, "loss": 2.625, "step": 732 }, { "epoch": 0.48543046357615893, "grad_norm": 2.219261601014454, "learning_rate": 0.00010994999999999999, "loss": 3.0312, "step": 733 }, { "epoch": 0.4860927152317881, "grad_norm": 2.048459710802969, "learning_rate": 0.00011009999999999999, "loss": 2.8125, "step": 734 }, { "epoch": 0.4867549668874172, "grad_norm": 2.0887789379854698, "learning_rate": 0.00011024999999999998, "loss": 3.0625, "step": 735 }, { "epoch": 0.48741721854304637, "grad_norm": 1.821516856922936, "learning_rate": 0.00011039999999999999, "loss": 2.6406, "step": 736 }, { "epoch": 0.4880794701986755, "grad_norm": 1.8905215179124095, "learning_rate": 0.00011054999999999999, "loss": 2.9688, "step": 737 }, { "epoch": 0.48874172185430464, "grad_norm": 2.0583695629368086, "learning_rate": 0.0001107, "loss": 2.8594, "step": 738 }, { "epoch": 0.48940397350993375, "grad_norm": 1.9024595576268803, "learning_rate": 0.00011084999999999998, "loss": 2.6562, "step": 739 }, { "epoch": 0.4900662251655629, "grad_norm": 2.0054043650657287, "learning_rate": 0.00011099999999999999, "loss": 2.9375, "step": 740 }, { "epoch": 0.49072847682119203, "grad_norm": 2.0531208576121274, "learning_rate": 0.00011114999999999999, "loss": 2.5625, "step": 741 }, { "epoch": 0.4913907284768212, "grad_norm": 2.066240447010065, "learning_rate": 0.0001113, "loss": 2.7969, "step": 742 }, { "epoch": 0.49205298013245036, "grad_norm": 1.8133406532223322, "learning_rate": 0.00011144999999999998, "loss": 2.7969, "step": 743 }, { "epoch": 0.49271523178807947, "grad_norm": 2.0337928729066523, "learning_rate": 0.00011159999999999999, "loss": 2.625, "step": 744 }, { "epoch": 0.49337748344370863, "grad_norm": 1.92101661813358, "learning_rate": 0.00011174999999999999, "loss": 2.8594, "step": 745 }, { "epoch": 0.49403973509933774, "grad_norm": 1.7178335519352124, "learning_rate": 0.0001119, "loss": 2.75, "step": 746 }, { "epoch": 0.4947019867549669, "grad_norm": 1.9742838470927657, "learning_rate": 0.00011204999999999999, "loss": 3.0156, "step": 747 }, { "epoch": 0.495364238410596, "grad_norm": 2.0930678039048245, "learning_rate": 0.00011219999999999999, "loss": 2.9219, "step": 748 }, { "epoch": 0.4960264900662252, "grad_norm": 2.1000336000050717, "learning_rate": 0.00011235, "loss": 2.4531, "step": 749 }, { "epoch": 0.4966887417218543, "grad_norm": 1.689868266249042, "learning_rate": 0.0001125, "loss": 2.4688, "step": 750 }, { "epoch": 0.49735099337748345, "grad_norm": 2.3162031291766674, "learning_rate": 0.00011264999999999999, "loss": 3.1094, "step": 751 }, { "epoch": 0.49801324503311256, "grad_norm": 2.0474483134251655, "learning_rate": 0.00011279999999999999, "loss": 2.8438, "step": 752 }, { "epoch": 0.4986754966887417, "grad_norm": 1.6925971215396884, "learning_rate": 0.00011295, "loss": 2.8281, "step": 753 }, { "epoch": 0.49933774834437084, "grad_norm": 1.8550156080538551, "learning_rate": 0.00011309999999999998, "loss": 2.625, "step": 754 }, { "epoch": 0.5, "grad_norm": 1.8886038363874273, "learning_rate": 0.00011324999999999999, "loss": 2.6719, "step": 755 }, { "epoch": 0.5006622516556292, "grad_norm": 1.7207522598131173, "learning_rate": 0.00011339999999999999, "loss": 2.7812, "step": 756 }, { "epoch": 0.5013245033112583, "grad_norm": 1.7412690874170982, "learning_rate": 0.00011355, "loss": 2.6406, "step": 757 }, { "epoch": 0.5019867549668874, "grad_norm": 1.830895304100342, "learning_rate": 0.00011369999999999999, "loss": 2.6875, "step": 758 }, { "epoch": 0.5026490066225165, "grad_norm": 1.861511388202317, "learning_rate": 0.00011384999999999999, "loss": 3.0938, "step": 759 }, { "epoch": 0.5033112582781457, "grad_norm": 1.86325234077852, "learning_rate": 0.00011399999999999999, "loss": 2.2812, "step": 760 }, { "epoch": 0.5039735099337749, "grad_norm": 1.8623911294127775, "learning_rate": 0.00011415, "loss": 2.3906, "step": 761 }, { "epoch": 0.5046357615894039, "grad_norm": 2.1326876062244486, "learning_rate": 0.00011429999999999999, "loss": 3.0312, "step": 762 }, { "epoch": 0.5052980132450331, "grad_norm": 1.720494118454427, "learning_rate": 0.00011444999999999999, "loss": 2.5312, "step": 763 }, { "epoch": 0.5059602649006623, "grad_norm": 1.9522265564763535, "learning_rate": 0.0001146, "loss": 2.7031, "step": 764 }, { "epoch": 0.5066225165562914, "grad_norm": 2.2010704199527096, "learning_rate": 0.00011475, "loss": 2.5781, "step": 765 }, { "epoch": 0.5072847682119205, "grad_norm": 1.7752474504916527, "learning_rate": 0.00011489999999999999, "loss": 2.75, "step": 766 }, { "epoch": 0.5079470198675496, "grad_norm": 1.9573557173601654, "learning_rate": 0.00011504999999999999, "loss": 2.3906, "step": 767 }, { "epoch": 0.5086092715231788, "grad_norm": 2.32855193882003, "learning_rate": 0.0001152, "loss": 3.0469, "step": 768 }, { "epoch": 0.509271523178808, "grad_norm": 1.922631377589676, "learning_rate": 0.00011535, "loss": 2.7969, "step": 769 }, { "epoch": 0.5099337748344371, "grad_norm": 1.9175037190709405, "learning_rate": 0.00011549999999999999, "loss": 2.3125, "step": 770 }, { "epoch": 0.5105960264900662, "grad_norm": 2.6069228766491324, "learning_rate": 0.00011564999999999999, "loss": 2.7344, "step": 771 }, { "epoch": 0.5112582781456954, "grad_norm": 2.20275784898294, "learning_rate": 0.0001158, "loss": 2.7812, "step": 772 }, { "epoch": 0.5119205298013245, "grad_norm": 2.0780138867352433, "learning_rate": 0.00011595, "loss": 2.6719, "step": 773 }, { "epoch": 0.5125827814569537, "grad_norm": 1.9779476186374148, "learning_rate": 0.00011609999999999999, "loss": 2.2969, "step": 774 }, { "epoch": 0.5132450331125827, "grad_norm": 2.0382052240868065, "learning_rate": 0.00011624999999999999, "loss": 2.8594, "step": 775 }, { "epoch": 0.5139072847682119, "grad_norm": 1.6902369875968617, "learning_rate": 0.0001164, "loss": 2.625, "step": 776 }, { "epoch": 0.5145695364238411, "grad_norm": 1.9211441968645673, "learning_rate": 0.00011654999999999999, "loss": 2.7656, "step": 777 }, { "epoch": 0.5152317880794702, "grad_norm": 1.8897116323932281, "learning_rate": 0.00011669999999999999, "loss": 2.75, "step": 778 }, { "epoch": 0.5158940397350993, "grad_norm": 1.7821653101284112, "learning_rate": 0.00011685, "loss": 2.6719, "step": 779 }, { "epoch": 0.5165562913907285, "grad_norm": 1.8477325585046043, "learning_rate": 0.000117, "loss": 2.5, "step": 780 }, { "epoch": 0.5172185430463576, "grad_norm": 2.0495959384389857, "learning_rate": 0.00011714999999999999, "loss": 2.5469, "step": 781 }, { "epoch": 0.5178807947019868, "grad_norm": 1.7864365744361108, "learning_rate": 0.00011729999999999999, "loss": 2.6875, "step": 782 }, { "epoch": 0.5185430463576159, "grad_norm": 1.8740829841389737, "learning_rate": 0.00011745, "loss": 2.7031, "step": 783 }, { "epoch": 0.519205298013245, "grad_norm": 1.859576335191902, "learning_rate": 0.0001176, "loss": 2.7969, "step": 784 }, { "epoch": 0.5198675496688742, "grad_norm": 1.7466716734440257, "learning_rate": 0.00011774999999999999, "loss": 2.0938, "step": 785 }, { "epoch": 0.5205298013245033, "grad_norm": 2.170867807668343, "learning_rate": 0.00011789999999999999, "loss": 2.5469, "step": 786 }, { "epoch": 0.5211920529801325, "grad_norm": 13.077275299264398, "learning_rate": 0.00011805, "loss": 2.3438, "step": 787 }, { "epoch": 0.5218543046357615, "grad_norm": 2.0245919838606157, "learning_rate": 0.0001182, "loss": 2.7656, "step": 788 }, { "epoch": 0.5225165562913907, "grad_norm": 1.7673280376866838, "learning_rate": 0.00011834999999999999, "loss": 2.9219, "step": 789 }, { "epoch": 0.5231788079470199, "grad_norm": 2.1047909435557806, "learning_rate": 0.0001185, "loss": 2.7656, "step": 790 }, { "epoch": 0.523841059602649, "grad_norm": 1.878513072489257, "learning_rate": 0.00011865, "loss": 2.7812, "step": 791 }, { "epoch": 0.5245033112582781, "grad_norm": 1.8641099710480125, "learning_rate": 0.0001188, "loss": 2.7812, "step": 792 }, { "epoch": 0.5251655629139073, "grad_norm": 2.06959887252658, "learning_rate": 0.00011894999999999999, "loss": 2.8594, "step": 793 }, { "epoch": 0.5258278145695364, "grad_norm": 1.7496189165860319, "learning_rate": 0.0001191, "loss": 2.6875, "step": 794 }, { "epoch": 0.5264900662251656, "grad_norm": 1.9912100526135885, "learning_rate": 0.00011925, "loss": 2.875, "step": 795 }, { "epoch": 0.5271523178807948, "grad_norm": 1.8402023511717387, "learning_rate": 0.0001194, "loss": 2.4219, "step": 796 }, { "epoch": 0.5278145695364238, "grad_norm": 1.9057031816263097, "learning_rate": 0.00011954999999999999, "loss": 2.3594, "step": 797 }, { "epoch": 0.528476821192053, "grad_norm": 1.7956758706384683, "learning_rate": 0.0001197, "loss": 2.75, "step": 798 }, { "epoch": 0.5291390728476821, "grad_norm": 1.8087190772255215, "learning_rate": 0.00011985, "loss": 2.2031, "step": 799 }, { "epoch": 0.5298013245033113, "grad_norm": 1.6472549287780949, "learning_rate": 0.00011999999999999999, "loss": 2.5938, "step": 800 }, { "epoch": 0.5304635761589404, "grad_norm": 6.530946947138037, "learning_rate": 0.00012014999999999999, "loss": 3.0469, "step": 801 }, { "epoch": 0.5311258278145695, "grad_norm": 1.9019258907496284, "learning_rate": 0.0001203, "loss": 2.8125, "step": 802 }, { "epoch": 0.5317880794701987, "grad_norm": 1.7844438310074837, "learning_rate": 0.00012045, "loss": 2.9062, "step": 803 }, { "epoch": 0.5324503311258278, "grad_norm": 1.7783963177802107, "learning_rate": 0.00012059999999999999, "loss": 2.875, "step": 804 }, { "epoch": 0.5331125827814569, "grad_norm": 1.937865201568551, "learning_rate": 0.00012075, "loss": 2.8125, "step": 805 }, { "epoch": 0.5337748344370861, "grad_norm": 1.6858975743511662, "learning_rate": 0.0001209, "loss": 2.6094, "step": 806 }, { "epoch": 0.5344370860927152, "grad_norm": 1.9154947485714664, "learning_rate": 0.00012105, "loss": 3.0469, "step": 807 }, { "epoch": 0.5350993377483444, "grad_norm": 2.150974937389423, "learning_rate": 0.00012119999999999999, "loss": 2.5938, "step": 808 }, { "epoch": 0.5357615894039736, "grad_norm": 1.7799830052191583, "learning_rate": 0.00012135, "loss": 2.625, "step": 809 }, { "epoch": 0.5364238410596026, "grad_norm": 2.081073355218276, "learning_rate": 0.0001215, "loss": 2.7812, "step": 810 }, { "epoch": 0.5370860927152318, "grad_norm": 1.8018274757001718, "learning_rate": 0.00012165, "loss": 2.8281, "step": 811 }, { "epoch": 0.5377483443708609, "grad_norm": 1.8423803656335467, "learning_rate": 0.00012179999999999999, "loss": 2.6406, "step": 812 }, { "epoch": 0.5384105960264901, "grad_norm": 2.026709485612334, "learning_rate": 0.00012194999999999998, "loss": 3.0625, "step": 813 }, { "epoch": 0.5390728476821192, "grad_norm": 1.8936927142722246, "learning_rate": 0.00012209999999999999, "loss": 2.7188, "step": 814 }, { "epoch": 0.5397350993377483, "grad_norm": 1.8765357563989975, "learning_rate": 0.00012225, "loss": 2.2344, "step": 815 }, { "epoch": 0.5403973509933775, "grad_norm": 1.9195032529778533, "learning_rate": 0.0001224, "loss": 2.7344, "step": 816 }, { "epoch": 0.5410596026490067, "grad_norm": 1.8918723376543463, "learning_rate": 0.00012254999999999997, "loss": 2.7344, "step": 817 }, { "epoch": 0.5417218543046357, "grad_norm": 1.8326477133298356, "learning_rate": 0.00012269999999999997, "loss": 2.6094, "step": 818 }, { "epoch": 0.5423841059602649, "grad_norm": 1.7975830196144638, "learning_rate": 0.00012284999999999998, "loss": 2.6406, "step": 819 }, { "epoch": 0.543046357615894, "grad_norm": 1.8278078032739093, "learning_rate": 0.00012299999999999998, "loss": 2.625, "step": 820 }, { "epoch": 0.5437086092715232, "grad_norm": 1.8222155606884072, "learning_rate": 0.00012314999999999998, "loss": 2.6875, "step": 821 }, { "epoch": 0.5443708609271524, "grad_norm": 1.7723078578833016, "learning_rate": 0.0001233, "loss": 2.7812, "step": 822 }, { "epoch": 0.5450331125827814, "grad_norm": 1.7667910066878394, "learning_rate": 0.00012345, "loss": 2.4375, "step": 823 }, { "epoch": 0.5456953642384106, "grad_norm": 1.7770662078586374, "learning_rate": 0.0001236, "loss": 2.8281, "step": 824 }, { "epoch": 0.5463576158940397, "grad_norm": 2.0470842973375807, "learning_rate": 0.00012374999999999997, "loss": 2.7969, "step": 825 }, { "epoch": 0.5470198675496689, "grad_norm": 2.032816439716145, "learning_rate": 0.00012389999999999998, "loss": 2.6562, "step": 826 }, { "epoch": 0.547682119205298, "grad_norm": 1.9518305930955175, "learning_rate": 0.00012404999999999998, "loss": 2.8594, "step": 827 }, { "epoch": 0.5483443708609271, "grad_norm": 2.0702529032951333, "learning_rate": 0.00012419999999999998, "loss": 2.7812, "step": 828 }, { "epoch": 0.5490066225165563, "grad_norm": 1.7810457757515485, "learning_rate": 0.00012435, "loss": 2.7812, "step": 829 }, { "epoch": 0.5496688741721855, "grad_norm": 1.874339582643248, "learning_rate": 0.0001245, "loss": 2.9062, "step": 830 }, { "epoch": 0.5503311258278145, "grad_norm": 1.9115618687119742, "learning_rate": 0.00012465, "loss": 2.75, "step": 831 }, { "epoch": 0.5509933774834437, "grad_norm": 1.6226128753757272, "learning_rate": 0.00012479999999999997, "loss": 2.625, "step": 832 }, { "epoch": 0.5516556291390728, "grad_norm": 1.8403839566351927, "learning_rate": 0.00012494999999999997, "loss": 2.8281, "step": 833 }, { "epoch": 0.552317880794702, "grad_norm": 1.7786123114107988, "learning_rate": 0.00012509999999999998, "loss": 2.8438, "step": 834 }, { "epoch": 0.5529801324503312, "grad_norm": 1.9010352888493076, "learning_rate": 0.00012524999999999998, "loss": 2.1719, "step": 835 }, { "epoch": 0.5536423841059602, "grad_norm": 1.8555955810398956, "learning_rate": 0.00012539999999999999, "loss": 2.75, "step": 836 }, { "epoch": 0.5543046357615894, "grad_norm": 1.7684581460775344, "learning_rate": 0.00012555, "loss": 2.7969, "step": 837 }, { "epoch": 0.5549668874172186, "grad_norm": 1.8618762906510204, "learning_rate": 0.0001257, "loss": 2.2188, "step": 838 }, { "epoch": 0.5556291390728477, "grad_norm": 1.9331369675858678, "learning_rate": 0.00012585, "loss": 2.6875, "step": 839 }, { "epoch": 0.5562913907284768, "grad_norm": 1.82406671427144, "learning_rate": 0.00012599999999999997, "loss": 2.6875, "step": 840 }, { "epoch": 0.5569536423841059, "grad_norm": 1.772148670243457, "learning_rate": 0.00012614999999999998, "loss": 2.7031, "step": 841 }, { "epoch": 0.5576158940397351, "grad_norm": 1.7488091792948688, "learning_rate": 0.00012629999999999998, "loss": 2.8594, "step": 842 }, { "epoch": 0.5582781456953643, "grad_norm": 1.8065000955581838, "learning_rate": 0.00012644999999999998, "loss": 2.2188, "step": 843 }, { "epoch": 0.5589403973509933, "grad_norm": 1.7098073194862848, "learning_rate": 0.0001266, "loss": 2.6875, "step": 844 }, { "epoch": 0.5596026490066225, "grad_norm": 1.9531921753484562, "learning_rate": 0.00012675, "loss": 3.0, "step": 845 }, { "epoch": 0.5602649006622517, "grad_norm": 1.963000496497309, "learning_rate": 0.0001269, "loss": 2.9844, "step": 846 }, { "epoch": 0.5609271523178808, "grad_norm": 1.710875948917655, "learning_rate": 0.00012705, "loss": 2.7344, "step": 847 }, { "epoch": 0.56158940397351, "grad_norm": 2.4151996667056617, "learning_rate": 0.00012719999999999997, "loss": 2.7344, "step": 848 }, { "epoch": 0.562251655629139, "grad_norm": 1.8125472612640734, "learning_rate": 0.00012734999999999998, "loss": 2.8281, "step": 849 }, { "epoch": 0.5629139072847682, "grad_norm": 1.8140943095423947, "learning_rate": 0.00012749999999999998, "loss": 2.8594, "step": 850 }, { "epoch": 0.5635761589403974, "grad_norm": 1.7342284250214337, "learning_rate": 0.00012764999999999999, "loss": 2.2344, "step": 851 }, { "epoch": 0.5642384105960265, "grad_norm": 1.942351624840455, "learning_rate": 0.0001278, "loss": 3.0312, "step": 852 }, { "epoch": 0.5649006622516556, "grad_norm": 1.7930118269153643, "learning_rate": 0.00012795, "loss": 2.8906, "step": 853 }, { "epoch": 0.5655629139072847, "grad_norm": 1.9409660419617345, "learning_rate": 0.0001281, "loss": 2.9219, "step": 854 }, { "epoch": 0.5662251655629139, "grad_norm": 1.7411916959780829, "learning_rate": 0.00012824999999999997, "loss": 2.5938, "step": 855 }, { "epoch": 0.5668874172185431, "grad_norm": 1.7628585189329957, "learning_rate": 0.00012839999999999998, "loss": 2.8594, "step": 856 }, { "epoch": 0.5675496688741721, "grad_norm": 1.9807303407821348, "learning_rate": 0.00012854999999999998, "loss": 2.75, "step": 857 }, { "epoch": 0.5682119205298013, "grad_norm": 1.8718308918108135, "learning_rate": 0.00012869999999999998, "loss": 2.4062, "step": 858 }, { "epoch": 0.5688741721854305, "grad_norm": 1.9789335376073256, "learning_rate": 0.00012885, "loss": 2.7188, "step": 859 }, { "epoch": 0.5695364238410596, "grad_norm": 1.7599091552080293, "learning_rate": 0.000129, "loss": 2.7031, "step": 860 }, { "epoch": 0.5701986754966888, "grad_norm": 1.8880914188513047, "learning_rate": 0.00012915, "loss": 2.8594, "step": 861 }, { "epoch": 0.5708609271523178, "grad_norm": 1.7033803644464172, "learning_rate": 0.0001293, "loss": 2.8125, "step": 862 }, { "epoch": 0.571523178807947, "grad_norm": 1.6279207101353814, "learning_rate": 0.00012944999999999998, "loss": 2.2969, "step": 863 }, { "epoch": 0.5721854304635762, "grad_norm": 1.7806742967800837, "learning_rate": 0.00012959999999999998, "loss": 2.625, "step": 864 }, { "epoch": 0.5728476821192053, "grad_norm": 1.6403256333752165, "learning_rate": 0.00012974999999999998, "loss": 2.6562, "step": 865 }, { "epoch": 0.5735099337748344, "grad_norm": 1.7393439532601982, "learning_rate": 0.00012989999999999999, "loss": 2.9688, "step": 866 }, { "epoch": 0.5741721854304636, "grad_norm": 1.84486249966741, "learning_rate": 0.00013005, "loss": 2.9375, "step": 867 }, { "epoch": 0.5748344370860927, "grad_norm": 1.6615087785848357, "learning_rate": 0.0001302, "loss": 2.2656, "step": 868 }, { "epoch": 0.5754966887417219, "grad_norm": 1.7348453249349052, "learning_rate": 0.00013035, "loss": 2.75, "step": 869 }, { "epoch": 0.5761589403973509, "grad_norm": 1.8505684990104836, "learning_rate": 0.0001305, "loss": 2.6719, "step": 870 }, { "epoch": 0.5768211920529801, "grad_norm": 1.7487787552640692, "learning_rate": 0.00013064999999999998, "loss": 2.9219, "step": 871 }, { "epoch": 0.5774834437086093, "grad_norm": 1.6390476132277572, "learning_rate": 0.00013079999999999998, "loss": 2.4531, "step": 872 }, { "epoch": 0.5781456953642384, "grad_norm": 6.920371927179222, "learning_rate": 0.00013094999999999998, "loss": 2.875, "step": 873 }, { "epoch": 0.5788079470198676, "grad_norm": 1.6653294520602393, "learning_rate": 0.0001311, "loss": 2.7656, "step": 874 }, { "epoch": 0.5794701986754967, "grad_norm": 1.9043293860322916, "learning_rate": 0.00013125, "loss": 2.625, "step": 875 }, { "epoch": 0.5801324503311258, "grad_norm": 1.7400297709953005, "learning_rate": 0.0001314, "loss": 2.7812, "step": 876 }, { "epoch": 0.580794701986755, "grad_norm": 2.127468788413566, "learning_rate": 0.00013155, "loss": 2.3125, "step": 877 }, { "epoch": 0.5814569536423841, "grad_norm": 3.3141446300449053, "learning_rate": 0.00013169999999999998, "loss": 2.75, "step": 878 }, { "epoch": 0.5821192052980132, "grad_norm": 1.6401374573515344, "learning_rate": 0.00013184999999999998, "loss": 2.4844, "step": 879 }, { "epoch": 0.5827814569536424, "grad_norm": 2.092500690803508, "learning_rate": 0.00013199999999999998, "loss": 2.4844, "step": 880 }, { "epoch": 0.5834437086092715, "grad_norm": 1.7528797960299052, "learning_rate": 0.00013215, "loss": 2.75, "step": 881 }, { "epoch": 0.5841059602649007, "grad_norm": 1.63393423635461, "learning_rate": 0.0001323, "loss": 2.6094, "step": 882 }, { "epoch": 0.5847682119205299, "grad_norm": 1.9253028232849625, "learning_rate": 0.00013245, "loss": 2.75, "step": 883 }, { "epoch": 0.5854304635761589, "grad_norm": 1.715249792015313, "learning_rate": 0.0001326, "loss": 2.7812, "step": 884 }, { "epoch": 0.5860927152317881, "grad_norm": 1.6981377614464783, "learning_rate": 0.00013275, "loss": 2.6562, "step": 885 }, { "epoch": 0.5867549668874172, "grad_norm": 1.8802317299290754, "learning_rate": 0.00013289999999999998, "loss": 2.9531, "step": 886 }, { "epoch": 0.5874172185430464, "grad_norm": 1.739780615891697, "learning_rate": 0.00013304999999999998, "loss": 2.2812, "step": 887 }, { "epoch": 0.5880794701986755, "grad_norm": 1.7960771387390673, "learning_rate": 0.00013319999999999999, "loss": 2.7969, "step": 888 }, { "epoch": 0.5887417218543046, "grad_norm": 1.5937602429617455, "learning_rate": 0.00013335, "loss": 2.3906, "step": 889 }, { "epoch": 0.5894039735099338, "grad_norm": 1.792822056451431, "learning_rate": 0.0001335, "loss": 2.6719, "step": 890 }, { "epoch": 0.590066225165563, "grad_norm": 1.6338513392343388, "learning_rate": 0.00013365, "loss": 2.5938, "step": 891 }, { "epoch": 0.590728476821192, "grad_norm": 11.809064105651226, "learning_rate": 0.0001338, "loss": 2.75, "step": 892 }, { "epoch": 0.5913907284768212, "grad_norm": 2.052321374588075, "learning_rate": 0.00013395, "loss": 2.7031, "step": 893 }, { "epoch": 0.5920529801324503, "grad_norm": 1.6701304241121868, "learning_rate": 0.00013409999999999998, "loss": 2.2031, "step": 894 }, { "epoch": 0.5927152317880795, "grad_norm": 1.9535092787535708, "learning_rate": 0.00013424999999999998, "loss": 2.625, "step": 895 }, { "epoch": 0.5933774834437087, "grad_norm": 2.0016210945510418, "learning_rate": 0.0001344, "loss": 2.625, "step": 896 }, { "epoch": 0.5940397350993377, "grad_norm": 1.8708541582692386, "learning_rate": 0.00013455, "loss": 2.625, "step": 897 }, { "epoch": 0.5947019867549669, "grad_norm": 13.725045749176015, "learning_rate": 0.0001347, "loss": 2.7031, "step": 898 }, { "epoch": 0.595364238410596, "grad_norm": 1.8149991284796483, "learning_rate": 0.00013485, "loss": 2.6094, "step": 899 }, { "epoch": 0.5960264900662252, "grad_norm": 1.866814167931646, "learning_rate": 0.000135, "loss": 2.75, "step": 900 }, { "epoch": 0.5966887417218543, "grad_norm": 1.5959615345966283, "learning_rate": 0.00013514999999999998, "loss": 2.9062, "step": 901 }, { "epoch": 0.5973509933774834, "grad_norm": 1.808896172507155, "learning_rate": 0.00013529999999999998, "loss": 2.7188, "step": 902 }, { "epoch": 0.5980132450331126, "grad_norm": 2.043905475817425, "learning_rate": 0.00013544999999999999, "loss": 2.7188, "step": 903 }, { "epoch": 0.5986754966887418, "grad_norm": 1.6476851986779213, "learning_rate": 0.0001356, "loss": 2.9688, "step": 904 }, { "epoch": 0.5993377483443708, "grad_norm": 1.6072455464870483, "learning_rate": 0.00013575, "loss": 2.7344, "step": 905 }, { "epoch": 0.6, "grad_norm": 1.8522763141275878, "learning_rate": 0.0001359, "loss": 2.6875, "step": 906 }, { "epoch": 0.6006622516556291, "grad_norm": 1.695948375368472, "learning_rate": 0.00013605, "loss": 2.5625, "step": 907 }, { "epoch": 0.6013245033112583, "grad_norm": 1.6639403183630144, "learning_rate": 0.0001362, "loss": 2.6562, "step": 908 }, { "epoch": 0.6019867549668875, "grad_norm": 1.8018234664253343, "learning_rate": 0.00013634999999999998, "loss": 2.7188, "step": 909 }, { "epoch": 0.6026490066225165, "grad_norm": 1.677039196367644, "learning_rate": 0.00013649999999999998, "loss": 2.4688, "step": 910 }, { "epoch": 0.6033112582781457, "grad_norm": 1.5578843517473313, "learning_rate": 0.00013665, "loss": 2.3125, "step": 911 }, { "epoch": 0.6039735099337749, "grad_norm": 1.7357545254241622, "learning_rate": 0.0001368, "loss": 2.5625, "step": 912 }, { "epoch": 0.604635761589404, "grad_norm": 1.9213122814043826, "learning_rate": 0.00013695, "loss": 2.8438, "step": 913 }, { "epoch": 0.6052980132450331, "grad_norm": 1.649019879446452, "learning_rate": 0.0001371, "loss": 2.5312, "step": 914 }, { "epoch": 0.6059602649006622, "grad_norm": 1.686970394985107, "learning_rate": 0.00013725, "loss": 2.8594, "step": 915 }, { "epoch": 0.6066225165562914, "grad_norm": 1.7640827305030788, "learning_rate": 0.0001374, "loss": 2.3125, "step": 916 }, { "epoch": 0.6072847682119206, "grad_norm": 1.6961250615744448, "learning_rate": 0.00013754999999999998, "loss": 2.6094, "step": 917 }, { "epoch": 0.6079470198675496, "grad_norm": 2.016597384861727, "learning_rate": 0.00013769999999999999, "loss": 2.875, "step": 918 }, { "epoch": 0.6086092715231788, "grad_norm": 1.676860467349367, "learning_rate": 0.00013785, "loss": 2.7969, "step": 919 }, { "epoch": 0.609271523178808, "grad_norm": 2.4126555280345108, "learning_rate": 0.000138, "loss": 2.875, "step": 920 }, { "epoch": 0.6099337748344371, "grad_norm": 2.0035792229007314, "learning_rate": 0.00013815, "loss": 2.7656, "step": 921 }, { "epoch": 0.6105960264900663, "grad_norm": 1.6268260828469265, "learning_rate": 0.0001383, "loss": 2.7031, "step": 922 }, { "epoch": 0.6112582781456953, "grad_norm": 1.4593864314131886, "learning_rate": 0.00013845, "loss": 2.625, "step": 923 }, { "epoch": 0.6119205298013245, "grad_norm": 1.5424854855511951, "learning_rate": 0.0001386, "loss": 2.2031, "step": 924 }, { "epoch": 0.6125827814569537, "grad_norm": 2.75304929730052, "learning_rate": 0.00013874999999999998, "loss": 2.9219, "step": 925 }, { "epoch": 0.6132450331125828, "grad_norm": 2.0559003214152507, "learning_rate": 0.0001389, "loss": 2.7969, "step": 926 }, { "epoch": 0.6139072847682119, "grad_norm": 1.9994822373639374, "learning_rate": 0.00013905, "loss": 2.8594, "step": 927 }, { "epoch": 0.614569536423841, "grad_norm": 1.6849127577336724, "learning_rate": 0.0001392, "loss": 2.6875, "step": 928 }, { "epoch": 0.6152317880794702, "grad_norm": 1.8266263209664524, "learning_rate": 0.00013935, "loss": 2.7969, "step": 929 }, { "epoch": 0.6158940397350994, "grad_norm": 1.6556111186298015, "learning_rate": 0.0001395, "loss": 2.7188, "step": 930 }, { "epoch": 0.6165562913907284, "grad_norm": 1.5679796143528912, "learning_rate": 0.00013965, "loss": 2.5781, "step": 931 }, { "epoch": 0.6172185430463576, "grad_norm": 1.58608251661062, "learning_rate": 0.00013979999999999998, "loss": 2.7031, "step": 932 }, { "epoch": 0.6178807947019868, "grad_norm": 3.047964856430622, "learning_rate": 0.00013995, "loss": 2.7031, "step": 933 }, { "epoch": 0.6185430463576159, "grad_norm": 1.745642590387234, "learning_rate": 0.0001401, "loss": 2.625, "step": 934 }, { "epoch": 0.6192052980132451, "grad_norm": 1.6562030519640611, "learning_rate": 0.00014025, "loss": 2.7656, "step": 935 }, { "epoch": 0.6198675496688741, "grad_norm": 1.716649912453909, "learning_rate": 0.0001404, "loss": 2.75, "step": 936 }, { "epoch": 0.6205298013245033, "grad_norm": 1.5208361491671771, "learning_rate": 0.00014055, "loss": 2.75, "step": 937 }, { "epoch": 0.6211920529801325, "grad_norm": 1.70209635066516, "learning_rate": 0.00014069999999999998, "loss": 2.6719, "step": 938 }, { "epoch": 0.6218543046357616, "grad_norm": 1.6688952814411828, "learning_rate": 0.00014084999999999998, "loss": 2.7344, "step": 939 }, { "epoch": 0.6225165562913907, "grad_norm": 1.6280279993384466, "learning_rate": 0.00014099999999999998, "loss": 2.7969, "step": 940 }, { "epoch": 0.6231788079470199, "grad_norm": 1.6652758556387341, "learning_rate": 0.00014115, "loss": 2.7031, "step": 941 }, { "epoch": 0.623841059602649, "grad_norm": 1.739727663333885, "learning_rate": 0.0001413, "loss": 2.6562, "step": 942 }, { "epoch": 0.6245033112582782, "grad_norm": 1.737100148044706, "learning_rate": 0.00014144999999999997, "loss": 2.9531, "step": 943 }, { "epoch": 0.6251655629139072, "grad_norm": 1.8127289729274938, "learning_rate": 0.00014159999999999997, "loss": 2.7656, "step": 944 }, { "epoch": 0.6258278145695364, "grad_norm": 3.9350798549343593, "learning_rate": 0.00014174999999999998, "loss": 2.7812, "step": 945 }, { "epoch": 0.6264900662251656, "grad_norm": 1.840507817769587, "learning_rate": 0.00014189999999999998, "loss": 2.875, "step": 946 }, { "epoch": 0.6271523178807947, "grad_norm": 1.9119205332764484, "learning_rate": 0.00014204999999999998, "loss": 2.9375, "step": 947 }, { "epoch": 0.6278145695364239, "grad_norm": 1.7183680360721574, "learning_rate": 0.0001422, "loss": 2.8594, "step": 948 }, { "epoch": 0.628476821192053, "grad_norm": 1.6202688773542453, "learning_rate": 0.00014235, "loss": 2.625, "step": 949 }, { "epoch": 0.6291390728476821, "grad_norm": 1.7141548790172465, "learning_rate": 0.0001425, "loss": 2.75, "step": 950 }, { "epoch": 0.6298013245033113, "grad_norm": 1.623418135236663, "learning_rate": 0.00014264999999999997, "loss": 2.5625, "step": 951 }, { "epoch": 0.6304635761589404, "grad_norm": 1.6054248191490805, "learning_rate": 0.00014279999999999997, "loss": 2.3906, "step": 952 }, { "epoch": 0.6311258278145695, "grad_norm": 1.9691784389211007, "learning_rate": 0.00014294999999999998, "loss": 3.0625, "step": 953 }, { "epoch": 0.6317880794701987, "grad_norm": 1.912655902154441, "learning_rate": 0.00014309999999999998, "loss": 2.7188, "step": 954 }, { "epoch": 0.6324503311258278, "grad_norm": 1.6952050742878066, "learning_rate": 0.00014324999999999999, "loss": 2.5625, "step": 955 }, { "epoch": 0.633112582781457, "grad_norm": 3.434851730611255, "learning_rate": 0.0001434, "loss": 2.7812, "step": 956 }, { "epoch": 0.633774834437086, "grad_norm": 1.7388535992297727, "learning_rate": 0.00014355, "loss": 2.7969, "step": 957 }, { "epoch": 0.6344370860927152, "grad_norm": 1.6055065300055347, "learning_rate": 0.00014369999999999997, "loss": 2.8281, "step": 958 }, { "epoch": 0.6350993377483444, "grad_norm": 1.712475597222323, "learning_rate": 0.00014384999999999997, "loss": 2.6406, "step": 959 }, { "epoch": 0.6357615894039735, "grad_norm": 2.08335751499764, "learning_rate": 0.00014399999999999998, "loss": 2.8125, "step": 960 }, { "epoch": 0.6364238410596027, "grad_norm": 1.8171953911096144, "learning_rate": 0.00014414999999999998, "loss": 2.75, "step": 961 }, { "epoch": 0.6370860927152318, "grad_norm": 1.8735260700476095, "learning_rate": 0.00014429999999999998, "loss": 2.9219, "step": 962 }, { "epoch": 0.6377483443708609, "grad_norm": 1.8277704598450062, "learning_rate": 0.00014445, "loss": 2.8594, "step": 963 }, { "epoch": 0.6384105960264901, "grad_norm": 1.8210905265718296, "learning_rate": 0.0001446, "loss": 2.6562, "step": 964 }, { "epoch": 0.6390728476821192, "grad_norm": 1.505712954329362, "learning_rate": 0.00014475, "loss": 2.7656, "step": 965 }, { "epoch": 0.6397350993377483, "grad_norm": 1.918094753000461, "learning_rate": 0.00014489999999999997, "loss": 2.5938, "step": 966 }, { "epoch": 0.6403973509933775, "grad_norm": 1.767840050701229, "learning_rate": 0.00014504999999999997, "loss": 2.8281, "step": 967 }, { "epoch": 0.6410596026490066, "grad_norm": 1.653334231483156, "learning_rate": 0.00014519999999999998, "loss": 2.6875, "step": 968 }, { "epoch": 0.6417218543046358, "grad_norm": 2.0702035722707492, "learning_rate": 0.00014534999999999998, "loss": 3.0156, "step": 969 }, { "epoch": 0.6423841059602649, "grad_norm": 1.8596393640124391, "learning_rate": 0.00014549999999999999, "loss": 2.8281, "step": 970 }, { "epoch": 0.643046357615894, "grad_norm": 2.000203493554406, "learning_rate": 0.00014565, "loss": 2.6406, "step": 971 }, { "epoch": 0.6437086092715232, "grad_norm": 1.7274961291458457, "learning_rate": 0.0001458, "loss": 2.7969, "step": 972 }, { "epoch": 0.6443708609271523, "grad_norm": 1.9269270903513855, "learning_rate": 0.00014595, "loss": 2.3906, "step": 973 }, { "epoch": 0.6450331125827815, "grad_norm": 3.1400011087166444, "learning_rate": 0.00014609999999999997, "loss": 3.0625, "step": 974 }, { "epoch": 0.6456953642384106, "grad_norm": 1.7523819664212454, "learning_rate": 0.00014624999999999998, "loss": 2.6875, "step": 975 }, { "epoch": 0.6463576158940397, "grad_norm": 25.527572332226274, "learning_rate": 0.00014639999999999998, "loss": 3.0156, "step": 976 }, { "epoch": 0.6470198675496689, "grad_norm": 1.8407935632318109, "learning_rate": 0.00014654999999999998, "loss": 2.6719, "step": 977 }, { "epoch": 0.6476821192052981, "grad_norm": 1.9471448024888802, "learning_rate": 0.0001467, "loss": 2.6719, "step": 978 }, { "epoch": 0.6483443708609271, "grad_norm": 1.563538400749834, "learning_rate": 0.00014685, "loss": 2.5, "step": 979 }, { "epoch": 0.6490066225165563, "grad_norm": 1.916901970502345, "learning_rate": 0.000147, "loss": 2.7969, "step": 980 }, { "epoch": 0.6496688741721854, "grad_norm": 1.6939028458532706, "learning_rate": 0.00014714999999999997, "loss": 2.8594, "step": 981 }, { "epoch": 0.6503311258278146, "grad_norm": 1.671618081457123, "learning_rate": 0.00014729999999999998, "loss": 2.6406, "step": 982 }, { "epoch": 0.6509933774834437, "grad_norm": 2.3761553135046967, "learning_rate": 0.00014744999999999998, "loss": 3.1406, "step": 983 }, { "epoch": 0.6516556291390728, "grad_norm": 1.8718716012885332, "learning_rate": 0.00014759999999999998, "loss": 2.7031, "step": 984 }, { "epoch": 0.652317880794702, "grad_norm": 57.70429864989221, "learning_rate": 0.00014774999999999999, "loss": 3.1719, "step": 985 }, { "epoch": 0.6529801324503312, "grad_norm": 3.0096527716779415, "learning_rate": 0.0001479, "loss": 2.3281, "step": 986 }, { "epoch": 0.6536423841059603, "grad_norm": 82.97157024276366, "learning_rate": 0.00014805, "loss": 3.75, "step": 987 }, { "epoch": 0.6543046357615894, "grad_norm": 5.726878910172308, "learning_rate": 0.0001482, "loss": 3.1875, "step": 988 }, { "epoch": 0.6549668874172185, "grad_norm": 5.335748417256277, "learning_rate": 0.00014834999999999997, "loss": 3.125, "step": 989 }, { "epoch": 0.6556291390728477, "grad_norm": 3.9508229302334, "learning_rate": 0.00014849999999999998, "loss": 2.4688, "step": 990 }, { "epoch": 0.6562913907284769, "grad_norm": 3.9259101951527584, "learning_rate": 0.00014864999999999998, "loss": 2.8594, "step": 991 }, { "epoch": 0.6569536423841059, "grad_norm": 2.2498843377397946, "learning_rate": 0.00014879999999999998, "loss": 2.9219, "step": 992 }, { "epoch": 0.6576158940397351, "grad_norm": 2.7333916534135865, "learning_rate": 0.00014895, "loss": 2.7969, "step": 993 }, { "epoch": 0.6582781456953642, "grad_norm": 2.505994662255294, "learning_rate": 0.0001491, "loss": 2.8281, "step": 994 }, { "epoch": 0.6589403973509934, "grad_norm": 7.310629741063587, "learning_rate": 0.00014925, "loss": 2.7188, "step": 995 }, { "epoch": 0.6596026490066225, "grad_norm": 7.104904348913685, "learning_rate": 0.0001494, "loss": 2.875, "step": 996 }, { "epoch": 0.6602649006622516, "grad_norm": 2.574295802004421, "learning_rate": 0.00014954999999999998, "loss": 2.8281, "step": 997 }, { "epoch": 0.6609271523178808, "grad_norm": 2.1454345008062816, "learning_rate": 0.00014969999999999998, "loss": 2.9062, "step": 998 }, { "epoch": 0.66158940397351, "grad_norm": 10.795391745321732, "learning_rate": 0.00014984999999999998, "loss": 2.9062, "step": 999 }, { "epoch": 0.6622516556291391, "grad_norm": 10.886884174593044, "learning_rate": 0.00015, "loss": 2.6562, "step": 1000 }, { "epoch": 0.6629139072847682, "grad_norm": 2.6117790463445445, "learning_rate": 0.00015014999999999996, "loss": 2.3906, "step": 1001 }, { "epoch": 0.6635761589403973, "grad_norm": 1.9249744313079986, "learning_rate": 0.0001503, "loss": 2.4219, "step": 1002 }, { "epoch": 0.6642384105960265, "grad_norm": 1.8509413229809597, "learning_rate": 0.00015044999999999997, "loss": 2.5625, "step": 1003 }, { "epoch": 0.6649006622516557, "grad_norm": 4.9498943833068605, "learning_rate": 0.00015059999999999997, "loss": 3.3281, "step": 1004 }, { "epoch": 0.6655629139072847, "grad_norm": 2.4285161977290506, "learning_rate": 0.00015074999999999998, "loss": 2.7969, "step": 1005 }, { "epoch": 0.6662251655629139, "grad_norm": 1.983827443090624, "learning_rate": 0.00015089999999999998, "loss": 2.875, "step": 1006 }, { "epoch": 0.666887417218543, "grad_norm": 2.17557794399654, "learning_rate": 0.00015104999999999996, "loss": 2.6562, "step": 1007 }, { "epoch": 0.6675496688741722, "grad_norm": 1.9396356928330014, "learning_rate": 0.0001512, "loss": 2.7812, "step": 1008 }, { "epoch": 0.6682119205298013, "grad_norm": 1.7949651075224655, "learning_rate": 0.00015134999999999997, "loss": 2.6719, "step": 1009 }, { "epoch": 0.6688741721854304, "grad_norm": 1.7589255366193848, "learning_rate": 0.0001515, "loss": 2.8281, "step": 1010 }, { "epoch": 0.6695364238410596, "grad_norm": 1.738110057575885, "learning_rate": 0.00015164999999999997, "loss": 2.625, "step": 1011 }, { "epoch": 0.6701986754966888, "grad_norm": 1.827687088024332, "learning_rate": 0.00015179999999999998, "loss": 2.8125, "step": 1012 }, { "epoch": 0.6708609271523179, "grad_norm": 1.7129323981113438, "learning_rate": 0.00015194999999999998, "loss": 2.9688, "step": 1013 }, { "epoch": 0.671523178807947, "grad_norm": 1.8001785819020812, "learning_rate": 0.00015209999999999998, "loss": 2.5781, "step": 1014 }, { "epoch": 0.6721854304635762, "grad_norm": 6.554720271804624, "learning_rate": 0.00015224999999999996, "loss": 2.75, "step": 1015 }, { "epoch": 0.6728476821192053, "grad_norm": 1.7577700458290837, "learning_rate": 0.0001524, "loss": 2.1719, "step": 1016 }, { "epoch": 0.6735099337748345, "grad_norm": 1.9973660960273123, "learning_rate": 0.00015254999999999997, "loss": 2.8906, "step": 1017 }, { "epoch": 0.6741721854304635, "grad_norm": 4.426299592921524, "learning_rate": 0.0001527, "loss": 2.7344, "step": 1018 }, { "epoch": 0.6748344370860927, "grad_norm": 2.45074666365465, "learning_rate": 0.00015284999999999997, "loss": 2.625, "step": 1019 }, { "epoch": 0.6754966887417219, "grad_norm": 9.857719662659395, "learning_rate": 0.00015299999999999998, "loss": 3.1094, "step": 1020 }, { "epoch": 0.676158940397351, "grad_norm": 3.1033165148249817, "learning_rate": 0.00015314999999999998, "loss": 2.8125, "step": 1021 }, { "epoch": 0.6768211920529801, "grad_norm": 1.6451130588089793, "learning_rate": 0.00015329999999999999, "loss": 2.4844, "step": 1022 }, { "epoch": 0.6774834437086092, "grad_norm": 13.75871382888496, "learning_rate": 0.00015344999999999996, "loss": 3.0, "step": 1023 }, { "epoch": 0.6781456953642384, "grad_norm": 2.282975667806467, "learning_rate": 0.0001536, "loss": 2.9219, "step": 1024 }, { "epoch": 0.6788079470198676, "grad_norm": 7.514357103470541, "learning_rate": 0.00015374999999999997, "loss": 2.8594, "step": 1025 }, { "epoch": 0.6794701986754967, "grad_norm": 1.6854414816745074, "learning_rate": 0.0001539, "loss": 2.6094, "step": 1026 }, { "epoch": 0.6801324503311258, "grad_norm": 2.0967675538747717, "learning_rate": 0.00015404999999999998, "loss": 3.0312, "step": 1027 }, { "epoch": 0.680794701986755, "grad_norm": 6.4179893390034675, "learning_rate": 0.00015419999999999998, "loss": 3.1094, "step": 1028 }, { "epoch": 0.6814569536423841, "grad_norm": 1.9097735661715451, "learning_rate": 0.00015434999999999998, "loss": 3.0156, "step": 1029 }, { "epoch": 0.6821192052980133, "grad_norm": 2.5422131026892996, "learning_rate": 0.0001545, "loss": 2.8438, "step": 1030 }, { "epoch": 0.6827814569536423, "grad_norm": 1.927948696981671, "learning_rate": 0.00015464999999999996, "loss": 2.8438, "step": 1031 }, { "epoch": 0.6834437086092715, "grad_norm": 1.64953856454503, "learning_rate": 0.0001548, "loss": 2.2656, "step": 1032 }, { "epoch": 0.6841059602649007, "grad_norm": 1.6065647789242645, "learning_rate": 0.00015494999999999997, "loss": 2.5312, "step": 1033 }, { "epoch": 0.6847682119205298, "grad_norm": 2.040874751706338, "learning_rate": 0.0001551, "loss": 2.7656, "step": 1034 }, { "epoch": 0.6854304635761589, "grad_norm": 2.274138839387037, "learning_rate": 0.00015524999999999998, "loss": 2.7656, "step": 1035 }, { "epoch": 0.686092715231788, "grad_norm": 1.6322645562043083, "learning_rate": 0.00015539999999999998, "loss": 2.2031, "step": 1036 }, { "epoch": 0.6867549668874172, "grad_norm": 1.9914365557392222, "learning_rate": 0.00015554999999999999, "loss": 2.6719, "step": 1037 }, { "epoch": 0.6874172185430464, "grad_norm": 1.838952202627971, "learning_rate": 0.0001557, "loss": 2.8281, "step": 1038 }, { "epoch": 0.6880794701986755, "grad_norm": 1.7542502181405408, "learning_rate": 0.00015584999999999997, "loss": 2.5625, "step": 1039 }, { "epoch": 0.6887417218543046, "grad_norm": 1.8490427296896466, "learning_rate": 0.000156, "loss": 2.7656, "step": 1040 }, { "epoch": 0.6894039735099338, "grad_norm": 2.1268093649368414, "learning_rate": 0.00015614999999999997, "loss": 2.7656, "step": 1041 }, { "epoch": 0.6900662251655629, "grad_norm": 1.6469146169253956, "learning_rate": 0.0001563, "loss": 2.5312, "step": 1042 }, { "epoch": 0.6907284768211921, "grad_norm": 19.612814415289833, "learning_rate": 0.00015644999999999998, "loss": 2.875, "step": 1043 }, { "epoch": 0.6913907284768211, "grad_norm": 2.307580353944933, "learning_rate": 0.00015659999999999998, "loss": 2.7031, "step": 1044 }, { "epoch": 0.6920529801324503, "grad_norm": 3.7436321331711726, "learning_rate": 0.00015675, "loss": 2.6719, "step": 1045 }, { "epoch": 0.6927152317880795, "grad_norm": 3.4970479060494783, "learning_rate": 0.0001569, "loss": 2.6562, "step": 1046 }, { "epoch": 0.6933774834437086, "grad_norm": 4.035166923923399, "learning_rate": 0.00015704999999999997, "loss": 2.9688, "step": 1047 }, { "epoch": 0.6940397350993377, "grad_norm": 82.00195264618144, "learning_rate": 0.0001572, "loss": 3.0469, "step": 1048 }, { "epoch": 0.6947019867549669, "grad_norm": 2.111553657317677, "learning_rate": 0.00015734999999999998, "loss": 2.2812, "step": 1049 }, { "epoch": 0.695364238410596, "grad_norm": 1.848216894262673, "learning_rate": 0.00015749999999999998, "loss": 2.3125, "step": 1050 }, { "epoch": 0.6960264900662252, "grad_norm": 3.5719733930568136, "learning_rate": 0.00015764999999999998, "loss": 2.8438, "step": 1051 }, { "epoch": 0.6966887417218544, "grad_norm": 3.4325585709821635, "learning_rate": 0.0001578, "loss": 2.8281, "step": 1052 }, { "epoch": 0.6973509933774834, "grad_norm": 2.2526681835147433, "learning_rate": 0.00015794999999999996, "loss": 2.8906, "step": 1053 }, { "epoch": 0.6980132450331126, "grad_norm": 2.157125414703428, "learning_rate": 0.0001581, "loss": 2.7344, "step": 1054 }, { "epoch": 0.6986754966887417, "grad_norm": 2.3286727163211607, "learning_rate": 0.00015824999999999997, "loss": 2.9531, "step": 1055 }, { "epoch": 0.6993377483443709, "grad_norm": 13.342268056820014, "learning_rate": 0.0001584, "loss": 2.9375, "step": 1056 }, { "epoch": 0.7, "grad_norm": 3.521314202240274, "learning_rate": 0.00015854999999999998, "loss": 2.875, "step": 1057 }, { "epoch": 0.7006622516556291, "grad_norm": 1.9241392967254913, "learning_rate": 0.00015869999999999998, "loss": 2.5, "step": 1058 }, { "epoch": 0.7013245033112583, "grad_norm": 7.157991129787909, "learning_rate": 0.00015884999999999999, "loss": 2.7344, "step": 1059 }, { "epoch": 0.7019867549668874, "grad_norm": 2.2843200199431157, "learning_rate": 0.000159, "loss": 2.7188, "step": 1060 }, { "epoch": 0.7026490066225165, "grad_norm": 1.670884075377115, "learning_rate": 0.00015914999999999997, "loss": 2.5625, "step": 1061 }, { "epoch": 0.7033112582781457, "grad_norm": 1.5757300597782906, "learning_rate": 0.0001593, "loss": 2.3438, "step": 1062 }, { "epoch": 0.7039735099337748, "grad_norm": 1.7203881141189485, "learning_rate": 0.00015944999999999997, "loss": 2.9375, "step": 1063 }, { "epoch": 0.704635761589404, "grad_norm": 1.8323788540640873, "learning_rate": 0.0001596, "loss": 3.1562, "step": 1064 }, { "epoch": 0.7052980132450332, "grad_norm": 4.966260446189065, "learning_rate": 0.00015974999999999998, "loss": 2.8594, "step": 1065 }, { "epoch": 0.7059602649006622, "grad_norm": 2.6036617400557485, "learning_rate": 0.00015989999999999998, "loss": 2.9219, "step": 1066 }, { "epoch": 0.7066225165562914, "grad_norm": 10.026339079934212, "learning_rate": 0.00016005, "loss": 2.8438, "step": 1067 }, { "epoch": 0.7072847682119205, "grad_norm": 1.779084842249553, "learning_rate": 0.0001602, "loss": 2.6719, "step": 1068 }, { "epoch": 0.7079470198675497, "grad_norm": 2.0472940875697088, "learning_rate": 0.00016034999999999997, "loss": 2.7656, "step": 1069 }, { "epoch": 0.7086092715231788, "grad_norm": 12.37951358298566, "learning_rate": 0.0001605, "loss": 2.7969, "step": 1070 }, { "epoch": 0.7092715231788079, "grad_norm": 2.5714672491432413, "learning_rate": 0.00016064999999999997, "loss": 2.8125, "step": 1071 }, { "epoch": 0.7099337748344371, "grad_norm": 3.2973828299741945, "learning_rate": 0.0001608, "loss": 3.125, "step": 1072 }, { "epoch": 0.7105960264900663, "grad_norm": 2.0773601601362355, "learning_rate": 0.00016094999999999998, "loss": 2.9219, "step": 1073 }, { "epoch": 0.7112582781456953, "grad_norm": 1.9576271793369373, "learning_rate": 0.00016109999999999999, "loss": 2.8594, "step": 1074 }, { "epoch": 0.7119205298013245, "grad_norm": 2.253081544525511, "learning_rate": 0.00016125, "loss": 2.8594, "step": 1075 }, { "epoch": 0.7125827814569536, "grad_norm": 17.849086542019425, "learning_rate": 0.0001614, "loss": 2.4688, "step": 1076 }, { "epoch": 0.7132450331125828, "grad_norm": 25.22103142092267, "learning_rate": 0.00016154999999999997, "loss": 3.2344, "step": 1077 }, { "epoch": 0.713907284768212, "grad_norm": 2.711755792004473, "learning_rate": 0.0001617, "loss": 2.9375, "step": 1078 }, { "epoch": 0.714569536423841, "grad_norm": 2.0429867634332206, "learning_rate": 0.00016184999999999998, "loss": 2.9531, "step": 1079 }, { "epoch": 0.7152317880794702, "grad_norm": 2.448420710327908, "learning_rate": 0.000162, "loss": 2.6875, "step": 1080 }, { "epoch": 0.7158940397350994, "grad_norm": 4.041013141288397, "learning_rate": 0.00016214999999999998, "loss": 3.2188, "step": 1081 }, { "epoch": 0.7165562913907285, "grad_norm": 2.207189529714852, "learning_rate": 0.0001623, "loss": 2.3906, "step": 1082 }, { "epoch": 0.7172185430463576, "grad_norm": 2.054389781140217, "learning_rate": 0.00016245, "loss": 3.1406, "step": 1083 }, { "epoch": 0.7178807947019867, "grad_norm": 9.055365864590497, "learning_rate": 0.0001626, "loss": 2.9688, "step": 1084 }, { "epoch": 0.7185430463576159, "grad_norm": 2.663922557828683, "learning_rate": 0.00016274999999999997, "loss": 2.8438, "step": 1085 }, { "epoch": 0.7192052980132451, "grad_norm": 2.7509920567552717, "learning_rate": 0.0001629, "loss": 2.8281, "step": 1086 }, { "epoch": 0.7198675496688741, "grad_norm": 2.1229102338276644, "learning_rate": 0.00016304999999999998, "loss": 2.5625, "step": 1087 }, { "epoch": 0.7205298013245033, "grad_norm": 4.655309629283799, "learning_rate": 0.0001632, "loss": 3.1406, "step": 1088 }, { "epoch": 0.7211920529801324, "grad_norm": 1.9126178330922006, "learning_rate": 0.00016334999999999999, "loss": 2.75, "step": 1089 }, { "epoch": 0.7218543046357616, "grad_norm": 1.8408699477695372, "learning_rate": 0.0001635, "loss": 2.7812, "step": 1090 }, { "epoch": 0.7225165562913908, "grad_norm": 1.7483772203157484, "learning_rate": 0.00016365, "loss": 2.7812, "step": 1091 }, { "epoch": 0.7231788079470198, "grad_norm": 1.8387604982818866, "learning_rate": 0.0001638, "loss": 2.7188, "step": 1092 }, { "epoch": 0.723841059602649, "grad_norm": 1.707319886269538, "learning_rate": 0.00016394999999999997, "loss": 2.8281, "step": 1093 }, { "epoch": 0.7245033112582782, "grad_norm": 3.0826758758914394, "learning_rate": 0.0001641, "loss": 3.1406, "step": 1094 }, { "epoch": 0.7251655629139073, "grad_norm": 2.3764448119774895, "learning_rate": 0.00016424999999999998, "loss": 2.4219, "step": 1095 }, { "epoch": 0.7258278145695364, "grad_norm": 3444.5459784831883, "learning_rate": 0.0001644, "loss": 10.25, "step": 1096 }, { "epoch": 0.7264900662251655, "grad_norm": 17.867142675853835, "learning_rate": 0.00016455, "loss": 4.1562, "step": 1097 }, { "epoch": 0.7271523178807947, "grad_norm": 49.17803503861254, "learning_rate": 0.0001647, "loss": 8.0, "step": 1098 }, { "epoch": 0.7278145695364239, "grad_norm": 13.75747538456727, "learning_rate": 0.00016485, "loss": 6.7188, "step": 1099 }, { "epoch": 0.7284768211920529, "grad_norm": 38.729371654488006, "learning_rate": 0.000165, "loss": 8.125, "step": 1100 }, { "epoch": 0.7291390728476821, "grad_norm": 16.74322854983273, "learning_rate": 0.00016514999999999998, "loss": 9.875, "step": 1101 }, { "epoch": 0.7298013245033113, "grad_norm": 9.992920049190671, "learning_rate": 0.0001653, "loss": 7.9375, "step": 1102 }, { "epoch": 0.7304635761589404, "grad_norm": 15.864214199617486, "learning_rate": 0.00016544999999999998, "loss": 6.8438, "step": 1103 }, { "epoch": 0.7311258278145696, "grad_norm": 8.989462769805998, "learning_rate": 0.0001656, "loss": 6.9688, "step": 1104 }, { "epoch": 0.7317880794701986, "grad_norm": 8.785505747260729, "learning_rate": 0.00016575, "loss": 7.3125, "step": 1105 }, { "epoch": 0.7324503311258278, "grad_norm": 11.864246352127116, "learning_rate": 0.0001659, "loss": 7.0625, "step": 1106 }, { "epoch": 0.733112582781457, "grad_norm": 11.880346608664418, "learning_rate": 0.00016604999999999997, "loss": 6.9688, "step": 1107 }, { "epoch": 0.7337748344370861, "grad_norm": 6.598027809605002, "learning_rate": 0.0001662, "loss": 6.25, "step": 1108 }, { "epoch": 0.7344370860927152, "grad_norm": 10.962003277808929, "learning_rate": 0.00016634999999999998, "loss": 6.3125, "step": 1109 }, { "epoch": 0.7350993377483444, "grad_norm": 11.981328536106767, "learning_rate": 0.0001665, "loss": 6.5938, "step": 1110 }, { "epoch": 0.7357615894039735, "grad_norm": 8.163918223592617, "learning_rate": 0.00016664999999999998, "loss": 6.2188, "step": 1111 }, { "epoch": 0.7364238410596027, "grad_norm": 8.281678213205572, "learning_rate": 0.0001668, "loss": 6.25, "step": 1112 }, { "epoch": 0.7370860927152317, "grad_norm": 4.036171711158652, "learning_rate": 0.00016695, "loss": 6.0938, "step": 1113 }, { "epoch": 0.7377483443708609, "grad_norm": 7.9542243143751215, "learning_rate": 0.0001671, "loss": 6.375, "step": 1114 }, { "epoch": 0.7384105960264901, "grad_norm": 3.367303358746958, "learning_rate": 0.00016724999999999997, "loss": 6.0, "step": 1115 }, { "epoch": 0.7390728476821192, "grad_norm": 4.621776213828051, "learning_rate": 0.0001674, "loss": 6.0938, "step": 1116 }, { "epoch": 0.7397350993377484, "grad_norm": 4.2742046020169315, "learning_rate": 0.00016754999999999998, "loss": 5.875, "step": 1117 }, { "epoch": 0.7403973509933774, "grad_norm": 2.6442755026214853, "learning_rate": 0.0001677, "loss": 5.8125, "step": 1118 }, { "epoch": 0.7410596026490066, "grad_norm": 4.969781805285279, "learning_rate": 0.00016785, "loss": 5.9062, "step": 1119 }, { "epoch": 0.7417218543046358, "grad_norm": 2.745704449580607, "learning_rate": 0.000168, "loss": 5.6562, "step": 1120 }, { "epoch": 0.7423841059602649, "grad_norm": 3.864828880700123, "learning_rate": 0.00016815, "loss": 5.6875, "step": 1121 }, { "epoch": 0.743046357615894, "grad_norm": 4.3791471954058085, "learning_rate": 0.0001683, "loss": 5.6875, "step": 1122 }, { "epoch": 0.7437086092715232, "grad_norm": 5.008083568549501, "learning_rate": 0.00016844999999999997, "loss": 5.5625, "step": 1123 }, { "epoch": 0.7443708609271523, "grad_norm": 3.6363444020555074, "learning_rate": 0.0001686, "loss": 5.6562, "step": 1124 }, { "epoch": 0.7450331125827815, "grad_norm": 3.721993365974379, "learning_rate": 0.00016874999999999998, "loss": 5.5938, "step": 1125 }, { "epoch": 0.7456953642384105, "grad_norm": 4.861834570471408, "learning_rate": 0.00016889999999999996, "loss": 5.5312, "step": 1126 }, { "epoch": 0.7463576158940397, "grad_norm": 3.1067774595793356, "learning_rate": 0.00016905, "loss": 5.4062, "step": 1127 }, { "epoch": 0.7470198675496689, "grad_norm": 2.939371966057707, "learning_rate": 0.00016919999999999997, "loss": 5.375, "step": 1128 }, { "epoch": 0.747682119205298, "grad_norm": 3.2585358377266367, "learning_rate": 0.00016935, "loss": 5.375, "step": 1129 }, { "epoch": 0.7483443708609272, "grad_norm": 3.3072919016763818, "learning_rate": 0.00016949999999999997, "loss": 5.25, "step": 1130 }, { "epoch": 0.7490066225165563, "grad_norm": 3.1650707861293674, "learning_rate": 0.00016964999999999998, "loss": 5.2188, "step": 1131 }, { "epoch": 0.7496688741721854, "grad_norm": 3.036197716763989, "learning_rate": 0.00016979999999999998, "loss": 5.2188, "step": 1132 }, { "epoch": 0.7503311258278146, "grad_norm": 3.127313215998794, "learning_rate": 0.00016994999999999998, "loss": 5.1875, "step": 1133 }, { "epoch": 0.7509933774834437, "grad_norm": 3.180130713822329, "learning_rate": 0.00017009999999999996, "loss": 5.2188, "step": 1134 }, { "epoch": 0.7516556291390728, "grad_norm": 2.4751777221760816, "learning_rate": 0.00017025, "loss": 5.0938, "step": 1135 }, { "epoch": 0.752317880794702, "grad_norm": 3.051657455882634, "learning_rate": 0.00017039999999999997, "loss": 5.125, "step": 1136 }, { "epoch": 0.7529801324503311, "grad_norm": 2.6189251628348194, "learning_rate": 0.00017055, "loss": 5.0625, "step": 1137 }, { "epoch": 0.7536423841059603, "grad_norm": 3.1082433355039942, "learning_rate": 0.00017069999999999998, "loss": 5.3125, "step": 1138 }, { "epoch": 0.7543046357615895, "grad_norm": 3.5379893101501905, "learning_rate": 0.00017084999999999998, "loss": 4.8438, "step": 1139 }, { "epoch": 0.7549668874172185, "grad_norm": 2.4137320275098935, "learning_rate": 0.00017099999999999998, "loss": 5.0938, "step": 1140 }, { "epoch": 0.7556291390728477, "grad_norm": 2.445364322628325, "learning_rate": 0.00017114999999999999, "loss": 4.875, "step": 1141 }, { "epoch": 0.7562913907284768, "grad_norm": 2.996873642910113, "learning_rate": 0.00017129999999999996, "loss": 5.0312, "step": 1142 }, { "epoch": 0.756953642384106, "grad_norm": 2.747291371714837, "learning_rate": 0.00017145, "loss": 4.875, "step": 1143 }, { "epoch": 0.7576158940397351, "grad_norm": 2.3558271492536136, "learning_rate": 0.00017159999999999997, "loss": 4.8438, "step": 1144 }, { "epoch": 0.7582781456953642, "grad_norm": 2.396160124853817, "learning_rate": 0.00017175, "loss": 4.9375, "step": 1145 }, { "epoch": 0.7589403973509934, "grad_norm": 2.969213369509106, "learning_rate": 0.00017189999999999998, "loss": 5.0, "step": 1146 }, { "epoch": 0.7596026490066226, "grad_norm": 2.2281532318148476, "learning_rate": 0.00017204999999999998, "loss": 5.0, "step": 1147 }, { "epoch": 0.7602649006622516, "grad_norm": 1.9678017337465405, "learning_rate": 0.00017219999999999998, "loss": 4.9375, "step": 1148 }, { "epoch": 0.7609271523178808, "grad_norm": 3.5458666927800766, "learning_rate": 0.00017235, "loss": 4.625, "step": 1149 }, { "epoch": 0.7615894039735099, "grad_norm": 2.1742722267479464, "learning_rate": 0.00017249999999999996, "loss": 4.9688, "step": 1150 }, { "epoch": 0.7622516556291391, "grad_norm": 3.906668526879908, "learning_rate": 0.00017265, "loss": 5.0312, "step": 1151 }, { "epoch": 0.7629139072847683, "grad_norm": 2.8571033177981904, "learning_rate": 0.00017279999999999997, "loss": 4.6875, "step": 1152 }, { "epoch": 0.7635761589403973, "grad_norm": 2.44066632890416, "learning_rate": 0.00017294999999999998, "loss": 4.75, "step": 1153 }, { "epoch": 0.7642384105960265, "grad_norm": 3.0635149614168893, "learning_rate": 0.00017309999999999998, "loss": 4.9062, "step": 1154 }, { "epoch": 0.7649006622516556, "grad_norm": 3.0418996654086006, "learning_rate": 0.00017324999999999998, "loss": 4.9688, "step": 1155 }, { "epoch": 0.7655629139072848, "grad_norm": 2.878370527327323, "learning_rate": 0.00017339999999999996, "loss": 4.8125, "step": 1156 }, { "epoch": 0.7662251655629139, "grad_norm": 3.566548171130857, "learning_rate": 0.00017355, "loss": 4.5938, "step": 1157 }, { "epoch": 0.766887417218543, "grad_norm": 2.3162439094836023, "learning_rate": 0.00017369999999999997, "loss": 4.5938, "step": 1158 }, { "epoch": 0.7675496688741722, "grad_norm": 3.617027262946174, "learning_rate": 0.00017385, "loss": 4.6562, "step": 1159 }, { "epoch": 0.7682119205298014, "grad_norm": 3.1546286741904983, "learning_rate": 0.00017399999999999997, "loss": 4.6562, "step": 1160 }, { "epoch": 0.7688741721854304, "grad_norm": 3.1997932902603687, "learning_rate": 0.00017414999999999998, "loss": 4.7812, "step": 1161 }, { "epoch": 0.7695364238410596, "grad_norm": 3.0074242982373787, "learning_rate": 0.00017429999999999998, "loss": 4.5625, "step": 1162 }, { "epoch": 0.7701986754966887, "grad_norm": 2.701669718999158, "learning_rate": 0.00017444999999999998, "loss": 4.75, "step": 1163 }, { "epoch": 0.7708609271523179, "grad_norm": 7.04521838374954, "learning_rate": 0.00017459999999999996, "loss": 4.6562, "step": 1164 }, { "epoch": 0.7715231788079471, "grad_norm": 2.7844368233784516, "learning_rate": 0.00017475, "loss": 4.7188, "step": 1165 }, { "epoch": 0.7721854304635761, "grad_norm": 2.5870788598183725, "learning_rate": 0.00017489999999999997, "loss": 4.5312, "step": 1166 }, { "epoch": 0.7728476821192053, "grad_norm": 3.0942102177798163, "learning_rate": 0.00017505, "loss": 4.625, "step": 1167 }, { "epoch": 0.7735099337748345, "grad_norm": 2.275111777542878, "learning_rate": 0.00017519999999999998, "loss": 4.625, "step": 1168 }, { "epoch": 0.7741721854304636, "grad_norm": 4.275960709796497, "learning_rate": 0.00017534999999999998, "loss": 4.4375, "step": 1169 }, { "epoch": 0.7748344370860927, "grad_norm": 3.6781087158113683, "learning_rate": 0.00017549999999999998, "loss": 4.5312, "step": 1170 }, { "epoch": 0.7754966887417218, "grad_norm": 3.550133481338501, "learning_rate": 0.00017565, "loss": 4.5312, "step": 1171 }, { "epoch": 0.776158940397351, "grad_norm": 2.328802091089066, "learning_rate": 0.00017579999999999996, "loss": 4.5938, "step": 1172 }, { "epoch": 0.7768211920529802, "grad_norm": 2.8815278322495557, "learning_rate": 0.00017595, "loss": 4.4688, "step": 1173 }, { "epoch": 0.7774834437086092, "grad_norm": 3.0685788093620197, "learning_rate": 0.00017609999999999997, "loss": 4.6562, "step": 1174 }, { "epoch": 0.7781456953642384, "grad_norm": 3.3633050476223203, "learning_rate": 0.00017625, "loss": 4.3125, "step": 1175 }, { "epoch": 0.7788079470198676, "grad_norm": 3.2921344274738247, "learning_rate": 0.00017639999999999998, "loss": 4.625, "step": 1176 }, { "epoch": 0.7794701986754967, "grad_norm": 3.0224334797484866, "learning_rate": 0.00017654999999999998, "loss": 4.3125, "step": 1177 }, { "epoch": 0.7801324503311259, "grad_norm": 4.310652612911602, "learning_rate": 0.00017669999999999999, "loss": 4.3438, "step": 1178 }, { "epoch": 0.7807947019867549, "grad_norm": 4.730327550566992, "learning_rate": 0.00017685, "loss": 4.5938, "step": 1179 }, { "epoch": 0.7814569536423841, "grad_norm": 3.7953680303299877, "learning_rate": 0.00017699999999999997, "loss": 4.3438, "step": 1180 }, { "epoch": 0.7821192052980133, "grad_norm": 3.9264451708090933, "learning_rate": 0.00017715, "loss": 4.375, "step": 1181 }, { "epoch": 0.7827814569536424, "grad_norm": 4.018045666660239, "learning_rate": 0.00017729999999999997, "loss": 4.7812, "step": 1182 }, { "epoch": 0.7834437086092715, "grad_norm": 4.8093140960148295, "learning_rate": 0.00017745, "loss": 4.2188, "step": 1183 }, { "epoch": 0.7841059602649006, "grad_norm": 3.7042612668074377, "learning_rate": 0.00017759999999999998, "loss": 4.4688, "step": 1184 }, { "epoch": 0.7847682119205298, "grad_norm": 2.8490327246696245, "learning_rate": 0.00017774999999999998, "loss": 4.25, "step": 1185 }, { "epoch": 0.785430463576159, "grad_norm": 3.0756689714784096, "learning_rate": 0.0001779, "loss": 4.25, "step": 1186 }, { "epoch": 0.786092715231788, "grad_norm": 3.210672943463751, "learning_rate": 0.00017805, "loss": 4.5, "step": 1187 }, { "epoch": 0.7867549668874172, "grad_norm": 2.6754651788415327, "learning_rate": 0.00017819999999999997, "loss": 4.25, "step": 1188 }, { "epoch": 0.7874172185430464, "grad_norm": 19.95414281009093, "learning_rate": 0.00017835, "loss": 4.1875, "step": 1189 }, { "epoch": 0.7880794701986755, "grad_norm": 9.188228181131823, "learning_rate": 0.00017849999999999997, "loss": 4.5312, "step": 1190 }, { "epoch": 0.7887417218543047, "grad_norm": 5.779872680125628, "learning_rate": 0.00017865, "loss": 4.4375, "step": 1191 }, { "epoch": 0.7894039735099337, "grad_norm": 5.593414000170351, "learning_rate": 0.00017879999999999998, "loss": 4.4062, "step": 1192 }, { "epoch": 0.7900662251655629, "grad_norm": 6.6034356606854185, "learning_rate": 0.00017894999999999999, "loss": 4.5312, "step": 1193 }, { "epoch": 0.7907284768211921, "grad_norm": 4.821609846684629, "learning_rate": 0.0001791, "loss": 4.0312, "step": 1194 }, { "epoch": 0.7913907284768212, "grad_norm": 4.829948455363373, "learning_rate": 0.00017925, "loss": 4.4688, "step": 1195 }, { "epoch": 0.7920529801324503, "grad_norm": 2.6500477647093903, "learning_rate": 0.00017939999999999997, "loss": 4.5, "step": 1196 }, { "epoch": 0.7927152317880795, "grad_norm": 2.8810942989297685, "learning_rate": 0.00017955, "loss": 4.1875, "step": 1197 }, { "epoch": 0.7933774834437086, "grad_norm": 3.1668885708924988, "learning_rate": 0.00017969999999999998, "loss": 4.3438, "step": 1198 }, { "epoch": 0.7940397350993378, "grad_norm": 3.712795619492887, "learning_rate": 0.00017984999999999998, "loss": 4.4375, "step": 1199 }, { "epoch": 0.7947019867549668, "grad_norm": 2.4360986873577852, "learning_rate": 0.00017999999999999998, "loss": 4.2188, "step": 1200 }, { "epoch": 0.795364238410596, "grad_norm": 3.2064211835310794, "learning_rate": 0.00018015, "loss": 4.0625, "step": 1201 }, { "epoch": 0.7960264900662252, "grad_norm": 2.837892589566235, "learning_rate": 0.00018029999999999996, "loss": 4.0, "step": 1202 }, { "epoch": 0.7966887417218543, "grad_norm": 2.845337413652026, "learning_rate": 0.00018045, "loss": 4.3125, "step": 1203 }, { "epoch": 0.7973509933774835, "grad_norm": 3.5258121209948476, "learning_rate": 0.00018059999999999997, "loss": 4.3438, "step": 1204 }, { "epoch": 0.7980132450331126, "grad_norm": 2.7474604714915842, "learning_rate": 0.00018075, "loss": 4.2188, "step": 1205 }, { "epoch": 0.7986754966887417, "grad_norm": 2.2968730034268514, "learning_rate": 0.00018089999999999998, "loss": 4.0312, "step": 1206 }, { "epoch": 0.7993377483443709, "grad_norm": 2.6714874146838268, "learning_rate": 0.00018104999999999998, "loss": 3.875, "step": 1207 }, { "epoch": 0.8, "grad_norm": 3.07964517217086, "learning_rate": 0.00018119999999999999, "loss": 4.25, "step": 1208 }, { "epoch": 0.8006622516556291, "grad_norm": 2.435872017997588, "learning_rate": 0.00018135, "loss": 4.0312, "step": 1209 }, { "epoch": 0.8013245033112583, "grad_norm": 2.927330196530365, "learning_rate": 0.00018149999999999997, "loss": 4.0312, "step": 1210 }, { "epoch": 0.8019867549668874, "grad_norm": 2.5841002857107793, "learning_rate": 0.00018165, "loss": 4.2812, "step": 1211 }, { "epoch": 0.8026490066225166, "grad_norm": 2.5491835847603266, "learning_rate": 0.00018179999999999997, "loss": 3.8438, "step": 1212 }, { "epoch": 0.8033112582781456, "grad_norm": 3.121414494504496, "learning_rate": 0.00018195, "loss": 3.7188, "step": 1213 }, { "epoch": 0.8039735099337748, "grad_norm": 1.9150506810556445, "learning_rate": 0.00018209999999999998, "loss": 3.875, "step": 1214 }, { "epoch": 0.804635761589404, "grad_norm": 2.427306225432991, "learning_rate": 0.00018224999999999998, "loss": 3.9375, "step": 1215 }, { "epoch": 0.8052980132450331, "grad_norm": 2.3211485574252784, "learning_rate": 0.0001824, "loss": 4.1562, "step": 1216 }, { "epoch": 0.8059602649006623, "grad_norm": 2.3610528809973896, "learning_rate": 0.00018255, "loss": 3.7812, "step": 1217 }, { "epoch": 0.8066225165562914, "grad_norm": 2.331605604821229, "learning_rate": 0.00018269999999999997, "loss": 4.0625, "step": 1218 }, { "epoch": 0.8072847682119205, "grad_norm": 2.6490920241212472, "learning_rate": 0.00018285, "loss": 3.6406, "step": 1219 }, { "epoch": 0.8079470198675497, "grad_norm": 2.1481865914914713, "learning_rate": 0.00018299999999999998, "loss": 4.0312, "step": 1220 }, { "epoch": 0.8086092715231789, "grad_norm": 2.7477610714638305, "learning_rate": 0.00018315, "loss": 3.8438, "step": 1221 }, { "epoch": 0.8092715231788079, "grad_norm": 2.4444429311502294, "learning_rate": 0.00018329999999999998, "loss": 3.9531, "step": 1222 }, { "epoch": 0.8099337748344371, "grad_norm": 2.415903654978864, "learning_rate": 0.00018345, "loss": 3.9688, "step": 1223 }, { "epoch": 0.8105960264900662, "grad_norm": 2.301674740187517, "learning_rate": 0.0001836, "loss": 3.9219, "step": 1224 }, { "epoch": 0.8112582781456954, "grad_norm": 2.440961166859283, "learning_rate": 0.00018375, "loss": 3.7656, "step": 1225 }, { "epoch": 0.8119205298013245, "grad_norm": 2.038892872401617, "learning_rate": 0.00018389999999999997, "loss": 4.0, "step": 1226 }, { "epoch": 0.8125827814569536, "grad_norm": 3.1551766935288916, "learning_rate": 0.00018405, "loss": 3.9688, "step": 1227 }, { "epoch": 0.8132450331125828, "grad_norm": 2.1976280186158776, "learning_rate": 0.00018419999999999998, "loss": 4.0, "step": 1228 }, { "epoch": 0.813907284768212, "grad_norm": 2.97147560371195, "learning_rate": 0.00018435, "loss": 3.8125, "step": 1229 }, { "epoch": 0.8145695364238411, "grad_norm": 2.0671922356715418, "learning_rate": 0.00018449999999999999, "loss": 3.5938, "step": 1230 }, { "epoch": 0.8152317880794702, "grad_norm": 2.26941514711728, "learning_rate": 0.00018465, "loss": 3.8438, "step": 1231 }, { "epoch": 0.8158940397350993, "grad_norm": 2.3056419546058144, "learning_rate": 0.0001848, "loss": 3.9844, "step": 1232 }, { "epoch": 0.8165562913907285, "grad_norm": 4.82203660223364, "learning_rate": 0.00018495, "loss": 3.3906, "step": 1233 }, { "epoch": 0.8172185430463577, "grad_norm": 2.3594325093544266, "learning_rate": 0.00018509999999999997, "loss": 3.6875, "step": 1234 }, { "epoch": 0.8178807947019867, "grad_norm": 5.277144784562463, "learning_rate": 0.00018525, "loss": 3.375, "step": 1235 }, { "epoch": 0.8185430463576159, "grad_norm": 3.0511893627282034, "learning_rate": 0.00018539999999999998, "loss": 3.9531, "step": 1236 }, { "epoch": 0.819205298013245, "grad_norm": 2.8432612415683716, "learning_rate": 0.00018555, "loss": 3.8125, "step": 1237 }, { "epoch": 0.8198675496688742, "grad_norm": 3.606638322133848, "learning_rate": 0.0001857, "loss": 3.8906, "step": 1238 }, { "epoch": 0.8205298013245033, "grad_norm": 2.9541853612007865, "learning_rate": 0.00018585, "loss": 3.7969, "step": 1239 }, { "epoch": 0.8211920529801324, "grad_norm": 2.0184868079297926, "learning_rate": 0.000186, "loss": 3.875, "step": 1240 }, { "epoch": 0.8218543046357616, "grad_norm": 2.260662031836412, "learning_rate": 0.00018615, "loss": 3.9375, "step": 1241 }, { "epoch": 0.8225165562913908, "grad_norm": 2.7224847798881497, "learning_rate": 0.00018629999999999997, "loss": 3.7812, "step": 1242 }, { "epoch": 0.8231788079470199, "grad_norm": 3.646639369145067, "learning_rate": 0.00018645, "loss": 3.8281, "step": 1243 }, { "epoch": 0.823841059602649, "grad_norm": 3.0261305484025964, "learning_rate": 0.00018659999999999998, "loss": 3.2969, "step": 1244 }, { "epoch": 0.8245033112582781, "grad_norm": 2.542471918682892, "learning_rate": 0.00018675, "loss": 3.7344, "step": 1245 }, { "epoch": 0.8251655629139073, "grad_norm": 2.659117751392304, "learning_rate": 0.0001869, "loss": 4.0312, "step": 1246 }, { "epoch": 0.8258278145695365, "grad_norm": 2.134016312158818, "learning_rate": 0.00018705, "loss": 3.75, "step": 1247 }, { "epoch": 0.8264900662251655, "grad_norm": 2.3472444130810306, "learning_rate": 0.0001872, "loss": 3.75, "step": 1248 }, { "epoch": 0.8271523178807947, "grad_norm": 2.7658089412372338, "learning_rate": 0.00018735, "loss": 3.6719, "step": 1249 }, { "epoch": 0.8278145695364238, "grad_norm": 2.1061883116592552, "learning_rate": 0.00018749999999999998, "loss": 3.5156, "step": 1250 }, { "epoch": 0.828476821192053, "grad_norm": 2.8141025758938856, "learning_rate": 0.00018764999999999998, "loss": 3.6094, "step": 1251 }, { "epoch": 0.8291390728476821, "grad_norm": 2.0635282223933387, "learning_rate": 0.00018779999999999998, "loss": 3.6406, "step": 1252 }, { "epoch": 0.8298013245033112, "grad_norm": 2.247434687275756, "learning_rate": 0.00018794999999999996, "loss": 3.3125, "step": 1253 }, { "epoch": 0.8304635761589404, "grad_norm": 2.395702865018192, "learning_rate": 0.0001881, "loss": 3.4688, "step": 1254 }, { "epoch": 0.8311258278145696, "grad_norm": 2.0580642792743125, "learning_rate": 0.00018824999999999997, "loss": 3.5781, "step": 1255 }, { "epoch": 0.8317880794701987, "grad_norm": 2.2153006546281553, "learning_rate": 0.00018839999999999997, "loss": 3.375, "step": 1256 }, { "epoch": 0.8324503311258278, "grad_norm": 2.886146167349155, "learning_rate": 0.00018854999999999998, "loss": 3.0312, "step": 1257 }, { "epoch": 0.833112582781457, "grad_norm": 2.489217698079595, "learning_rate": 0.00018869999999999998, "loss": 3.8125, "step": 1258 }, { "epoch": 0.8337748344370861, "grad_norm": 3.8364593763290333, "learning_rate": 0.00018884999999999996, "loss": 3.4688, "step": 1259 }, { "epoch": 0.8344370860927153, "grad_norm": 2.3706238227239775, "learning_rate": 0.00018899999999999999, "loss": 3.3438, "step": 1260 }, { "epoch": 0.8350993377483443, "grad_norm": 2.6372026366170127, "learning_rate": 0.00018914999999999996, "loss": 3.0312, "step": 1261 }, { "epoch": 0.8357615894039735, "grad_norm": 2.3995036789981734, "learning_rate": 0.0001893, "loss": 3.1875, "step": 1262 }, { "epoch": 0.8364238410596027, "grad_norm": 2.9600273894537295, "learning_rate": 0.00018944999999999997, "loss": 3.0312, "step": 1263 }, { "epoch": 0.8370860927152318, "grad_norm": 2.801629215762054, "learning_rate": 0.00018959999999999997, "loss": 3.5312, "step": 1264 }, { "epoch": 0.8377483443708609, "grad_norm": 2.601399129858437, "learning_rate": 0.00018974999999999998, "loss": 3.625, "step": 1265 }, { "epoch": 0.83841059602649, "grad_norm": 2.442192861302586, "learning_rate": 0.00018989999999999998, "loss": 3.4531, "step": 1266 }, { "epoch": 0.8390728476821192, "grad_norm": 2.2439706155054155, "learning_rate": 0.00019004999999999996, "loss": 3.3125, "step": 1267 }, { "epoch": 0.8397350993377484, "grad_norm": 2.2463404716809983, "learning_rate": 0.0001902, "loss": 3.3281, "step": 1268 }, { "epoch": 0.8403973509933775, "grad_norm": 1.9614469501235225, "learning_rate": 0.00019034999999999996, "loss": 3.375, "step": 1269 }, { "epoch": 0.8410596026490066, "grad_norm": 1.797348593299781, "learning_rate": 0.0001905, "loss": 3.2188, "step": 1270 }, { "epoch": 0.8417218543046358, "grad_norm": 1.9667144495040036, "learning_rate": 0.00019064999999999997, "loss": 3.3125, "step": 1271 }, { "epoch": 0.8423841059602649, "grad_norm": 4.360274011206607, "learning_rate": 0.00019079999999999998, "loss": 2.875, "step": 1272 }, { "epoch": 0.8430463576158941, "grad_norm": 2.830328621433694, "learning_rate": 0.00019094999999999998, "loss": 3.3906, "step": 1273 }, { "epoch": 0.8437086092715231, "grad_norm": 2.0166246119722397, "learning_rate": 0.00019109999999999998, "loss": 3.3906, "step": 1274 }, { "epoch": 0.8443708609271523, "grad_norm": 2.4632450806613044, "learning_rate": 0.00019124999999999996, "loss": 3.0781, "step": 1275 }, { "epoch": 0.8450331125827815, "grad_norm": 3.041269482810716, "learning_rate": 0.0001914, "loss": 3.25, "step": 1276 }, { "epoch": 0.8456953642384106, "grad_norm": 2.094047835217932, "learning_rate": 0.00019154999999999997, "loss": 2.7656, "step": 1277 }, { "epoch": 0.8463576158940397, "grad_norm": 2.2279015293145012, "learning_rate": 0.0001917, "loss": 3.1562, "step": 1278 }, { "epoch": 0.8470198675496688, "grad_norm": 2.1197165579219033, "learning_rate": 0.00019184999999999997, "loss": 3.0625, "step": 1279 }, { "epoch": 0.847682119205298, "grad_norm": 1.9285421104999607, "learning_rate": 0.00019199999999999998, "loss": 3.0938, "step": 1280 }, { "epoch": 0.8483443708609272, "grad_norm": 1.9637823823706557, "learning_rate": 0.00019214999999999998, "loss": 3.0938, "step": 1281 }, { "epoch": 0.8490066225165563, "grad_norm": 2.102949037342207, "learning_rate": 0.00019229999999999999, "loss": 3.3125, "step": 1282 }, { "epoch": 0.8496688741721854, "grad_norm": 12.934159881862051, "learning_rate": 0.00019244999999999996, "loss": 3.0312, "step": 1283 }, { "epoch": 0.8503311258278146, "grad_norm": 2.5260629999943522, "learning_rate": 0.0001926, "loss": 3.1094, "step": 1284 }, { "epoch": 0.8509933774834437, "grad_norm": 3.0771487887927162, "learning_rate": 0.00019274999999999997, "loss": 2.4844, "step": 1285 }, { "epoch": 0.8516556291390729, "grad_norm": 12.293546759115715, "learning_rate": 0.0001929, "loss": 3.0781, "step": 1286 }, { "epoch": 0.8523178807947019, "grad_norm": 4.2734784355264335, "learning_rate": 0.00019304999999999998, "loss": 3.3281, "step": 1287 }, { "epoch": 0.8529801324503311, "grad_norm": 6.514369784301302, "learning_rate": 0.00019319999999999998, "loss": 3.2969, "step": 1288 }, { "epoch": 0.8536423841059603, "grad_norm": 2.480476949100863, "learning_rate": 0.00019334999999999998, "loss": 3.1094, "step": 1289 }, { "epoch": 0.8543046357615894, "grad_norm": 2.5250184781188767, "learning_rate": 0.0001935, "loss": 3.2969, "step": 1290 }, { "epoch": 0.8549668874172185, "grad_norm": 2.5897213788116296, "learning_rate": 0.00019364999999999996, "loss": 3.0, "step": 1291 }, { "epoch": 0.8556291390728477, "grad_norm": 2.1751238634615775, "learning_rate": 0.0001938, "loss": 3.0781, "step": 1292 }, { "epoch": 0.8562913907284768, "grad_norm": 2.003691957663341, "learning_rate": 0.00019394999999999997, "loss": 3.0312, "step": 1293 }, { "epoch": 0.856953642384106, "grad_norm": 3.425486841135925, "learning_rate": 0.0001941, "loss": 3.0156, "step": 1294 }, { "epoch": 0.8576158940397351, "grad_norm": 2.2281731877279447, "learning_rate": 0.00019424999999999998, "loss": 3.0312, "step": 1295 }, { "epoch": 0.8582781456953642, "grad_norm": 1.902503861135649, "learning_rate": 0.00019439999999999998, "loss": 3.1562, "step": 1296 }, { "epoch": 0.8589403973509934, "grad_norm": 2.937040665800666, "learning_rate": 0.00019454999999999999, "loss": 3.4688, "step": 1297 }, { "epoch": 0.8596026490066225, "grad_norm": 1.8101821516295078, "learning_rate": 0.0001947, "loss": 3.0781, "step": 1298 }, { "epoch": 0.8602649006622517, "grad_norm": 1.9145431161306536, "learning_rate": 0.00019484999999999997, "loss": 3.0156, "step": 1299 }, { "epoch": 0.8609271523178808, "grad_norm": 2.0362130933049416, "learning_rate": 0.000195, "loss": 2.9062, "step": 1300 }, { "epoch": 0.8615894039735099, "grad_norm": 1.6833551132776159, "learning_rate": 0.00019514999999999997, "loss": 2.8594, "step": 1301 }, { "epoch": 0.8622516556291391, "grad_norm": 3.2980914550470635, "learning_rate": 0.00019529999999999998, "loss": 3.0469, "step": 1302 }, { "epoch": 0.8629139072847682, "grad_norm": 2.2716150420434436, "learning_rate": 0.00019544999999999998, "loss": 2.7969, "step": 1303 }, { "epoch": 0.8635761589403973, "grad_norm": 1.866670695652504, "learning_rate": 0.00019559999999999998, "loss": 3.0312, "step": 1304 }, { "epoch": 0.8642384105960265, "grad_norm": 2.2407126827811457, "learning_rate": 0.00019574999999999996, "loss": 2.8906, "step": 1305 }, { "epoch": 0.8649006622516556, "grad_norm": 1.4297108464832682, "learning_rate": 0.0001959, "loss": 2.7812, "step": 1306 }, { "epoch": 0.8655629139072848, "grad_norm": 2.5384163063915834, "learning_rate": 0.00019604999999999997, "loss": 2.9375, "step": 1307 }, { "epoch": 0.866225165562914, "grad_norm": 1.7005327695549182, "learning_rate": 0.0001962, "loss": 2.9375, "step": 1308 }, { "epoch": 0.866887417218543, "grad_norm": 1.6533228202857764, "learning_rate": 0.00019634999999999998, "loss": 2.3438, "step": 1309 }, { "epoch": 0.8675496688741722, "grad_norm": 1.6390461491230774, "learning_rate": 0.00019649999999999998, "loss": 2.8281, "step": 1310 }, { "epoch": 0.8682119205298013, "grad_norm": 2.205631297152919, "learning_rate": 0.00019664999999999998, "loss": 3.1406, "step": 1311 }, { "epoch": 0.8688741721854305, "grad_norm": 1.7256990002269397, "learning_rate": 0.00019679999999999999, "loss": 2.3906, "step": 1312 }, { "epoch": 0.8695364238410596, "grad_norm": 1.8440750787405453, "learning_rate": 0.00019694999999999996, "loss": 3.0156, "step": 1313 }, { "epoch": 0.8701986754966887, "grad_norm": 1.6721072079437616, "learning_rate": 0.0001971, "loss": 3.1719, "step": 1314 }, { "epoch": 0.8708609271523179, "grad_norm": 3.0880752748553926, "learning_rate": 0.00019724999999999997, "loss": 2.875, "step": 1315 }, { "epoch": 0.871523178807947, "grad_norm": 1.591474979173941, "learning_rate": 0.0001974, "loss": 2.75, "step": 1316 }, { "epoch": 0.8721854304635761, "grad_norm": 1.499523289724405, "learning_rate": 0.00019754999999999998, "loss": 2.5625, "step": 1317 }, { "epoch": 0.8728476821192053, "grad_norm": 1.7969873592866519, "learning_rate": 0.00019769999999999998, "loss": 3.0781, "step": 1318 }, { "epoch": 0.8735099337748344, "grad_norm": 1.5605144901323431, "learning_rate": 0.00019784999999999998, "loss": 3.0312, "step": 1319 }, { "epoch": 0.8741721854304636, "grad_norm": 1.9429963964773207, "learning_rate": 0.000198, "loss": 3.0625, "step": 1320 }, { "epoch": 0.8748344370860928, "grad_norm": 1.55742681982741, "learning_rate": 0.00019814999999999996, "loss": 2.7969, "step": 1321 }, { "epoch": 0.8754966887417218, "grad_norm": 1.5818051331928482, "learning_rate": 0.0001983, "loss": 2.75, "step": 1322 }, { "epoch": 0.876158940397351, "grad_norm": 1.4961222930268283, "learning_rate": 0.00019844999999999997, "loss": 2.9062, "step": 1323 }, { "epoch": 0.8768211920529801, "grad_norm": 5.03589782682638, "learning_rate": 0.0001986, "loss": 2.8281, "step": 1324 }, { "epoch": 0.8774834437086093, "grad_norm": 1.8509996192290328, "learning_rate": 0.00019874999999999998, "loss": 2.9219, "step": 1325 }, { "epoch": 0.8781456953642384, "grad_norm": 1.83029097083417, "learning_rate": 0.00019889999999999998, "loss": 2.9375, "step": 1326 }, { "epoch": 0.8788079470198675, "grad_norm": 1.8718981928000682, "learning_rate": 0.00019905, "loss": 2.6406, "step": 1327 }, { "epoch": 0.8794701986754967, "grad_norm": 2.0359175764126305, "learning_rate": 0.0001992, "loss": 2.8281, "step": 1328 }, { "epoch": 0.8801324503311259, "grad_norm": 1.9720091395108976, "learning_rate": 0.00019934999999999997, "loss": 3.0625, "step": 1329 }, { "epoch": 0.8807947019867549, "grad_norm": 1.7165135233602165, "learning_rate": 0.0001995, "loss": 3.0312, "step": 1330 }, { "epoch": 0.8814569536423841, "grad_norm": 1.7387206413647913, "learning_rate": 0.00019964999999999997, "loss": 3.0, "step": 1331 }, { "epoch": 0.8821192052980132, "grad_norm": 1.5945304515390302, "learning_rate": 0.0001998, "loss": 2.8906, "step": 1332 }, { "epoch": 0.8827814569536424, "grad_norm": 1.7365850409684334, "learning_rate": 0.00019994999999999998, "loss": 2.8281, "step": 1333 }, { "epoch": 0.8834437086092716, "grad_norm": 1.6997557620462878, "learning_rate": 0.00020009999999999998, "loss": 2.375, "step": 1334 }, { "epoch": 0.8841059602649006, "grad_norm": 1.5913269309673954, "learning_rate": 0.00020025, "loss": 2.9062, "step": 1335 }, { "epoch": 0.8847682119205298, "grad_norm": 1.7332359839626117, "learning_rate": 0.0002004, "loss": 2.9219, "step": 1336 }, { "epoch": 0.885430463576159, "grad_norm": 1.4041720781314866, "learning_rate": 0.00020054999999999997, "loss": 2.4062, "step": 1337 }, { "epoch": 0.8860927152317881, "grad_norm": 1.74773019119878, "learning_rate": 0.0002007, "loss": 3.0312, "step": 1338 }, { "epoch": 0.8867549668874172, "grad_norm": 1.474956182698185, "learning_rate": 0.00020084999999999998, "loss": 2.9531, "step": 1339 }, { "epoch": 0.8874172185430463, "grad_norm": 2.463410637891758, "learning_rate": 0.000201, "loss": 2.6094, "step": 1340 }, { "epoch": 0.8880794701986755, "grad_norm": 1.4774451046450296, "learning_rate": 0.00020114999999999998, "loss": 3.1094, "step": 1341 }, { "epoch": 0.8887417218543047, "grad_norm": 1.5346116839122577, "learning_rate": 0.0002013, "loss": 2.6875, "step": 1342 }, { "epoch": 0.8894039735099337, "grad_norm": 1.6829863526467448, "learning_rate": 0.00020145, "loss": 2.9688, "step": 1343 }, { "epoch": 0.8900662251655629, "grad_norm": 1.4938465070677558, "learning_rate": 0.0002016, "loss": 2.7188, "step": 1344 }, { "epoch": 0.890728476821192, "grad_norm": 1.3753626821988953, "learning_rate": 0.00020174999999999997, "loss": 2.2031, "step": 1345 }, { "epoch": 0.8913907284768212, "grad_norm": 1.449124975048414, "learning_rate": 0.0002019, "loss": 2.9531, "step": 1346 }, { "epoch": 0.8920529801324504, "grad_norm": 1.4164700142217563, "learning_rate": 0.00020204999999999998, "loss": 2.8438, "step": 1347 }, { "epoch": 0.8927152317880794, "grad_norm": 1.561906060182665, "learning_rate": 0.0002022, "loss": 2.7656, "step": 1348 }, { "epoch": 0.8933774834437086, "grad_norm": 1.50374427364562, "learning_rate": 0.00020234999999999999, "loss": 2.9219, "step": 1349 }, { "epoch": 0.8940397350993378, "grad_norm": 1.7678186446785271, "learning_rate": 0.0002025, "loss": 2.8281, "step": 1350 }, { "epoch": 0.8947019867549669, "grad_norm": 1.7658211908889372, "learning_rate": 0.00020264999999999997, "loss": 2.8125, "step": 1351 }, { "epoch": 0.895364238410596, "grad_norm": 1.4397884138710912, "learning_rate": 0.0002028, "loss": 2.75, "step": 1352 }, { "epoch": 0.8960264900662251, "grad_norm": 1.4548613283237661, "learning_rate": 0.00020294999999999997, "loss": 2.3594, "step": 1353 }, { "epoch": 0.8966887417218543, "grad_norm": 1.7180858602217157, "learning_rate": 0.0002031, "loss": 2.8125, "step": 1354 }, { "epoch": 0.8973509933774835, "grad_norm": 1.4663944084563982, "learning_rate": 0.00020324999999999998, "loss": 2.7344, "step": 1355 }, { "epoch": 0.8980132450331125, "grad_norm": 1.533038867656512, "learning_rate": 0.00020339999999999998, "loss": 2.9531, "step": 1356 }, { "epoch": 0.8986754966887417, "grad_norm": 1.620822625361471, "learning_rate": 0.00020355, "loss": 2.6094, "step": 1357 }, { "epoch": 0.8993377483443709, "grad_norm": 1.447237712739006, "learning_rate": 0.0002037, "loss": 2.7031, "step": 1358 }, { "epoch": 0.9, "grad_norm": 1.6105157965875507, "learning_rate": 0.00020384999999999997, "loss": 2.9531, "step": 1359 }, { "epoch": 0.9006622516556292, "grad_norm": 4.571638031607098, "learning_rate": 0.000204, "loss": 2.875, "step": 1360 }, { "epoch": 0.9013245033112582, "grad_norm": 1.7738790930088484, "learning_rate": 0.00020414999999999997, "loss": 2.6562, "step": 1361 }, { "epoch": 0.9019867549668874, "grad_norm": 1.5899087195644968, "learning_rate": 0.0002043, "loss": 2.7969, "step": 1362 }, { "epoch": 0.9026490066225166, "grad_norm": 1.5037662290863272, "learning_rate": 0.00020444999999999998, "loss": 2.8125, "step": 1363 }, { "epoch": 0.9033112582781457, "grad_norm": 1.5222018208359926, "learning_rate": 0.00020459999999999999, "loss": 2.3906, "step": 1364 }, { "epoch": 0.9039735099337748, "grad_norm": 1.6118353293337373, "learning_rate": 0.00020475, "loss": 2.8594, "step": 1365 }, { "epoch": 0.904635761589404, "grad_norm": 1.502161009638808, "learning_rate": 0.0002049, "loss": 2.3906, "step": 1366 }, { "epoch": 0.9052980132450331, "grad_norm": 1.5597003886186351, "learning_rate": 0.00020504999999999997, "loss": 2.8438, "step": 1367 }, { "epoch": 0.9059602649006623, "grad_norm": 1.5554827438380323, "learning_rate": 0.0002052, "loss": 2.2812, "step": 1368 }, { "epoch": 0.9066225165562913, "grad_norm": 1.5836107931876346, "learning_rate": 0.00020534999999999998, "loss": 2.8594, "step": 1369 }, { "epoch": 0.9072847682119205, "grad_norm": 1.78331122867097, "learning_rate": 0.0002055, "loss": 3.0625, "step": 1370 }, { "epoch": 0.9079470198675497, "grad_norm": 1.4242900341146445, "learning_rate": 0.00020564999999999998, "loss": 2.6875, "step": 1371 }, { "epoch": 0.9086092715231788, "grad_norm": 1.357918874171911, "learning_rate": 0.0002058, "loss": 2.8438, "step": 1372 }, { "epoch": 0.909271523178808, "grad_norm": 1.4618490258419004, "learning_rate": 0.00020595, "loss": 2.6719, "step": 1373 }, { "epoch": 0.909933774834437, "grad_norm": 1.5027818471955123, "learning_rate": 0.0002061, "loss": 3.1562, "step": 1374 }, { "epoch": 0.9105960264900662, "grad_norm": 1.4237550740618516, "learning_rate": 0.00020624999999999997, "loss": 2.8906, "step": 1375 }, { "epoch": 0.9112582781456954, "grad_norm": 1.3951903615686183, "learning_rate": 0.00020639999999999998, "loss": 2.8125, "step": 1376 }, { "epoch": 0.9119205298013245, "grad_norm": 1.5472104206974824, "learning_rate": 0.00020654999999999998, "loss": 2.9219, "step": 1377 }, { "epoch": 0.9125827814569536, "grad_norm": 1.393140708389783, "learning_rate": 0.00020669999999999996, "loss": 2.9219, "step": 1378 }, { "epoch": 0.9132450331125828, "grad_norm": 1.7274961045766346, "learning_rate": 0.00020684999999999999, "loss": 2.9531, "step": 1379 }, { "epoch": 0.9139072847682119, "grad_norm": 1.4538649519869518, "learning_rate": 0.00020699999999999996, "loss": 2.75, "step": 1380 }, { "epoch": 0.9145695364238411, "grad_norm": 1.279091826306969, "learning_rate": 0.00020715, "loss": 2.7656, "step": 1381 }, { "epoch": 0.9152317880794701, "grad_norm": 1.5609246305548645, "learning_rate": 0.00020729999999999997, "loss": 3.0312, "step": 1382 }, { "epoch": 0.9158940397350993, "grad_norm": 1.538887972920399, "learning_rate": 0.00020744999999999997, "loss": 2.7188, "step": 1383 }, { "epoch": 0.9165562913907285, "grad_norm": 1.4381341232781528, "learning_rate": 0.00020759999999999998, "loss": 2.875, "step": 1384 }, { "epoch": 0.9172185430463576, "grad_norm": 1.369532596945704, "learning_rate": 0.00020774999999999998, "loss": 2.6562, "step": 1385 }, { "epoch": 0.9178807947019868, "grad_norm": 1.5035880126747518, "learning_rate": 0.00020789999999999996, "loss": 3.0938, "step": 1386 }, { "epoch": 0.9185430463576159, "grad_norm": 1.7210000314337632, "learning_rate": 0.00020805, "loss": 2.6094, "step": 1387 }, { "epoch": 0.919205298013245, "grad_norm": 1.499158745828499, "learning_rate": 0.00020819999999999996, "loss": 2.75, "step": 1388 }, { "epoch": 0.9198675496688742, "grad_norm": 1.5287500508620049, "learning_rate": 0.00020835, "loss": 2.8906, "step": 1389 }, { "epoch": 0.9205298013245033, "grad_norm": 1.3536655453120798, "learning_rate": 0.00020849999999999997, "loss": 2.4219, "step": 1390 }, { "epoch": 0.9211920529801324, "grad_norm": 1.6649626616424975, "learning_rate": 0.00020864999999999998, "loss": 3.0, "step": 1391 }, { "epoch": 0.9218543046357616, "grad_norm": 1.3470106061149882, "learning_rate": 0.00020879999999999998, "loss": 2.875, "step": 1392 }, { "epoch": 0.9225165562913907, "grad_norm": 1.3212770437782715, "learning_rate": 0.00020894999999999998, "loss": 2.75, "step": 1393 }, { "epoch": 0.9231788079470199, "grad_norm": 1.463062616171229, "learning_rate": 0.00020909999999999996, "loss": 2.375, "step": 1394 }, { "epoch": 0.9238410596026491, "grad_norm": 1.4628881973612466, "learning_rate": 0.00020925, "loss": 2.8125, "step": 1395 }, { "epoch": 0.9245033112582781, "grad_norm": 1.5518256163080537, "learning_rate": 0.00020939999999999997, "loss": 2.8438, "step": 1396 }, { "epoch": 0.9251655629139073, "grad_norm": 1.390316201113342, "learning_rate": 0.00020955, "loss": 2.8438, "step": 1397 }, { "epoch": 0.9258278145695364, "grad_norm": 1.3945650958185598, "learning_rate": 0.00020969999999999997, "loss": 2.7812, "step": 1398 }, { "epoch": 0.9264900662251656, "grad_norm": 2.5940424767660013, "learning_rate": 0.00020984999999999998, "loss": 2.875, "step": 1399 }, { "epoch": 0.9271523178807947, "grad_norm": 1.4273371665104149, "learning_rate": 0.00020999999999999998, "loss": 2.6562, "step": 1400 }, { "epoch": 0.9278145695364238, "grad_norm": 1.470668390919937, "learning_rate": 0.00021014999999999999, "loss": 2.8125, "step": 1401 }, { "epoch": 0.928476821192053, "grad_norm": 2.449382176929562, "learning_rate": 0.00021029999999999996, "loss": 2.4531, "step": 1402 }, { "epoch": 0.9291390728476822, "grad_norm": 1.4482714390262088, "learning_rate": 0.00021045, "loss": 2.9688, "step": 1403 }, { "epoch": 0.9298013245033112, "grad_norm": 1.5969689405241236, "learning_rate": 0.00021059999999999997, "loss": 2.7969, "step": 1404 }, { "epoch": 0.9304635761589404, "grad_norm": 1.48817199692799, "learning_rate": 0.00021074999999999997, "loss": 2.8594, "step": 1405 }, { "epoch": 0.9311258278145695, "grad_norm": 1.5629706737677924, "learning_rate": 0.00021089999999999998, "loss": 2.8281, "step": 1406 }, { "epoch": 0.9317880794701987, "grad_norm": 1.519313146047688, "learning_rate": 0.00021104999999999998, "loss": 2.875, "step": 1407 }, { "epoch": 0.9324503311258279, "grad_norm": 1.2564930413360058, "learning_rate": 0.00021119999999999996, "loss": 2.6562, "step": 1408 }, { "epoch": 0.9331125827814569, "grad_norm": 1.2642865821304912, "learning_rate": 0.00021135, "loss": 2.5625, "step": 1409 }, { "epoch": 0.9337748344370861, "grad_norm": 1.2435791296096543, "learning_rate": 0.00021149999999999996, "loss": 2.7031, "step": 1410 }, { "epoch": 0.9344370860927153, "grad_norm": 1.4854880780755986, "learning_rate": 0.00021165, "loss": 3.0469, "step": 1411 }, { "epoch": 0.9350993377483444, "grad_norm": 1.378053261340132, "learning_rate": 0.00021179999999999997, "loss": 2.75, "step": 1412 }, { "epoch": 0.9357615894039735, "grad_norm": 1.4096308458980293, "learning_rate": 0.00021194999999999997, "loss": 3.0156, "step": 1413 }, { "epoch": 0.9364238410596026, "grad_norm": 1.6356674337850206, "learning_rate": 0.00021209999999999998, "loss": 2.7344, "step": 1414 }, { "epoch": 0.9370860927152318, "grad_norm": 1.5192142930479984, "learning_rate": 0.00021224999999999998, "loss": 2.75, "step": 1415 }, { "epoch": 0.937748344370861, "grad_norm": 1.501254080021138, "learning_rate": 0.00021239999999999996, "loss": 2.8906, "step": 1416 }, { "epoch": 0.93841059602649, "grad_norm": 1.394566827391653, "learning_rate": 0.00021255, "loss": 2.8125, "step": 1417 }, { "epoch": 0.9390728476821192, "grad_norm": 1.316154987913069, "learning_rate": 0.00021269999999999997, "loss": 2.4531, "step": 1418 }, { "epoch": 0.9397350993377483, "grad_norm": 1.270056255112331, "learning_rate": 0.00021285, "loss": 2.5781, "step": 1419 }, { "epoch": 0.9403973509933775, "grad_norm": 1.5399642317981366, "learning_rate": 0.00021299999999999997, "loss": 2.8281, "step": 1420 }, { "epoch": 0.9410596026490067, "grad_norm": 1.2269499858746964, "learning_rate": 0.00021314999999999998, "loss": 2.7344, "step": 1421 }, { "epoch": 0.9417218543046357, "grad_norm": 1.3888560618557948, "learning_rate": 0.00021329999999999998, "loss": 2.8594, "step": 1422 }, { "epoch": 0.9423841059602649, "grad_norm": 1.3530706227235787, "learning_rate": 0.00021344999999999998, "loss": 2.6406, "step": 1423 }, { "epoch": 0.9430463576158941, "grad_norm": 1.3544176318893908, "learning_rate": 0.00021359999999999996, "loss": 2.625, "step": 1424 }, { "epoch": 0.9437086092715232, "grad_norm": 1.4032200727843989, "learning_rate": 0.00021375, "loss": 2.7969, "step": 1425 }, { "epoch": 0.9443708609271523, "grad_norm": 1.4600841457367302, "learning_rate": 0.00021389999999999997, "loss": 2.6875, "step": 1426 }, { "epoch": 0.9450331125827814, "grad_norm": 1.3333425334918014, "learning_rate": 0.00021405, "loss": 2.8281, "step": 1427 }, { "epoch": 0.9456953642384106, "grad_norm": 1.5317319637065545, "learning_rate": 0.00021419999999999998, "loss": 2.7656, "step": 1428 }, { "epoch": 0.9463576158940398, "grad_norm": 1.3257516968812817, "learning_rate": 0.00021434999999999998, "loss": 2.7812, "step": 1429 }, { "epoch": 0.9470198675496688, "grad_norm": 4.545829716436066, "learning_rate": 0.00021449999999999998, "loss": 2.6875, "step": 1430 }, { "epoch": 0.947682119205298, "grad_norm": 1.3742459724148652, "learning_rate": 0.00021464999999999999, "loss": 2.7031, "step": 1431 }, { "epoch": 0.9483443708609272, "grad_norm": 1.5111068358181001, "learning_rate": 0.00021479999999999996, "loss": 2.8281, "step": 1432 }, { "epoch": 0.9490066225165563, "grad_norm": 1.3401822541097885, "learning_rate": 0.00021495, "loss": 2.9688, "step": 1433 }, { "epoch": 0.9496688741721855, "grad_norm": 1.2808650841947398, "learning_rate": 0.00021509999999999997, "loss": 2.7031, "step": 1434 }, { "epoch": 0.9503311258278145, "grad_norm": 1.393405026829189, "learning_rate": 0.00021525, "loss": 2.625, "step": 1435 }, { "epoch": 0.9509933774834437, "grad_norm": 1.3190900170936124, "learning_rate": 0.00021539999999999998, "loss": 2.2031, "step": 1436 }, { "epoch": 0.9516556291390729, "grad_norm": 1.4694674081498431, "learning_rate": 0.00021554999999999998, "loss": 2.9219, "step": 1437 }, { "epoch": 0.952317880794702, "grad_norm": 1.716467710084653, "learning_rate": 0.00021569999999999998, "loss": 2.6875, "step": 1438 }, { "epoch": 0.9529801324503311, "grad_norm": 1.3657841980107903, "learning_rate": 0.00021585, "loss": 2.6562, "step": 1439 }, { "epoch": 0.9536423841059603, "grad_norm": 1.3364184085402722, "learning_rate": 0.00021599999999999996, "loss": 2.75, "step": 1440 }, { "epoch": 0.9543046357615894, "grad_norm": 1.5110590546223863, "learning_rate": 0.00021615, "loss": 2.9844, "step": 1441 }, { "epoch": 0.9549668874172186, "grad_norm": 1.4377294927026996, "learning_rate": 0.00021629999999999997, "loss": 2.9375, "step": 1442 }, { "epoch": 0.9556291390728476, "grad_norm": 1.294891873077855, "learning_rate": 0.00021645, "loss": 2.4844, "step": 1443 }, { "epoch": 0.9562913907284768, "grad_norm": 1.317993300421824, "learning_rate": 0.00021659999999999998, "loss": 2.6406, "step": 1444 }, { "epoch": 0.956953642384106, "grad_norm": 1.3512037382647744, "learning_rate": 0.00021674999999999998, "loss": 2.8281, "step": 1445 }, { "epoch": 0.9576158940397351, "grad_norm": 1.3238621982766001, "learning_rate": 0.0002169, "loss": 2.8438, "step": 1446 }, { "epoch": 0.9582781456953643, "grad_norm": 1.4602664147481539, "learning_rate": 0.00021705, "loss": 3.0, "step": 1447 }, { "epoch": 0.9589403973509933, "grad_norm": 1.3174064478315253, "learning_rate": 0.00021719999999999997, "loss": 2.7969, "step": 1448 }, { "epoch": 0.9596026490066225, "grad_norm": 1.312557581970755, "learning_rate": 0.00021735, "loss": 2.7188, "step": 1449 }, { "epoch": 0.9602649006622517, "grad_norm": 1.2076507764658608, "learning_rate": 0.00021749999999999997, "loss": 2.5312, "step": 1450 }, { "epoch": 0.9609271523178808, "grad_norm": 1.3350720202564774, "learning_rate": 0.00021764999999999998, "loss": 2.8438, "step": 1451 }, { "epoch": 0.9615894039735099, "grad_norm": 1.3047772135309976, "learning_rate": 0.00021779999999999998, "loss": 2.4844, "step": 1452 }, { "epoch": 0.9622516556291391, "grad_norm": 1.352514392344182, "learning_rate": 0.00021794999999999999, "loss": 2.8438, "step": 1453 }, { "epoch": 0.9629139072847682, "grad_norm": 1.288692855024458, "learning_rate": 0.00021809999999999996, "loss": 2.4062, "step": 1454 }, { "epoch": 0.9635761589403974, "grad_norm": 1.324059219810981, "learning_rate": 0.00021825, "loss": 2.6875, "step": 1455 }, { "epoch": 0.9642384105960264, "grad_norm": 1.3247575800966827, "learning_rate": 0.00021839999999999997, "loss": 2.8438, "step": 1456 }, { "epoch": 0.9649006622516556, "grad_norm": 1.3968875007793182, "learning_rate": 0.00021855, "loss": 2.8906, "step": 1457 }, { "epoch": 0.9655629139072848, "grad_norm": 1.271406317706869, "learning_rate": 0.00021869999999999998, "loss": 2.5781, "step": 1458 }, { "epoch": 0.9662251655629139, "grad_norm": 1.27000537797962, "learning_rate": 0.00021884999999999998, "loss": 2.6094, "step": 1459 }, { "epoch": 0.9668874172185431, "grad_norm": 1.2720938100691048, "learning_rate": 0.00021899999999999998, "loss": 2.7656, "step": 1460 }, { "epoch": 0.9675496688741722, "grad_norm": 1.4034429717771355, "learning_rate": 0.00021915, "loss": 2.8281, "step": 1461 }, { "epoch": 0.9682119205298013, "grad_norm": 1.281269090662935, "learning_rate": 0.00021929999999999996, "loss": 2.4375, "step": 1462 }, { "epoch": 0.9688741721854305, "grad_norm": 1.3428427551068691, "learning_rate": 0.00021945, "loss": 2.6562, "step": 1463 }, { "epoch": 0.9695364238410596, "grad_norm": 1.3036853360893546, "learning_rate": 0.00021959999999999997, "loss": 2.6094, "step": 1464 }, { "epoch": 0.9701986754966887, "grad_norm": 1.3470144851290966, "learning_rate": 0.00021975, "loss": 2.8281, "step": 1465 }, { "epoch": 0.9708609271523179, "grad_norm": 1.4369795624670356, "learning_rate": 0.00021989999999999998, "loss": 2.6875, "step": 1466 }, { "epoch": 0.971523178807947, "grad_norm": 1.4456095040656771, "learning_rate": 0.00022004999999999998, "loss": 2.875, "step": 1467 }, { "epoch": 0.9721854304635762, "grad_norm": 1.4529721352808063, "learning_rate": 0.00022019999999999999, "loss": 2.7344, "step": 1468 }, { "epoch": 0.9728476821192052, "grad_norm": 1.254798333969733, "learning_rate": 0.00022035, "loss": 2.7656, "step": 1469 }, { "epoch": 0.9735099337748344, "grad_norm": 1.507387495013173, "learning_rate": 0.00022049999999999997, "loss": 2.6562, "step": 1470 }, { "epoch": 0.9741721854304636, "grad_norm": 1.478097855643656, "learning_rate": 0.00022065, "loss": 2.9531, "step": 1471 }, { "epoch": 0.9748344370860927, "grad_norm": 1.3885810640699483, "learning_rate": 0.00022079999999999997, "loss": 2.7031, "step": 1472 }, { "epoch": 0.9754966887417219, "grad_norm": 1.3301416729862565, "learning_rate": 0.00022095, "loss": 2.6406, "step": 1473 }, { "epoch": 0.976158940397351, "grad_norm": 1.3593749113187064, "learning_rate": 0.00022109999999999998, "loss": 2.2656, "step": 1474 }, { "epoch": 0.9768211920529801, "grad_norm": 1.3769146432485133, "learning_rate": 0.00022124999999999998, "loss": 2.5781, "step": 1475 }, { "epoch": 0.9774834437086093, "grad_norm": 1.369152044166675, "learning_rate": 0.0002214, "loss": 2.6406, "step": 1476 }, { "epoch": 0.9781456953642385, "grad_norm": 1.6658279351130625, "learning_rate": 0.00022155, "loss": 2.8594, "step": 1477 }, { "epoch": 0.9788079470198675, "grad_norm": 1.39317837183439, "learning_rate": 0.00022169999999999997, "loss": 2.9531, "step": 1478 }, { "epoch": 0.9794701986754967, "grad_norm": 1.2817190345444938, "learning_rate": 0.00022185, "loss": 2.7031, "step": 1479 }, { "epoch": 0.9801324503311258, "grad_norm": 1.3399915666046203, "learning_rate": 0.00022199999999999998, "loss": 2.7812, "step": 1480 }, { "epoch": 0.980794701986755, "grad_norm": 1.3255547633109914, "learning_rate": 0.00022215, "loss": 2.4688, "step": 1481 }, { "epoch": 0.9814569536423841, "grad_norm": 1.398686569630114, "learning_rate": 0.00022229999999999998, "loss": 2.7344, "step": 1482 }, { "epoch": 0.9821192052980132, "grad_norm": 1.4177146429090006, "learning_rate": 0.00022244999999999999, "loss": 2.7812, "step": 1483 }, { "epoch": 0.9827814569536424, "grad_norm": 1.2256578714309039, "learning_rate": 0.0002226, "loss": 2.5938, "step": 1484 }, { "epoch": 0.9834437086092715, "grad_norm": 1.3343181263324553, "learning_rate": 0.00022275, "loss": 2.6406, "step": 1485 }, { "epoch": 0.9841059602649007, "grad_norm": 1.449724238441013, "learning_rate": 0.00022289999999999997, "loss": 2.9688, "step": 1486 }, { "epoch": 0.9847682119205298, "grad_norm": 1.1672030860650229, "learning_rate": 0.00022305, "loss": 2.4219, "step": 1487 }, { "epoch": 0.9854304635761589, "grad_norm": 1.4158786412734308, "learning_rate": 0.00022319999999999998, "loss": 2.7344, "step": 1488 }, { "epoch": 0.9860927152317881, "grad_norm": 1.3640840389080837, "learning_rate": 0.00022335, "loss": 2.9531, "step": 1489 }, { "epoch": 0.9867549668874173, "grad_norm": 1.296678057042255, "learning_rate": 0.00022349999999999998, "loss": 2.8438, "step": 1490 }, { "epoch": 0.9874172185430463, "grad_norm": 1.3785105930686457, "learning_rate": 0.00022365, "loss": 2.1719, "step": 1491 }, { "epoch": 0.9880794701986755, "grad_norm": 1.336866249430873, "learning_rate": 0.0002238, "loss": 2.5469, "step": 1492 }, { "epoch": 0.9887417218543046, "grad_norm": 1.2826570369271129, "learning_rate": 0.00022395, "loss": 2.5469, "step": 1493 }, { "epoch": 0.9894039735099338, "grad_norm": 1.368318593098838, "learning_rate": 0.00022409999999999997, "loss": 2.7031, "step": 1494 }, { "epoch": 0.9900662251655629, "grad_norm": 1.3391828909053034, "learning_rate": 0.00022425, "loss": 2.625, "step": 1495 }, { "epoch": 0.990728476821192, "grad_norm": 1.358492861399515, "learning_rate": 0.00022439999999999998, "loss": 2.5469, "step": 1496 }, { "epoch": 0.9913907284768212, "grad_norm": 1.3565271494003894, "learning_rate": 0.00022455, "loss": 2.8281, "step": 1497 }, { "epoch": 0.9920529801324504, "grad_norm": 1.1957861482563874, "learning_rate": 0.0002247, "loss": 2.1406, "step": 1498 }, { "epoch": 0.9927152317880795, "grad_norm": 1.3805488818206735, "learning_rate": 0.00022485, "loss": 2.7344, "step": 1499 }, { "epoch": 0.9933774834437086, "grad_norm": 1.4317760344078536, "learning_rate": 0.000225, "loss": 2.6562, "step": 1500 }, { "epoch": 0.9940397350993377, "grad_norm": 1.3471406939170751, "learning_rate": 0.00022514999999999997, "loss": 2.8125, "step": 1501 }, { "epoch": 0.9947019867549669, "grad_norm": 1.270885753278136, "learning_rate": 0.00022529999999999997, "loss": 2.7344, "step": 1502 }, { "epoch": 0.9953642384105961, "grad_norm": 1.5625251777065006, "learning_rate": 0.00022544999999999995, "loss": 2.7188, "step": 1503 }, { "epoch": 0.9960264900662251, "grad_norm": 1.4056014700380517, "learning_rate": 0.00022559999999999998, "loss": 2.7031, "step": 1504 }, { "epoch": 0.9966887417218543, "grad_norm": 1.4440078609057376, "learning_rate": 0.00022574999999999996, "loss": 2.8906, "step": 1505 }, { "epoch": 0.9973509933774835, "grad_norm": 1.4636963433470624, "learning_rate": 0.0002259, "loss": 3.0156, "step": 1506 }, { "epoch": 0.9980132450331126, "grad_norm": 1.4122316232867143, "learning_rate": 0.00022604999999999997, "loss": 2.6562, "step": 1507 }, { "epoch": 0.9986754966887417, "grad_norm": 1.1385473039829144, "learning_rate": 0.00022619999999999997, "loss": 2.5, "step": 1508 }, { "epoch": 0.9993377483443708, "grad_norm": 1.3077440560589277, "learning_rate": 0.00022634999999999997, "loss": 2.6562, "step": 1509 }, { "epoch": 1.0, "grad_norm": 1.2105287244431298, "learning_rate": 0.00022649999999999998, "loss": 2.7969, "step": 1510 }, { "epoch": 1.0, "eval_loss": 2.625924587249756, "eval_runtime": 34.0853, "eval_samples_per_second": 9.916, "eval_steps_per_second": 9.916, "step": 1510 }, { "epoch": 1.0006622516556292, "grad_norm": 1.2584246200918825, "learning_rate": 0.00022664999999999995, "loss": 2.6406, "step": 1511 }, { "epoch": 1.0013245033112583, "grad_norm": 1.4771743635453922, "learning_rate": 0.00022679999999999998, "loss": 2.7969, "step": 1512 }, { "epoch": 1.0019867549668875, "grad_norm": 1.4118511689346829, "learning_rate": 0.00022694999999999996, "loss": 2.5625, "step": 1513 }, { "epoch": 1.0026490066225167, "grad_norm": 1.2780776327953083, "learning_rate": 0.0002271, "loss": 2.6875, "step": 1514 }, { "epoch": 1.0033112582781456, "grad_norm": 1.2600893150163668, "learning_rate": 0.00022724999999999997, "loss": 2.7031, "step": 1515 }, { "epoch": 1.0039735099337748, "grad_norm": 1.2340528109906057, "learning_rate": 0.00022739999999999997, "loss": 2.6562, "step": 1516 }, { "epoch": 1.004635761589404, "grad_norm": 1.4209038924647308, "learning_rate": 0.00022754999999999997, "loss": 2.7344, "step": 1517 }, { "epoch": 1.005298013245033, "grad_norm": 1.2009470774561128, "learning_rate": 0.00022769999999999998, "loss": 2.9062, "step": 1518 }, { "epoch": 1.0059602649006623, "grad_norm": 1.4607176197867633, "learning_rate": 0.00022784999999999995, "loss": 2.5625, "step": 1519 }, { "epoch": 1.0066225165562914, "grad_norm": 1.329691605058324, "learning_rate": 0.00022799999999999999, "loss": 2.6719, "step": 1520 }, { "epoch": 1.0072847682119206, "grad_norm": 1.2032114553547824, "learning_rate": 0.00022814999999999996, "loss": 2.5156, "step": 1521 }, { "epoch": 1.0079470198675498, "grad_norm": 1.3572314784316437, "learning_rate": 0.0002283, "loss": 2.7188, "step": 1522 }, { "epoch": 1.008609271523179, "grad_norm": 1.2772922516951868, "learning_rate": 0.00022844999999999997, "loss": 2.7344, "step": 1523 }, { "epoch": 1.0092715231788079, "grad_norm": 1.329747253519728, "learning_rate": 0.00022859999999999997, "loss": 2.6875, "step": 1524 }, { "epoch": 1.009933774834437, "grad_norm": 1.2757010985116841, "learning_rate": 0.00022874999999999998, "loss": 2.7031, "step": 1525 }, { "epoch": 1.0105960264900662, "grad_norm": 1.2903175490841643, "learning_rate": 0.00022889999999999998, "loss": 2.7188, "step": 1526 }, { "epoch": 1.0112582781456954, "grad_norm": 1.2892690991267763, "learning_rate": 0.00022904999999999996, "loss": 2.9375, "step": 1527 }, { "epoch": 1.0119205298013245, "grad_norm": 1.3138798604299393, "learning_rate": 0.0002292, "loss": 2.9219, "step": 1528 }, { "epoch": 1.0125827814569537, "grad_norm": 1.2873922465196805, "learning_rate": 0.00022934999999999996, "loss": 2.2188, "step": 1529 }, { "epoch": 1.0132450331125828, "grad_norm": 1.3068908586498, "learning_rate": 0.0002295, "loss": 2.7344, "step": 1530 }, { "epoch": 1.013907284768212, "grad_norm": 1.2519276550832668, "learning_rate": 0.00022964999999999997, "loss": 2.6875, "step": 1531 }, { "epoch": 1.014569536423841, "grad_norm": 1.5501490897814858, "learning_rate": 0.00022979999999999997, "loss": 2.9062, "step": 1532 }, { "epoch": 1.0152317880794701, "grad_norm": 1.169911303649165, "learning_rate": 0.00022994999999999998, "loss": 2.1875, "step": 1533 }, { "epoch": 1.0158940397350993, "grad_norm": 1.1796369624920715, "learning_rate": 0.00023009999999999998, "loss": 2.6406, "step": 1534 }, { "epoch": 1.0165562913907285, "grad_norm": 1.317158553982051, "learning_rate": 0.00023024999999999996, "loss": 2.8125, "step": 1535 }, { "epoch": 1.0172185430463576, "grad_norm": 1.4408749064344644, "learning_rate": 0.0002304, "loss": 2.7344, "step": 1536 }, { "epoch": 1.0178807947019868, "grad_norm": 1.2105410731807122, "learning_rate": 0.00023054999999999997, "loss": 2.2812, "step": 1537 }, { "epoch": 1.018543046357616, "grad_norm": 1.3307348657727653, "learning_rate": 0.0002307, "loss": 2.6094, "step": 1538 }, { "epoch": 1.019205298013245, "grad_norm": 1.4312476891927959, "learning_rate": 0.00023084999999999997, "loss": 2.1719, "step": 1539 }, { "epoch": 1.0198675496688743, "grad_norm": 1.3398737919756114, "learning_rate": 0.00023099999999999998, "loss": 2.6094, "step": 1540 }, { "epoch": 1.0205298013245032, "grad_norm": 1.441897175374856, "learning_rate": 0.00023114999999999998, "loss": 2.7031, "step": 1541 }, { "epoch": 1.0211920529801324, "grad_norm": 1.3154895184656525, "learning_rate": 0.00023129999999999998, "loss": 2.6406, "step": 1542 }, { "epoch": 1.0218543046357615, "grad_norm": 1.0524561937157482, "learning_rate": 0.00023144999999999996, "loss": 2.0312, "step": 1543 }, { "epoch": 1.0225165562913907, "grad_norm": 1.3752767501037098, "learning_rate": 0.0002316, "loss": 2.5312, "step": 1544 }, { "epoch": 1.0231788079470199, "grad_norm": 1.4277261508161245, "learning_rate": 0.00023174999999999997, "loss": 2.8438, "step": 1545 }, { "epoch": 1.023841059602649, "grad_norm": 1.2234578987637423, "learning_rate": 0.0002319, "loss": 2.3125, "step": 1546 }, { "epoch": 1.0245033112582782, "grad_norm": 1.2494664433120775, "learning_rate": 0.00023204999999999998, "loss": 2.5469, "step": 1547 }, { "epoch": 1.0251655629139074, "grad_norm": 1.1840993924120304, "learning_rate": 0.00023219999999999998, "loss": 2.4219, "step": 1548 }, { "epoch": 1.0258278145695363, "grad_norm": 1.1999999896759703, "learning_rate": 0.00023234999999999998, "loss": 2.4531, "step": 1549 }, { "epoch": 1.0264900662251655, "grad_norm": 1.1472696922454548, "learning_rate": 0.00023249999999999999, "loss": 2.7656, "step": 1550 }, { "epoch": 1.0271523178807946, "grad_norm": 1.2133862408287155, "learning_rate": 0.00023264999999999996, "loss": 2.4688, "step": 1551 }, { "epoch": 1.0278145695364238, "grad_norm": 1.2015173124294138, "learning_rate": 0.0002328, "loss": 2.7031, "step": 1552 }, { "epoch": 1.028476821192053, "grad_norm": 1.1870154402479058, "learning_rate": 0.00023294999999999997, "loss": 2.5312, "step": 1553 }, { "epoch": 1.0291390728476821, "grad_norm": 1.2392996326108534, "learning_rate": 0.00023309999999999997, "loss": 2.6094, "step": 1554 }, { "epoch": 1.0298013245033113, "grad_norm": 1.1190456785007425, "learning_rate": 0.00023324999999999998, "loss": 2.6094, "step": 1555 }, { "epoch": 1.0304635761589405, "grad_norm": 1.2978419302281687, "learning_rate": 0.00023339999999999998, "loss": 2.5781, "step": 1556 }, { "epoch": 1.0311258278145696, "grad_norm": 1.3405730590307243, "learning_rate": 0.00023354999999999996, "loss": 2.7188, "step": 1557 }, { "epoch": 1.0317880794701986, "grad_norm": 1.3161156613368452, "learning_rate": 0.0002337, "loss": 2.7188, "step": 1558 }, { "epoch": 1.0324503311258277, "grad_norm": 1.289421967288709, "learning_rate": 0.00023384999999999997, "loss": 2.7656, "step": 1559 }, { "epoch": 1.033112582781457, "grad_norm": 1.1236335256311452, "learning_rate": 0.000234, "loss": 2.4531, "step": 1560 }, { "epoch": 1.033774834437086, "grad_norm": 1.1581344660593105, "learning_rate": 0.00023414999999999997, "loss": 2.7969, "step": 1561 }, { "epoch": 1.0344370860927152, "grad_norm": 1.1788368884894007, "learning_rate": 0.00023429999999999998, "loss": 2.6562, "step": 1562 }, { "epoch": 1.0350993377483444, "grad_norm": 1.260191740547071, "learning_rate": 0.00023444999999999998, "loss": 2.6875, "step": 1563 }, { "epoch": 1.0357615894039736, "grad_norm": 1.2217913822762283, "learning_rate": 0.00023459999999999998, "loss": 2.8125, "step": 1564 }, { "epoch": 1.0364238410596027, "grad_norm": 1.2824185359919227, "learning_rate": 0.00023474999999999996, "loss": 2.4844, "step": 1565 }, { "epoch": 1.0370860927152319, "grad_norm": 1.096439129107709, "learning_rate": 0.0002349, "loss": 2.5312, "step": 1566 }, { "epoch": 1.0377483443708608, "grad_norm": 1.3737963039997907, "learning_rate": 0.00023504999999999997, "loss": 2.6562, "step": 1567 }, { "epoch": 1.03841059602649, "grad_norm": 1.3094219829606462, "learning_rate": 0.0002352, "loss": 2.5469, "step": 1568 }, { "epoch": 1.0390728476821192, "grad_norm": 1.2621084239078817, "learning_rate": 0.00023534999999999997, "loss": 2.5156, "step": 1569 }, { "epoch": 1.0397350993377483, "grad_norm": 1.321421061278166, "learning_rate": 0.00023549999999999998, "loss": 2.7188, "step": 1570 }, { "epoch": 1.0403973509933775, "grad_norm": 1.2486199499154973, "learning_rate": 0.00023564999999999998, "loss": 2.7031, "step": 1571 }, { "epoch": 1.0410596026490067, "grad_norm": 1.0981257589378932, "learning_rate": 0.00023579999999999999, "loss": 2.2031, "step": 1572 }, { "epoch": 1.0417218543046358, "grad_norm": 1.343385244858492, "learning_rate": 0.00023594999999999996, "loss": 2.7344, "step": 1573 }, { "epoch": 1.042384105960265, "grad_norm": 1.1794000908581665, "learning_rate": 0.0002361, "loss": 2.625, "step": 1574 }, { "epoch": 1.0430463576158941, "grad_norm": 1.1591135173877467, "learning_rate": 0.00023624999999999997, "loss": 2.8438, "step": 1575 }, { "epoch": 1.043708609271523, "grad_norm": 1.1676763218705613, "learning_rate": 0.0002364, "loss": 2.4062, "step": 1576 }, { "epoch": 1.0443708609271523, "grad_norm": 1.2244496284831055, "learning_rate": 0.00023654999999999998, "loss": 2.2031, "step": 1577 }, { "epoch": 1.0450331125827814, "grad_norm": 1.3276820451425495, "learning_rate": 0.00023669999999999998, "loss": 2.6094, "step": 1578 }, { "epoch": 1.0456953642384106, "grad_norm": 1.6104766702295796, "learning_rate": 0.00023684999999999998, "loss": 2.5312, "step": 1579 }, { "epoch": 1.0463576158940397, "grad_norm": 1.4810454758411034, "learning_rate": 0.000237, "loss": 2.5781, "step": 1580 }, { "epoch": 1.047019867549669, "grad_norm": 1.172700561529911, "learning_rate": 0.00023714999999999996, "loss": 2.6562, "step": 1581 }, { "epoch": 1.047682119205298, "grad_norm": 1.453269326375865, "learning_rate": 0.0002373, "loss": 3.0469, "step": 1582 }, { "epoch": 1.0483443708609272, "grad_norm": 1.2922896956287713, "learning_rate": 0.00023744999999999997, "loss": 2.5781, "step": 1583 }, { "epoch": 1.0490066225165562, "grad_norm": 1.2038122677186902, "learning_rate": 0.0002376, "loss": 2.5156, "step": 1584 }, { "epoch": 1.0496688741721854, "grad_norm": 1.2329090917741925, "learning_rate": 0.00023774999999999998, "loss": 2.6719, "step": 1585 }, { "epoch": 1.0503311258278145, "grad_norm": 1.5076944294027734, "learning_rate": 0.00023789999999999998, "loss": 2.9844, "step": 1586 }, { "epoch": 1.0509933774834437, "grad_norm": 1.2657993003787582, "learning_rate": 0.00023804999999999999, "loss": 2.7812, "step": 1587 }, { "epoch": 1.0516556291390728, "grad_norm": 1.2728062099835233, "learning_rate": 0.0002382, "loss": 2.7656, "step": 1588 }, { "epoch": 1.052317880794702, "grad_norm": 1.245135169605465, "learning_rate": 0.00023834999999999997, "loss": 2.6875, "step": 1589 }, { "epoch": 1.0529801324503312, "grad_norm": 1.4250464798427807, "learning_rate": 0.0002385, "loss": 2.1875, "step": 1590 }, { "epoch": 1.0536423841059603, "grad_norm": 1.1954242051112403, "learning_rate": 0.00023864999999999997, "loss": 2.7812, "step": 1591 }, { "epoch": 1.0543046357615895, "grad_norm": 1.1388735172418343, "learning_rate": 0.0002388, "loss": 2.6719, "step": 1592 }, { "epoch": 1.0549668874172184, "grad_norm": 1.3377942515109469, "learning_rate": 0.00023894999999999998, "loss": 2.7812, "step": 1593 }, { "epoch": 1.0556291390728476, "grad_norm": 1.3001557888599227, "learning_rate": 0.00023909999999999998, "loss": 2.3438, "step": 1594 }, { "epoch": 1.0562913907284768, "grad_norm": 1.2221372185665975, "learning_rate": 0.00023925, "loss": 2.625, "step": 1595 }, { "epoch": 1.056953642384106, "grad_norm": 1.1734110574068584, "learning_rate": 0.0002394, "loss": 2.6406, "step": 1596 }, { "epoch": 1.057615894039735, "grad_norm": 1.1710055444077583, "learning_rate": 0.00023954999999999997, "loss": 2.4375, "step": 1597 }, { "epoch": 1.0582781456953643, "grad_norm": 1.1088485707544646, "learning_rate": 0.0002397, "loss": 2.5938, "step": 1598 }, { "epoch": 1.0589403973509934, "grad_norm": 1.1576755756800217, "learning_rate": 0.00023984999999999998, "loss": 2.625, "step": 1599 }, { "epoch": 1.0596026490066226, "grad_norm": 1.3247251491326573, "learning_rate": 0.00023999999999999998, "loss": 2.8906, "step": 1600 }, { "epoch": 1.0602649006622515, "grad_norm": 1.1981481143974533, "learning_rate": 0.00024014999999999998, "loss": 2.75, "step": 1601 }, { "epoch": 1.0609271523178807, "grad_norm": 1.3537667539769362, "learning_rate": 0.00024029999999999999, "loss": 2.8281, "step": 1602 }, { "epoch": 1.0615894039735099, "grad_norm": 1.2772292682770163, "learning_rate": 0.00024044999999999996, "loss": 2.3594, "step": 1603 }, { "epoch": 1.062251655629139, "grad_norm": 1.2365721765500288, "learning_rate": 0.0002406, "loss": 2.5312, "step": 1604 }, { "epoch": 1.0629139072847682, "grad_norm": 1.250090677626946, "learning_rate": 0.00024074999999999997, "loss": 2.7188, "step": 1605 }, { "epoch": 1.0635761589403974, "grad_norm": 1.27388031390634, "learning_rate": 0.0002409, "loss": 2.5781, "step": 1606 }, { "epoch": 1.0642384105960265, "grad_norm": 1.2312540272219987, "learning_rate": 0.00024104999999999998, "loss": 2.4062, "step": 1607 }, { "epoch": 1.0649006622516557, "grad_norm": 1.3555542209281282, "learning_rate": 0.00024119999999999998, "loss": 2.7031, "step": 1608 }, { "epoch": 1.0655629139072849, "grad_norm": 1.1456001352380034, "learning_rate": 0.00024134999999999998, "loss": 2.625, "step": 1609 }, { "epoch": 1.0662251655629138, "grad_norm": 1.2047348141768965, "learning_rate": 0.0002415, "loss": 2.5156, "step": 1610 }, { "epoch": 1.066887417218543, "grad_norm": 1.26752882240382, "learning_rate": 0.00024164999999999996, "loss": 2.5625, "step": 1611 }, { "epoch": 1.0675496688741721, "grad_norm": 1.1995893800531765, "learning_rate": 0.0002418, "loss": 2.7656, "step": 1612 }, { "epoch": 1.0682119205298013, "grad_norm": 1.273182808041412, "learning_rate": 0.00024194999999999997, "loss": 2.3281, "step": 1613 }, { "epoch": 1.0688741721854305, "grad_norm": 1.231035323469265, "learning_rate": 0.0002421, "loss": 2.8125, "step": 1614 }, { "epoch": 1.0695364238410596, "grad_norm": 1.332825608098906, "learning_rate": 0.00024224999999999998, "loss": 2.7031, "step": 1615 }, { "epoch": 1.0701986754966888, "grad_norm": 1.2130123545552294, "learning_rate": 0.00024239999999999998, "loss": 2.2031, "step": 1616 }, { "epoch": 1.070860927152318, "grad_norm": 1.2671768177985228, "learning_rate": 0.00024255, "loss": 2.8281, "step": 1617 }, { "epoch": 1.0715231788079471, "grad_norm": 1.333311804670691, "learning_rate": 0.0002427, "loss": 2.5, "step": 1618 }, { "epoch": 1.072185430463576, "grad_norm": 1.2858043740631016, "learning_rate": 0.00024284999999999997, "loss": 2.5312, "step": 1619 }, { "epoch": 1.0728476821192052, "grad_norm": 1.3361191899024603, "learning_rate": 0.000243, "loss": 2.875, "step": 1620 }, { "epoch": 1.0735099337748344, "grad_norm": 1.2260415764590065, "learning_rate": 0.00024314999999999997, "loss": 2.75, "step": 1621 }, { "epoch": 1.0741721854304636, "grad_norm": 1.076658716628758, "learning_rate": 0.0002433, "loss": 2.5469, "step": 1622 }, { "epoch": 1.0748344370860927, "grad_norm": 1.4613902109563353, "learning_rate": 0.00024344999999999998, "loss": 2.8438, "step": 1623 }, { "epoch": 1.0754966887417219, "grad_norm": 1.210706024203546, "learning_rate": 0.00024359999999999999, "loss": 2.5781, "step": 1624 }, { "epoch": 1.076158940397351, "grad_norm": 1.1166941573679585, "learning_rate": 0.00024375, "loss": 2.6875, "step": 1625 }, { "epoch": 1.0768211920529802, "grad_norm": 1.2132920715234536, "learning_rate": 0.00024389999999999997, "loss": 2.4531, "step": 1626 }, { "epoch": 1.0774834437086094, "grad_norm": 1.4011719166615808, "learning_rate": 0.00024404999999999997, "loss": 2.8125, "step": 1627 }, { "epoch": 1.0781456953642383, "grad_norm": 1.4141550951982462, "learning_rate": 0.00024419999999999997, "loss": 2.7344, "step": 1628 }, { "epoch": 1.0788079470198675, "grad_norm": 1.1537325697006269, "learning_rate": 0.00024435, "loss": 2.25, "step": 1629 }, { "epoch": 1.0794701986754967, "grad_norm": 1.421258144061272, "learning_rate": 0.0002445, "loss": 2.9375, "step": 1630 }, { "epoch": 1.0801324503311258, "grad_norm": 1.2052956901126441, "learning_rate": 0.00024464999999999996, "loss": 2.6406, "step": 1631 }, { "epoch": 1.080794701986755, "grad_norm": 1.1730647179923297, "learning_rate": 0.0002448, "loss": 2.9062, "step": 1632 }, { "epoch": 1.0814569536423841, "grad_norm": 1.2074540396578712, "learning_rate": 0.00024494999999999996, "loss": 2.4219, "step": 1633 }, { "epoch": 1.0821192052980133, "grad_norm": 1.1492915684813103, "learning_rate": 0.00024509999999999994, "loss": 2.8906, "step": 1634 }, { "epoch": 1.0827814569536425, "grad_norm": 1.238133783864804, "learning_rate": 0.00024524999999999997, "loss": 2.5938, "step": 1635 }, { "epoch": 1.0834437086092716, "grad_norm": 1.1827514201750082, "learning_rate": 0.00024539999999999995, "loss": 2.3594, "step": 1636 }, { "epoch": 1.0841059602649006, "grad_norm": 1.052229407613025, "learning_rate": 0.00024555, "loss": 2.5, "step": 1637 }, { "epoch": 1.0847682119205297, "grad_norm": 1.2732819129812245, "learning_rate": 0.00024569999999999995, "loss": 2.6094, "step": 1638 }, { "epoch": 1.085430463576159, "grad_norm": 1.2875860681034854, "learning_rate": 0.00024585, "loss": 2.7344, "step": 1639 }, { "epoch": 1.086092715231788, "grad_norm": 1.1405165421664474, "learning_rate": 0.00024599999999999996, "loss": 2.5781, "step": 1640 }, { "epoch": 1.0867549668874172, "grad_norm": 1.2120375235367489, "learning_rate": 0.00024615, "loss": 2.4531, "step": 1641 }, { "epoch": 1.0874172185430464, "grad_norm": 1.2397354601004646, "learning_rate": 0.00024629999999999997, "loss": 2.625, "step": 1642 }, { "epoch": 1.0880794701986756, "grad_norm": 1.1435946704433604, "learning_rate": 0.00024645, "loss": 2.75, "step": 1643 }, { "epoch": 1.0887417218543047, "grad_norm": 2.353588092960578, "learning_rate": 0.0002466, "loss": 2.4219, "step": 1644 }, { "epoch": 1.0894039735099337, "grad_norm": 1.2343085685969089, "learning_rate": 0.00024675, "loss": 2.5312, "step": 1645 }, { "epoch": 1.0900662251655628, "grad_norm": 1.246652446039875, "learning_rate": 0.0002469, "loss": 2.75, "step": 1646 }, { "epoch": 1.090728476821192, "grad_norm": 1.2455743322228694, "learning_rate": 0.00024704999999999996, "loss": 2.4531, "step": 1647 }, { "epoch": 1.0913907284768212, "grad_norm": 1.1590575700470773, "learning_rate": 0.0002472, "loss": 2.625, "step": 1648 }, { "epoch": 1.0920529801324503, "grad_norm": 1.188198469518599, "learning_rate": 0.00024734999999999997, "loss": 2.5625, "step": 1649 }, { "epoch": 1.0927152317880795, "grad_norm": 1.3896858328668413, "learning_rate": 0.00024749999999999994, "loss": 3.1094, "step": 1650 }, { "epoch": 1.0933774834437087, "grad_norm": 1.3182136720279225, "learning_rate": 0.00024765, "loss": 2.8906, "step": 1651 }, { "epoch": 1.0940397350993378, "grad_norm": 1.3558794789532171, "learning_rate": 0.00024779999999999995, "loss": 2.8281, "step": 1652 }, { "epoch": 1.0947019867549668, "grad_norm": 1.2896461186363017, "learning_rate": 0.00024795, "loss": 2.6094, "step": 1653 }, { "epoch": 1.095364238410596, "grad_norm": 1.3043000889636394, "learning_rate": 0.00024809999999999996, "loss": 2.5938, "step": 1654 }, { "epoch": 1.096026490066225, "grad_norm": 1.2844762333350255, "learning_rate": 0.00024825, "loss": 2.625, "step": 1655 }, { "epoch": 1.0966887417218543, "grad_norm": 1.2866248801237585, "learning_rate": 0.00024839999999999997, "loss": 2.2812, "step": 1656 }, { "epoch": 1.0973509933774834, "grad_norm": 1.312814696444647, "learning_rate": 0.00024855, "loss": 2.7188, "step": 1657 }, { "epoch": 1.0980132450331126, "grad_norm": 1.630454701991641, "learning_rate": 0.0002487, "loss": 2.6406, "step": 1658 }, { "epoch": 1.0986754966887418, "grad_norm": 1.1246236115513686, "learning_rate": 0.00024885, "loss": 2.6719, "step": 1659 }, { "epoch": 1.099337748344371, "grad_norm": 1.1126051895827604, "learning_rate": 0.000249, "loss": 2.2031, "step": 1660 }, { "epoch": 1.1, "grad_norm": 1.0972085560166904, "learning_rate": 0.00024914999999999996, "loss": 2.5312, "step": 1661 }, { "epoch": 1.100662251655629, "grad_norm": 1.3072905531956498, "learning_rate": 0.0002493, "loss": 2.9062, "step": 1662 }, { "epoch": 1.1013245033112582, "grad_norm": 1.4518018413097984, "learning_rate": 0.00024944999999999996, "loss": 2.8281, "step": 1663 }, { "epoch": 1.1019867549668874, "grad_norm": 1.20203731948028, "learning_rate": 0.00024959999999999994, "loss": 2.625, "step": 1664 }, { "epoch": 1.1026490066225165, "grad_norm": 1.1615835788212872, "learning_rate": 0.00024974999999999997, "loss": 2.4688, "step": 1665 }, { "epoch": 1.1033112582781457, "grad_norm": 1.2546269774309333, "learning_rate": 0.00024989999999999995, "loss": 2.7969, "step": 1666 }, { "epoch": 1.1039735099337749, "grad_norm": 1.1754226402658847, "learning_rate": 0.00025005, "loss": 2.6719, "step": 1667 }, { "epoch": 1.104635761589404, "grad_norm": 1.176987767046394, "learning_rate": 0.00025019999999999996, "loss": 2.3594, "step": 1668 }, { "epoch": 1.1052980132450332, "grad_norm": 1.1554922206862372, "learning_rate": 0.00025035, "loss": 2.4844, "step": 1669 }, { "epoch": 1.1059602649006623, "grad_norm": 1.0759195957639154, "learning_rate": 0.00025049999999999996, "loss": 2.4531, "step": 1670 }, { "epoch": 1.1066225165562913, "grad_norm": 1.1755267738732726, "learning_rate": 0.00025065, "loss": 2.7812, "step": 1671 }, { "epoch": 1.1072847682119205, "grad_norm": 1.3691510355513095, "learning_rate": 0.00025079999999999997, "loss": 2.8125, "step": 1672 }, { "epoch": 1.1079470198675496, "grad_norm": 1.133783529547844, "learning_rate": 0.00025095, "loss": 2.5781, "step": 1673 }, { "epoch": 1.1086092715231788, "grad_norm": 1.162411982950755, "learning_rate": 0.0002511, "loss": 2.5781, "step": 1674 }, { "epoch": 1.109271523178808, "grad_norm": 1.1715697240606722, "learning_rate": 0.00025125, "loss": 2.7344, "step": 1675 }, { "epoch": 1.1099337748344371, "grad_norm": 1.4625296971749757, "learning_rate": 0.0002514, "loss": 2.6094, "step": 1676 }, { "epoch": 1.1105960264900663, "grad_norm": 1.1710803086685577, "learning_rate": 0.00025154999999999996, "loss": 2.7656, "step": 1677 }, { "epoch": 1.1112582781456954, "grad_norm": 2.323879291813002, "learning_rate": 0.0002517, "loss": 2.1562, "step": 1678 }, { "epoch": 1.1119205298013246, "grad_norm": 1.1364833908535334, "learning_rate": 0.00025184999999999997, "loss": 2.625, "step": 1679 }, { "epoch": 1.1125827814569536, "grad_norm": 2.1486042567064962, "learning_rate": 0.00025199999999999995, "loss": 2.6406, "step": 1680 }, { "epoch": 1.1132450331125827, "grad_norm": 1.1634430751352385, "learning_rate": 0.00025215, "loss": 2.6094, "step": 1681 }, { "epoch": 1.1139072847682119, "grad_norm": 1.3609180209389597, "learning_rate": 0.00025229999999999995, "loss": 2.75, "step": 1682 }, { "epoch": 1.114569536423841, "grad_norm": 1.1500410418858928, "learning_rate": 0.00025245, "loss": 2.7031, "step": 1683 }, { "epoch": 1.1152317880794702, "grad_norm": 1.1877100251352473, "learning_rate": 0.00025259999999999996, "loss": 2.4844, "step": 1684 }, { "epoch": 1.1158940397350994, "grad_norm": 1.104707764604417, "learning_rate": 0.00025275, "loss": 2.5, "step": 1685 }, { "epoch": 1.1165562913907285, "grad_norm": 1.3214005237825592, "learning_rate": 0.00025289999999999997, "loss": 2.8281, "step": 1686 }, { "epoch": 1.1172185430463577, "grad_norm": 1.278809972119621, "learning_rate": 0.00025305, "loss": 2.5625, "step": 1687 }, { "epoch": 1.1178807947019869, "grad_norm": 1.091011274451508, "learning_rate": 0.0002532, "loss": 2.5156, "step": 1688 }, { "epoch": 1.1185430463576158, "grad_norm": 1.0410842105575855, "learning_rate": 0.00025335, "loss": 2.7344, "step": 1689 }, { "epoch": 1.119205298013245, "grad_norm": 1.1961207006253298, "learning_rate": 0.0002535, "loss": 2.7031, "step": 1690 }, { "epoch": 1.1198675496688741, "grad_norm": 1.3488806828599484, "learning_rate": 0.00025365, "loss": 2.875, "step": 1691 }, { "epoch": 1.1205298013245033, "grad_norm": 1.0579380401956666, "learning_rate": 0.0002538, "loss": 2.3594, "step": 1692 }, { "epoch": 1.1211920529801325, "grad_norm": 1.1650879118266533, "learning_rate": 0.00025394999999999997, "loss": 2.6562, "step": 1693 }, { "epoch": 1.1218543046357616, "grad_norm": 1.2260495678601286, "learning_rate": 0.0002541, "loss": 2.625, "step": 1694 }, { "epoch": 1.1225165562913908, "grad_norm": 1.072137291018581, "learning_rate": 0.00025425, "loss": 2.4531, "step": 1695 }, { "epoch": 1.12317880794702, "grad_norm": 1.3105079553922625, "learning_rate": 0.00025439999999999995, "loss": 2.7031, "step": 1696 }, { "epoch": 1.123841059602649, "grad_norm": 1.2354398698425346, "learning_rate": 0.00025455, "loss": 2.6562, "step": 1697 }, { "epoch": 1.124503311258278, "grad_norm": 1.0373715050130747, "learning_rate": 0.00025469999999999996, "loss": 2.4219, "step": 1698 }, { "epoch": 1.1251655629139072, "grad_norm": 1.2978610918200406, "learning_rate": 0.00025485, "loss": 2.8906, "step": 1699 }, { "epoch": 1.1258278145695364, "grad_norm": 1.2790280840366408, "learning_rate": 0.00025499999999999996, "loss": 2.7656, "step": 1700 }, { "epoch": 1.1264900662251656, "grad_norm": 1.159972164201933, "learning_rate": 0.00025515, "loss": 2.625, "step": 1701 }, { "epoch": 1.1271523178807947, "grad_norm": 1.0967399683412424, "learning_rate": 0.00025529999999999997, "loss": 2.4844, "step": 1702 }, { "epoch": 1.127814569536424, "grad_norm": 1.142315209595986, "learning_rate": 0.00025545, "loss": 2.5625, "step": 1703 }, { "epoch": 1.128476821192053, "grad_norm": 1.2966698100551113, "learning_rate": 0.0002556, "loss": 2.5781, "step": 1704 }, { "epoch": 1.129139072847682, "grad_norm": 1.165333404824752, "learning_rate": 0.00025575, "loss": 2.5625, "step": 1705 }, { "epoch": 1.1298013245033112, "grad_norm": 1.20836265587115, "learning_rate": 0.0002559, "loss": 2.6094, "step": 1706 }, { "epoch": 1.1304635761589403, "grad_norm": 1.3134063937569378, "learning_rate": 0.00025604999999999996, "loss": 2.6562, "step": 1707 }, { "epoch": 1.1311258278145695, "grad_norm": 1.1331269296615698, "learning_rate": 0.0002562, "loss": 2.5781, "step": 1708 }, { "epoch": 1.1317880794701987, "grad_norm": 1.3363666135147227, "learning_rate": 0.00025634999999999997, "loss": 2.7031, "step": 1709 }, { "epoch": 1.1324503311258278, "grad_norm": 1.4190394179186674, "learning_rate": 0.00025649999999999995, "loss": 2.7656, "step": 1710 }, { "epoch": 1.133112582781457, "grad_norm": 1.4684299382954138, "learning_rate": 0.00025665, "loss": 2.0938, "step": 1711 }, { "epoch": 1.1337748344370862, "grad_norm": 1.1017845971765399, "learning_rate": 0.00025679999999999995, "loss": 2.75, "step": 1712 }, { "epoch": 1.1344370860927153, "grad_norm": 1.136612360153799, "learning_rate": 0.00025695, "loss": 2.5781, "step": 1713 }, { "epoch": 1.1350993377483443, "grad_norm": 1.2545248225133467, "learning_rate": 0.00025709999999999996, "loss": 2.5312, "step": 1714 }, { "epoch": 1.1357615894039734, "grad_norm": 1.2591202972033981, "learning_rate": 0.00025725, "loss": 2.5938, "step": 1715 }, { "epoch": 1.1364238410596026, "grad_norm": 1.30873961947508, "learning_rate": 0.00025739999999999997, "loss": 2.6562, "step": 1716 }, { "epoch": 1.1370860927152318, "grad_norm": 1.2035134867073713, "learning_rate": 0.00025755, "loss": 2.4531, "step": 1717 }, { "epoch": 1.137748344370861, "grad_norm": 1.158631499261634, "learning_rate": 0.0002577, "loss": 2.2812, "step": 1718 }, { "epoch": 1.13841059602649, "grad_norm": 2.0932306768215154, "learning_rate": 0.00025785, "loss": 2.6406, "step": 1719 }, { "epoch": 1.1390728476821192, "grad_norm": 1.2269890503169205, "learning_rate": 0.000258, "loss": 2.2656, "step": 1720 }, { "epoch": 1.1397350993377484, "grad_norm": 1.066683277383856, "learning_rate": 0.00025815, "loss": 2.0938, "step": 1721 }, { "epoch": 1.1403973509933776, "grad_norm": 1.188651566352273, "learning_rate": 0.0002583, "loss": 2.5469, "step": 1722 }, { "epoch": 1.1410596026490065, "grad_norm": 4.702704389759291, "learning_rate": 0.00025844999999999997, "loss": 2.5781, "step": 1723 }, { "epoch": 1.1417218543046357, "grad_norm": 1.1918284630421399, "learning_rate": 0.0002586, "loss": 2.2969, "step": 1724 }, { "epoch": 1.1423841059602649, "grad_norm": 1.1595351168687273, "learning_rate": 0.00025875, "loss": 2.625, "step": 1725 }, { "epoch": 1.143046357615894, "grad_norm": 1.1159134780954056, "learning_rate": 0.00025889999999999995, "loss": 2.7188, "step": 1726 }, { "epoch": 1.1437086092715232, "grad_norm": 1.8241405672567241, "learning_rate": 0.00025905, "loss": 2.5469, "step": 1727 }, { "epoch": 1.1443708609271523, "grad_norm": 1.1784729568836416, "learning_rate": 0.00025919999999999996, "loss": 2.3125, "step": 1728 }, { "epoch": 1.1450331125827815, "grad_norm": 1.1386849788674025, "learning_rate": 0.00025935, "loss": 2.375, "step": 1729 }, { "epoch": 1.1456953642384107, "grad_norm": 1.23385921784263, "learning_rate": 0.00025949999999999997, "loss": 2.7031, "step": 1730 }, { "epoch": 1.1463576158940398, "grad_norm": 1.1210794209092474, "learning_rate": 0.00025965, "loss": 2.4688, "step": 1731 }, { "epoch": 1.1470198675496688, "grad_norm": 1.1569069294558514, "learning_rate": 0.00025979999999999997, "loss": 2.6562, "step": 1732 }, { "epoch": 1.147682119205298, "grad_norm": 10.521586044719268, "learning_rate": 0.00025995, "loss": 2.8438, "step": 1733 }, { "epoch": 1.148344370860927, "grad_norm": 1.3301959596358284, "learning_rate": 0.0002601, "loss": 2.6406, "step": 1734 }, { "epoch": 1.1490066225165563, "grad_norm": 8.120003968526621, "learning_rate": 0.00026025, "loss": 2.4219, "step": 1735 }, { "epoch": 1.1496688741721854, "grad_norm": 1.1207312445208972, "learning_rate": 0.0002604, "loss": 2.4844, "step": 1736 }, { "epoch": 1.1503311258278146, "grad_norm": 1.3120139170323604, "learning_rate": 0.00026055, "loss": 2.5469, "step": 1737 }, { "epoch": 1.1509933774834438, "grad_norm": 1.2958513980527286, "learning_rate": 0.0002607, "loss": 2.75, "step": 1738 }, { "epoch": 1.151655629139073, "grad_norm": 1.4140541985633366, "learning_rate": 0.00026084999999999997, "loss": 2.6875, "step": 1739 }, { "epoch": 1.152317880794702, "grad_norm": 1.207831720792249, "learning_rate": 0.000261, "loss": 2.6406, "step": 1740 }, { "epoch": 1.152980132450331, "grad_norm": 1.0937650766914155, "learning_rate": 0.00026115, "loss": 2.75, "step": 1741 }, { "epoch": 1.1536423841059602, "grad_norm": 1.1399716762029208, "learning_rate": 0.00026129999999999995, "loss": 2.5938, "step": 1742 }, { "epoch": 1.1543046357615894, "grad_norm": 1.1600664033985133, "learning_rate": 0.00026145, "loss": 2.7656, "step": 1743 }, { "epoch": 1.1549668874172185, "grad_norm": 1.2212739781012554, "learning_rate": 0.00026159999999999996, "loss": 2.7031, "step": 1744 }, { "epoch": 1.1556291390728477, "grad_norm": 1.138208459770132, "learning_rate": 0.00026175, "loss": 2.4844, "step": 1745 }, { "epoch": 1.1562913907284769, "grad_norm": 1.132639228330785, "learning_rate": 0.00026189999999999997, "loss": 2.4219, "step": 1746 }, { "epoch": 1.156953642384106, "grad_norm": 1.2796335914899115, "learning_rate": 0.00026205, "loss": 2.8125, "step": 1747 }, { "epoch": 1.1576158940397352, "grad_norm": 1.0897491600739748, "learning_rate": 0.0002622, "loss": 2.5625, "step": 1748 }, { "epoch": 1.1582781456953644, "grad_norm": 1.2290771899974522, "learning_rate": 0.00026235, "loss": 2.9531, "step": 1749 }, { "epoch": 1.1589403973509933, "grad_norm": 1.2031799640137273, "learning_rate": 0.0002625, "loss": 2.6562, "step": 1750 }, { "epoch": 1.1596026490066225, "grad_norm": 2.9651636154760133, "learning_rate": 0.00026264999999999996, "loss": 2.4375, "step": 1751 }, { "epoch": 1.1602649006622516, "grad_norm": 1.1813701470718891, "learning_rate": 0.0002628, "loss": 2.7188, "step": 1752 }, { "epoch": 1.1609271523178808, "grad_norm": 1.1737875231560162, "learning_rate": 0.00026294999999999997, "loss": 2.6094, "step": 1753 }, { "epoch": 1.16158940397351, "grad_norm": 1.285964083835266, "learning_rate": 0.0002631, "loss": 2.6406, "step": 1754 }, { "epoch": 1.1622516556291391, "grad_norm": 1.1717580706899449, "learning_rate": 0.00026325, "loss": 2.7344, "step": 1755 }, { "epoch": 1.1629139072847683, "grad_norm": 1.1143003248234185, "learning_rate": 0.00026339999999999995, "loss": 2.2969, "step": 1756 }, { "epoch": 1.1635761589403972, "grad_norm": 1.1414163884818587, "learning_rate": 0.00026355, "loss": 2.6719, "step": 1757 }, { "epoch": 1.1642384105960264, "grad_norm": 1.0742409182865917, "learning_rate": 0.00026369999999999996, "loss": 2.1875, "step": 1758 }, { "epoch": 1.1649006622516556, "grad_norm": 1.1572617454121166, "learning_rate": 0.00026384999999999994, "loss": 2.5469, "step": 1759 }, { "epoch": 1.1655629139072847, "grad_norm": 1.050866407227508, "learning_rate": 0.00026399999999999997, "loss": 2.1094, "step": 1760 }, { "epoch": 1.166225165562914, "grad_norm": 1.1222401063498422, "learning_rate": 0.00026414999999999994, "loss": 2.6562, "step": 1761 }, { "epoch": 1.166887417218543, "grad_norm": 1.1194411807069933, "learning_rate": 0.0002643, "loss": 2.8125, "step": 1762 }, { "epoch": 1.1675496688741722, "grad_norm": 1.2329332430223954, "learning_rate": 0.00026444999999999995, "loss": 3.0156, "step": 1763 }, { "epoch": 1.1682119205298014, "grad_norm": 1.131044583189528, "learning_rate": 0.0002646, "loss": 2.7656, "step": 1764 }, { "epoch": 1.1688741721854305, "grad_norm": 1.276013354663163, "learning_rate": 0.00026474999999999996, "loss": 2.625, "step": 1765 }, { "epoch": 1.1695364238410595, "grad_norm": 1.2043168266830409, "learning_rate": 0.0002649, "loss": 2.1719, "step": 1766 }, { "epoch": 1.1701986754966887, "grad_norm": 1.305034018992924, "learning_rate": 0.00026504999999999996, "loss": 2.5469, "step": 1767 }, { "epoch": 1.1708609271523178, "grad_norm": 1.049354475796462, "learning_rate": 0.0002652, "loss": 2.3906, "step": 1768 }, { "epoch": 1.171523178807947, "grad_norm": 1.2748653446990472, "learning_rate": 0.00026534999999999997, "loss": 2.7031, "step": 1769 }, { "epoch": 1.1721854304635762, "grad_norm": 1.1656653287452228, "learning_rate": 0.0002655, "loss": 2.6562, "step": 1770 }, { "epoch": 1.1728476821192053, "grad_norm": 1.147529976887952, "learning_rate": 0.00026565, "loss": 2.5938, "step": 1771 }, { "epoch": 1.1735099337748345, "grad_norm": 2.791023841360208, "learning_rate": 0.00026579999999999996, "loss": 2.8125, "step": 1772 }, { "epoch": 1.1741721854304636, "grad_norm": 1.1588687716075585, "learning_rate": 0.00026595, "loss": 2.4375, "step": 1773 }, { "epoch": 1.1748344370860928, "grad_norm": 1.1222842084178932, "learning_rate": 0.00026609999999999996, "loss": 2.5469, "step": 1774 }, { "epoch": 1.1754966887417218, "grad_norm": 1.1452537517746582, "learning_rate": 0.00026624999999999994, "loss": 2.5469, "step": 1775 }, { "epoch": 1.176158940397351, "grad_norm": 1.0870029544867892, "learning_rate": 0.00026639999999999997, "loss": 2.6406, "step": 1776 }, { "epoch": 1.17682119205298, "grad_norm": 1.1280680366200098, "learning_rate": 0.00026654999999999995, "loss": 2.6875, "step": 1777 }, { "epoch": 1.1774834437086092, "grad_norm": 1.3989267118848066, "learning_rate": 0.0002667, "loss": 2.75, "step": 1778 }, { "epoch": 1.1781456953642384, "grad_norm": 1.25831780808853, "learning_rate": 0.00026684999999999995, "loss": 2.875, "step": 1779 }, { "epoch": 1.1788079470198676, "grad_norm": 1.1252858803886931, "learning_rate": 0.000267, "loss": 2.5938, "step": 1780 }, { "epoch": 1.1794701986754967, "grad_norm": 1.324177542464379, "learning_rate": 0.00026714999999999996, "loss": 2.9062, "step": 1781 }, { "epoch": 1.180132450331126, "grad_norm": 1.1241767300947374, "learning_rate": 0.0002673, "loss": 2.4688, "step": 1782 }, { "epoch": 1.180794701986755, "grad_norm": 1.0944027695429912, "learning_rate": 0.00026744999999999997, "loss": 2.4375, "step": 1783 }, { "epoch": 1.181456953642384, "grad_norm": 1.0478072975769979, "learning_rate": 0.0002676, "loss": 2.625, "step": 1784 }, { "epoch": 1.1821192052980132, "grad_norm": 1.1388623333905465, "learning_rate": 0.00026775, "loss": 2.8594, "step": 1785 }, { "epoch": 1.1827814569536423, "grad_norm": 1.0126028834819254, "learning_rate": 0.0002679, "loss": 2.6094, "step": 1786 }, { "epoch": 1.1834437086092715, "grad_norm": 1.1487591291413297, "learning_rate": 0.00026805, "loss": 2.6406, "step": 1787 }, { "epoch": 1.1841059602649007, "grad_norm": 1.0703098414937817, "learning_rate": 0.00026819999999999996, "loss": 2.6562, "step": 1788 }, { "epoch": 1.1847682119205298, "grad_norm": 1.0900945590520366, "learning_rate": 0.00026835, "loss": 2.7188, "step": 1789 }, { "epoch": 1.185430463576159, "grad_norm": 1.231901960880779, "learning_rate": 0.00026849999999999997, "loss": 2.5469, "step": 1790 }, { "epoch": 1.1860927152317882, "grad_norm": 1.1578911679305506, "learning_rate": 0.00026864999999999994, "loss": 2.7812, "step": 1791 }, { "epoch": 1.1867549668874173, "grad_norm": 1.0761349538375902, "learning_rate": 0.0002688, "loss": 2.4844, "step": 1792 }, { "epoch": 1.1874172185430463, "grad_norm": 1.166318238093845, "learning_rate": 0.00026894999999999995, "loss": 2.7656, "step": 1793 }, { "epoch": 1.1880794701986754, "grad_norm": 1.1443551272558634, "learning_rate": 0.0002691, "loss": 2.0781, "step": 1794 }, { "epoch": 1.1887417218543046, "grad_norm": 1.103815135595975, "learning_rate": 0.00026924999999999996, "loss": 2.4844, "step": 1795 }, { "epoch": 1.1894039735099338, "grad_norm": 1.2259929291235874, "learning_rate": 0.0002694, "loss": 2.8125, "step": 1796 }, { "epoch": 1.190066225165563, "grad_norm": 1.0364629181624325, "learning_rate": 0.00026954999999999997, "loss": 2.3594, "step": 1797 }, { "epoch": 1.190728476821192, "grad_norm": 1.2654999942913223, "learning_rate": 0.0002697, "loss": 2.2188, "step": 1798 }, { "epoch": 1.1913907284768213, "grad_norm": 1.1907409607502855, "learning_rate": 0.00026984999999999997, "loss": 2.6875, "step": 1799 }, { "epoch": 1.1920529801324504, "grad_norm": 1.1686414699543444, "learning_rate": 0.00027, "loss": 2.5312, "step": 1800 }, { "epoch": 1.1927152317880796, "grad_norm": 1.1773070773761583, "learning_rate": 0.00027015, "loss": 2.5312, "step": 1801 }, { "epoch": 1.1933774834437085, "grad_norm": 1.1743247615145778, "learning_rate": 0.00027029999999999996, "loss": 2.4375, "step": 1802 }, { "epoch": 1.1940397350993377, "grad_norm": 1.30979549916338, "learning_rate": 0.00027045, "loss": 2.5, "step": 1803 }, { "epoch": 1.1947019867549669, "grad_norm": 1.0878316245196766, "learning_rate": 0.00027059999999999996, "loss": 2.5156, "step": 1804 }, { "epoch": 1.195364238410596, "grad_norm": 1.2648602433390705, "learning_rate": 0.00027074999999999994, "loss": 2.5156, "step": 1805 }, { "epoch": 1.1960264900662252, "grad_norm": 1.2260850181319787, "learning_rate": 0.00027089999999999997, "loss": 2.5781, "step": 1806 }, { "epoch": 1.1966887417218544, "grad_norm": 1.0678250441718888, "learning_rate": 0.00027104999999999995, "loss": 2.4062, "step": 1807 }, { "epoch": 1.1973509933774835, "grad_norm": 1.1413106256524905, "learning_rate": 0.0002712, "loss": 2.6562, "step": 1808 }, { "epoch": 1.1980132450331125, "grad_norm": 1.3364971931152663, "learning_rate": 0.00027134999999999995, "loss": 2.7344, "step": 1809 }, { "epoch": 1.1986754966887416, "grad_norm": 1.072806367865523, "learning_rate": 0.0002715, "loss": 2.5156, "step": 1810 }, { "epoch": 1.1993377483443708, "grad_norm": 1.0494632182021033, "learning_rate": 0.00027164999999999996, "loss": 2.4375, "step": 1811 }, { "epoch": 1.2, "grad_norm": 1.0825141596084191, "learning_rate": 0.0002718, "loss": 2.1875, "step": 1812 }, { "epoch": 1.2006622516556291, "grad_norm": 1.241389117892049, "learning_rate": 0.00027194999999999997, "loss": 2.5312, "step": 1813 }, { "epoch": 1.2013245033112583, "grad_norm": 1.16805167446297, "learning_rate": 0.0002721, "loss": 2.5469, "step": 1814 }, { "epoch": 1.2019867549668874, "grad_norm": 1.1045410633803046, "learning_rate": 0.00027225, "loss": 2.5625, "step": 1815 }, { "epoch": 1.2026490066225166, "grad_norm": 1.1040235397770117, "learning_rate": 0.0002724, "loss": 1.9922, "step": 1816 }, { "epoch": 1.2033112582781458, "grad_norm": 1.0583540769047224, "learning_rate": 0.00027255, "loss": 2.6094, "step": 1817 }, { "epoch": 1.2039735099337747, "grad_norm": 1.1678403807291557, "learning_rate": 0.00027269999999999996, "loss": 2.6562, "step": 1818 }, { "epoch": 1.2046357615894039, "grad_norm": 1.2511723249734839, "learning_rate": 0.00027285, "loss": 2.875, "step": 1819 }, { "epoch": 1.205298013245033, "grad_norm": 1.138811259626744, "learning_rate": 0.00027299999999999997, "loss": 2.5781, "step": 1820 }, { "epoch": 1.2059602649006622, "grad_norm": 1.0547316711985497, "learning_rate": 0.00027314999999999994, "loss": 2.625, "step": 1821 }, { "epoch": 1.2066225165562914, "grad_norm": 1.1177023229551604, "learning_rate": 0.0002733, "loss": 2.4375, "step": 1822 }, { "epoch": 1.2072847682119205, "grad_norm": 1.0351003788206876, "learning_rate": 0.00027344999999999995, "loss": 2.5938, "step": 1823 }, { "epoch": 1.2079470198675497, "grad_norm": 1.0750611256258018, "learning_rate": 0.0002736, "loss": 2.5, "step": 1824 }, { "epoch": 1.2086092715231789, "grad_norm": 1.212251069032006, "learning_rate": 0.00027374999999999996, "loss": 2.6875, "step": 1825 }, { "epoch": 1.209271523178808, "grad_norm": 1.0892714684363998, "learning_rate": 0.0002739, "loss": 2.6562, "step": 1826 }, { "epoch": 1.209933774834437, "grad_norm": 1.045955630657899, "learning_rate": 0.00027404999999999997, "loss": 2.6094, "step": 1827 }, { "epoch": 1.2105960264900661, "grad_norm": 1.0131862121952844, "learning_rate": 0.0002742, "loss": 2.0625, "step": 1828 }, { "epoch": 1.2112582781456953, "grad_norm": 1.0616344338813846, "learning_rate": 0.00027435, "loss": 2.6094, "step": 1829 }, { "epoch": 1.2119205298013245, "grad_norm": 0.9947857381903571, "learning_rate": 0.0002745, "loss": 2.375, "step": 1830 }, { "epoch": 1.2125827814569536, "grad_norm": 1.0467204551351708, "learning_rate": 0.00027465, "loss": 2.375, "step": 1831 }, { "epoch": 1.2132450331125828, "grad_norm": 1.0531189716657996, "learning_rate": 0.0002748, "loss": 2.1875, "step": 1832 }, { "epoch": 1.213907284768212, "grad_norm": 1.0478528868072308, "learning_rate": 0.00027495, "loss": 2.0938, "step": 1833 }, { "epoch": 1.2145695364238411, "grad_norm": 1.114171507747835, "learning_rate": 0.00027509999999999996, "loss": 2.6562, "step": 1834 }, { "epoch": 1.2152317880794703, "grad_norm": 1.4247476459311124, "learning_rate": 0.00027525, "loss": 2.5938, "step": 1835 }, { "epoch": 1.2158940397350992, "grad_norm": 1.0829716627580173, "learning_rate": 0.00027539999999999997, "loss": 2.5625, "step": 1836 }, { "epoch": 1.2165562913907284, "grad_norm": 1.1803416416483359, "learning_rate": 0.00027554999999999995, "loss": 2.625, "step": 1837 }, { "epoch": 1.2172185430463576, "grad_norm": 1.0615957829610734, "learning_rate": 0.0002757, "loss": 2.6094, "step": 1838 }, { "epoch": 1.2178807947019867, "grad_norm": 1.0915048001427294, "learning_rate": 0.00027584999999999996, "loss": 2.7031, "step": 1839 }, { "epoch": 1.218543046357616, "grad_norm": 1.0812911145445732, "learning_rate": 0.000276, "loss": 2.5156, "step": 1840 }, { "epoch": 1.219205298013245, "grad_norm": 1.1531567311431212, "learning_rate": 0.00027614999999999996, "loss": 2.5469, "step": 1841 }, { "epoch": 1.2198675496688742, "grad_norm": 1.1680675093906006, "learning_rate": 0.0002763, "loss": 2.6875, "step": 1842 }, { "epoch": 1.2205298013245034, "grad_norm": 1.0898100247359879, "learning_rate": 0.00027644999999999997, "loss": 2.0312, "step": 1843 }, { "epoch": 1.2211920529801326, "grad_norm": 1.040513833175346, "learning_rate": 0.0002766, "loss": 2.5156, "step": 1844 }, { "epoch": 1.2218543046357615, "grad_norm": 1.103313297572885, "learning_rate": 0.00027675, "loss": 2.75, "step": 1845 }, { "epoch": 1.2225165562913907, "grad_norm": 1.174291331822553, "learning_rate": 0.0002769, "loss": 2.75, "step": 1846 }, { "epoch": 1.2231788079470198, "grad_norm": 1.101086799159118, "learning_rate": 0.00027705, "loss": 2.1094, "step": 1847 }, { "epoch": 1.223841059602649, "grad_norm": 1.076112265393698, "learning_rate": 0.0002772, "loss": 2.1562, "step": 1848 }, { "epoch": 1.2245033112582782, "grad_norm": 1.1576205632588799, "learning_rate": 0.00027735, "loss": 2.6562, "step": 1849 }, { "epoch": 1.2251655629139073, "grad_norm": 1.0103135757337511, "learning_rate": 0.00027749999999999997, "loss": 2.5938, "step": 1850 }, { "epoch": 1.2258278145695365, "grad_norm": 1.2025434183382095, "learning_rate": 0.00027764999999999995, "loss": 2.6719, "step": 1851 }, { "epoch": 1.2264900662251657, "grad_norm": 1.2376545000528463, "learning_rate": 0.0002778, "loss": 2.5469, "step": 1852 }, { "epoch": 1.2271523178807948, "grad_norm": 1.1023399903362123, "learning_rate": 0.00027794999999999995, "loss": 2.8125, "step": 1853 }, { "epoch": 1.2278145695364238, "grad_norm": 1.1365387951273755, "learning_rate": 0.0002781, "loss": 2.5625, "step": 1854 }, { "epoch": 1.228476821192053, "grad_norm": 1.1832089637703747, "learning_rate": 0.00027824999999999996, "loss": 2.6562, "step": 1855 }, { "epoch": 1.229139072847682, "grad_norm": 1.1236279296599083, "learning_rate": 0.0002784, "loss": 2.5, "step": 1856 }, { "epoch": 1.2298013245033113, "grad_norm": 1.0810551324569708, "learning_rate": 0.00027854999999999997, "loss": 2.2031, "step": 1857 }, { "epoch": 1.2304635761589404, "grad_norm": 1.0537838403223088, "learning_rate": 0.0002787, "loss": 2.5469, "step": 1858 }, { "epoch": 1.2311258278145696, "grad_norm": 1.0704641850473486, "learning_rate": 0.00027885, "loss": 2.5156, "step": 1859 }, { "epoch": 1.2317880794701987, "grad_norm": 1.5470857420778297, "learning_rate": 0.000279, "loss": 2.6875, "step": 1860 }, { "epoch": 1.2324503311258277, "grad_norm": 1.0171333371766393, "learning_rate": 0.00027915, "loss": 2.0625, "step": 1861 }, { "epoch": 1.233112582781457, "grad_norm": 1.0813066921819676, "learning_rate": 0.0002793, "loss": 2.5938, "step": 1862 }, { "epoch": 1.233774834437086, "grad_norm": 1.2930044956571096, "learning_rate": 0.00027945, "loss": 3.0312, "step": 1863 }, { "epoch": 1.2344370860927152, "grad_norm": 1.1085237276168365, "learning_rate": 0.00027959999999999997, "loss": 2.6406, "step": 1864 }, { "epoch": 1.2350993377483444, "grad_norm": 1.0685194282931558, "learning_rate": 0.00027975, "loss": 2.6719, "step": 1865 }, { "epoch": 1.2357615894039735, "grad_norm": 1.1148534327997999, "learning_rate": 0.0002799, "loss": 2.375, "step": 1866 }, { "epoch": 1.2364238410596027, "grad_norm": 1.0274912984756877, "learning_rate": 0.00028004999999999995, "loss": 2.75, "step": 1867 }, { "epoch": 1.2370860927152318, "grad_norm": 1.0478849488062358, "learning_rate": 0.0002802, "loss": 2.7656, "step": 1868 }, { "epoch": 1.237748344370861, "grad_norm": 0.9550521220643732, "learning_rate": 0.00028034999999999996, "loss": 2.5625, "step": 1869 }, { "epoch": 1.23841059602649, "grad_norm": 1.0309101286007627, "learning_rate": 0.0002805, "loss": 2.5781, "step": 1870 }, { "epoch": 1.2390728476821191, "grad_norm": 1.0792114337977892, "learning_rate": 0.00028064999999999996, "loss": 2.5781, "step": 1871 }, { "epoch": 1.2397350993377483, "grad_norm": 1.1951294477952854, "learning_rate": 0.0002808, "loss": 2.8438, "step": 1872 }, { "epoch": 1.2403973509933774, "grad_norm": 1.029946520128323, "learning_rate": 0.00028094999999999997, "loss": 2.6562, "step": 1873 }, { "epoch": 1.2410596026490066, "grad_norm": 0.9641967429063424, "learning_rate": 0.0002811, "loss": 2.3438, "step": 1874 }, { "epoch": 1.2417218543046358, "grad_norm": 1.162198009973546, "learning_rate": 0.00028125, "loss": 2.7656, "step": 1875 }, { "epoch": 1.242384105960265, "grad_norm": 1.0442571370476585, "learning_rate": 0.00028139999999999996, "loss": 2.5156, "step": 1876 }, { "epoch": 1.243046357615894, "grad_norm": 1.0398861604629954, "learning_rate": 0.00028155, "loss": 2.5, "step": 1877 }, { "epoch": 1.2437086092715233, "grad_norm": 0.9622858880520134, "learning_rate": 0.00028169999999999996, "loss": 2.1875, "step": 1878 }, { "epoch": 1.2443708609271522, "grad_norm": 1.0660310584231711, "learning_rate": 0.00028185, "loss": 2.6719, "step": 1879 }, { "epoch": 1.2450331125827814, "grad_norm": 1.0188204358198052, "learning_rate": 0.00028199999999999997, "loss": 2.4375, "step": 1880 }, { "epoch": 1.2456953642384105, "grad_norm": 1.1510881996047657, "learning_rate": 0.00028215, "loss": 2.5156, "step": 1881 }, { "epoch": 1.2463576158940397, "grad_norm": 1.0173357767415563, "learning_rate": 0.0002823, "loss": 2.6875, "step": 1882 }, { "epoch": 1.2470198675496689, "grad_norm": 1.1103666033790824, "learning_rate": 0.00028244999999999995, "loss": 2.8281, "step": 1883 }, { "epoch": 1.247682119205298, "grad_norm": 1.051364719679405, "learning_rate": 0.0002826, "loss": 2.5938, "step": 1884 }, { "epoch": 1.2483443708609272, "grad_norm": 0.9689593068921656, "learning_rate": 0.00028274999999999996, "loss": 2.4844, "step": 1885 }, { "epoch": 1.2490066225165564, "grad_norm": 1.0351034112023918, "learning_rate": 0.00028289999999999994, "loss": 2.8906, "step": 1886 }, { "epoch": 1.2496688741721855, "grad_norm": 1.0992985825572599, "learning_rate": 0.00028304999999999997, "loss": 1.9766, "step": 1887 }, { "epoch": 1.2503311258278145, "grad_norm": 1.0478148931367806, "learning_rate": 0.00028319999999999994, "loss": 2.5625, "step": 1888 }, { "epoch": 1.2509933774834436, "grad_norm": 1.0780593475405176, "learning_rate": 0.00028335, "loss": 2.7656, "step": 1889 }, { "epoch": 1.2516556291390728, "grad_norm": 0.9860731831857292, "learning_rate": 0.00028349999999999995, "loss": 2.5312, "step": 1890 }, { "epoch": 1.252317880794702, "grad_norm": 0.9696993325504033, "learning_rate": 0.00028365, "loss": 2.1562, "step": 1891 }, { "epoch": 1.2529801324503311, "grad_norm": 1.077802281550881, "learning_rate": 0.00028379999999999996, "loss": 2.8281, "step": 1892 }, { "epoch": 1.2536423841059603, "grad_norm": 1.0798474477816749, "learning_rate": 0.00028395, "loss": 2.625, "step": 1893 }, { "epoch": 1.2543046357615895, "grad_norm": 1.1323584111272793, "learning_rate": 0.00028409999999999997, "loss": 2.7656, "step": 1894 }, { "epoch": 1.2549668874172186, "grad_norm": 1.114729870633954, "learning_rate": 0.00028425, "loss": 2.4062, "step": 1895 }, { "epoch": 1.2556291390728478, "grad_norm": 1.2209638086839982, "learning_rate": 0.0002844, "loss": 2.0781, "step": 1896 }, { "epoch": 1.2562913907284767, "grad_norm": 1.3049261131915915, "learning_rate": 0.00028455, "loss": 2.7812, "step": 1897 }, { "epoch": 1.256953642384106, "grad_norm": 1.0604805948446407, "learning_rate": 0.0002847, "loss": 2.5781, "step": 1898 }, { "epoch": 1.257615894039735, "grad_norm": 1.1194736129673266, "learning_rate": 0.00028484999999999996, "loss": 2.6719, "step": 1899 }, { "epoch": 1.2582781456953642, "grad_norm": 1.1360705332281893, "learning_rate": 0.000285, "loss": 2.6406, "step": 1900 }, { "epoch": 1.2589403973509934, "grad_norm": 1.168718456941622, "learning_rate": 0.00028514999999999997, "loss": 2.5625, "step": 1901 }, { "epoch": 1.2596026490066226, "grad_norm": 1.0200250638359925, "learning_rate": 0.00028529999999999994, "loss": 2.5469, "step": 1902 }, { "epoch": 1.2602649006622517, "grad_norm": 1.1208829067947539, "learning_rate": 0.00028544999999999997, "loss": 2.1875, "step": 1903 }, { "epoch": 1.2609271523178807, "grad_norm": 1.0525553334609878, "learning_rate": 0.00028559999999999995, "loss": 2.6875, "step": 1904 }, { "epoch": 1.26158940397351, "grad_norm": 1.0604350845208788, "learning_rate": 0.00028575, "loss": 2.5312, "step": 1905 }, { "epoch": 1.262251655629139, "grad_norm": 0.992399333560451, "learning_rate": 0.00028589999999999996, "loss": 2.0938, "step": 1906 }, { "epoch": 1.2629139072847682, "grad_norm": 1.0111889050599285, "learning_rate": 0.00028605, "loss": 2.5938, "step": 1907 }, { "epoch": 1.2635761589403973, "grad_norm": 0.9681452529335135, "learning_rate": 0.00028619999999999996, "loss": 2.2031, "step": 1908 }, { "epoch": 1.2642384105960265, "grad_norm": 1.1407885430181062, "learning_rate": 0.00028635, "loss": 2.7031, "step": 1909 }, { "epoch": 1.2649006622516556, "grad_norm": 1.118260853278201, "learning_rate": 0.00028649999999999997, "loss": 2.7188, "step": 1910 }, { "epoch": 1.2655629139072848, "grad_norm": 1.055249747682395, "learning_rate": 0.00028665, "loss": 2.1875, "step": 1911 }, { "epoch": 1.266225165562914, "grad_norm": 1.1026575482125525, "learning_rate": 0.0002868, "loss": 2.4219, "step": 1912 }, { "epoch": 1.266887417218543, "grad_norm": 1.1169894504203735, "learning_rate": 0.00028694999999999995, "loss": 2.4219, "step": 1913 }, { "epoch": 1.2675496688741723, "grad_norm": 0.9843326738823613, "learning_rate": 0.0002871, "loss": 2.1562, "step": 1914 }, { "epoch": 1.2682119205298013, "grad_norm": 1.2241118134432818, "learning_rate": 0.00028724999999999996, "loss": 3.0469, "step": 1915 }, { "epoch": 1.2688741721854304, "grad_norm": 1.1400944025068844, "learning_rate": 0.00028739999999999994, "loss": 3.0, "step": 1916 }, { "epoch": 1.2695364238410596, "grad_norm": 1.100338586503263, "learning_rate": 0.00028754999999999997, "loss": 2.6875, "step": 1917 }, { "epoch": 1.2701986754966887, "grad_norm": 1.0469257858662373, "learning_rate": 0.00028769999999999995, "loss": 2.3281, "step": 1918 }, { "epoch": 1.270860927152318, "grad_norm": 1.155515575239003, "learning_rate": 0.00028785, "loss": 2.7344, "step": 1919 }, { "epoch": 1.271523178807947, "grad_norm": 1.0409552474106014, "learning_rate": 0.00028799999999999995, "loss": 2.0938, "step": 1920 }, { "epoch": 1.2721854304635762, "grad_norm": 1.03696624615095, "learning_rate": 0.00028815, "loss": 2.5, "step": 1921 }, { "epoch": 1.2728476821192052, "grad_norm": 1.1270970282997816, "learning_rate": 0.00028829999999999996, "loss": 2.1719, "step": 1922 }, { "epoch": 1.2735099337748346, "grad_norm": 1.2352258964071394, "learning_rate": 0.00028845, "loss": 2.7344, "step": 1923 }, { "epoch": 1.2741721854304635, "grad_norm": 1.1379863833456227, "learning_rate": 0.00028859999999999997, "loss": 2.4531, "step": 1924 }, { "epoch": 1.2748344370860927, "grad_norm": 1.3774584956460922, "learning_rate": 0.00028875, "loss": 2.2031, "step": 1925 }, { "epoch": 1.2754966887417218, "grad_norm": 1.0019107181300786, "learning_rate": 0.0002889, "loss": 2.2031, "step": 1926 }, { "epoch": 1.276158940397351, "grad_norm": 1.1805687541288026, "learning_rate": 0.00028905, "loss": 2.6562, "step": 1927 }, { "epoch": 1.2768211920529802, "grad_norm": 1.0791646104057475, "learning_rate": 0.0002892, "loss": 2.5625, "step": 1928 }, { "epoch": 1.2774834437086093, "grad_norm": 1.0959949474657371, "learning_rate": 0.00028934999999999996, "loss": 2.0625, "step": 1929 }, { "epoch": 1.2781456953642385, "grad_norm": 1.0354366485198145, "learning_rate": 0.0002895, "loss": 2.0938, "step": 1930 }, { "epoch": 1.2788079470198674, "grad_norm": 1.1501986369387915, "learning_rate": 0.00028964999999999997, "loss": 2.5, "step": 1931 }, { "epoch": 1.2794701986754966, "grad_norm": 1.0645195298901111, "learning_rate": 0.00028979999999999994, "loss": 2.5469, "step": 1932 }, { "epoch": 1.2801324503311258, "grad_norm": 1.2568649516174393, "learning_rate": 0.00028995, "loss": 2.6562, "step": 1933 }, { "epoch": 1.280794701986755, "grad_norm": 1.0184071562805015, "learning_rate": 0.00029009999999999995, "loss": 2.5781, "step": 1934 }, { "epoch": 1.281456953642384, "grad_norm": 0.9511248085741806, "learning_rate": 0.00029025, "loss": 2.5469, "step": 1935 }, { "epoch": 1.2821192052980133, "grad_norm": 1.0350583274553433, "learning_rate": 0.00029039999999999996, "loss": 2.4531, "step": 1936 }, { "epoch": 1.2827814569536424, "grad_norm": 1.1329353201398, "learning_rate": 0.00029055, "loss": 2.7031, "step": 1937 }, { "epoch": 1.2834437086092716, "grad_norm": 1.0894534422556255, "learning_rate": 0.00029069999999999996, "loss": 2.5938, "step": 1938 }, { "epoch": 1.2841059602649008, "grad_norm": 1.0967781765222333, "learning_rate": 0.00029085, "loss": 2.6719, "step": 1939 }, { "epoch": 1.2847682119205297, "grad_norm": 1.2096398116942162, "learning_rate": 0.00029099999999999997, "loss": 2.375, "step": 1940 }, { "epoch": 1.2854304635761589, "grad_norm": 1.036185112772001, "learning_rate": 0.00029115, "loss": 2.6719, "step": 1941 }, { "epoch": 1.286092715231788, "grad_norm": 1.0788777078109564, "learning_rate": 0.0002913, "loss": 2.6406, "step": 1942 }, { "epoch": 1.2867549668874172, "grad_norm": 1.2141745514818192, "learning_rate": 0.00029145, "loss": 2.6562, "step": 1943 }, { "epoch": 1.2874172185430464, "grad_norm": 1.1644640957341408, "learning_rate": 0.0002916, "loss": 2.5938, "step": 1944 }, { "epoch": 1.2880794701986755, "grad_norm": 1.0881825031520775, "learning_rate": 0.00029174999999999996, "loss": 2.7969, "step": 1945 }, { "epoch": 1.2887417218543047, "grad_norm": 1.1861048268415086, "learning_rate": 0.0002919, "loss": 2.5781, "step": 1946 }, { "epoch": 1.2894039735099339, "grad_norm": 1.126688345733708, "learning_rate": 0.00029204999999999997, "loss": 2.5, "step": 1947 }, { "epoch": 1.290066225165563, "grad_norm": 0.9650983365409912, "learning_rate": 0.00029219999999999995, "loss": 2.5469, "step": 1948 }, { "epoch": 1.290728476821192, "grad_norm": 1.1985797671040255, "learning_rate": 0.00029235, "loss": 2.2031, "step": 1949 }, { "epoch": 1.2913907284768211, "grad_norm": 1.3279822585227978, "learning_rate": 0.00029249999999999995, "loss": 2.8281, "step": 1950 }, { "epoch": 1.2920529801324503, "grad_norm": 1.109824070413775, "learning_rate": 0.00029265, "loss": 2.7969, "step": 1951 }, { "epoch": 1.2927152317880795, "grad_norm": 1.2355286789036353, "learning_rate": 0.00029279999999999996, "loss": 2.5781, "step": 1952 }, { "epoch": 1.2933774834437086, "grad_norm": 0.9902368225212206, "learning_rate": 0.00029295, "loss": 2.5469, "step": 1953 }, { "epoch": 1.2940397350993378, "grad_norm": 1.2500663286055769, "learning_rate": 0.00029309999999999997, "loss": 2.8281, "step": 1954 }, { "epoch": 1.294701986754967, "grad_norm": 1.1390262695226614, "learning_rate": 0.00029325, "loss": 2.6719, "step": 1955 }, { "epoch": 1.295364238410596, "grad_norm": 1.0569373115836096, "learning_rate": 0.0002934, "loss": 2.6094, "step": 1956 }, { "epoch": 1.2960264900662253, "grad_norm": 1.121002100150029, "learning_rate": 0.00029355, "loss": 2.6094, "step": 1957 }, { "epoch": 1.2966887417218542, "grad_norm": 1.1086242686110153, "learning_rate": 0.0002937, "loss": 2.5469, "step": 1958 }, { "epoch": 1.2973509933774834, "grad_norm": 1.3535454084609597, "learning_rate": 0.00029384999999999996, "loss": 2.7188, "step": 1959 }, { "epoch": 1.2980132450331126, "grad_norm": 39.181124657694724, "learning_rate": 0.000294, "loss": 3.1875, "step": 1960 }, { "epoch": 1.2986754966887417, "grad_norm": 0.9259981151085336, "learning_rate": 0.00029414999999999997, "loss": 2.4844, "step": 1961 }, { "epoch": 1.2993377483443709, "grad_norm": 1.0720675503395423, "learning_rate": 0.00029429999999999994, "loss": 2.8125, "step": 1962 }, { "epoch": 1.3, "grad_norm": 0.9875026945065355, "learning_rate": 0.00029445, "loss": 2.3281, "step": 1963 }, { "epoch": 1.3006622516556292, "grad_norm": 1.0922421253948331, "learning_rate": 0.00029459999999999995, "loss": 2.625, "step": 1964 }, { "epoch": 1.3013245033112582, "grad_norm": 1.116007377266788, "learning_rate": 0.00029475, "loss": 2.4062, "step": 1965 }, { "epoch": 1.3019867549668875, "grad_norm": 1.0548331634043413, "learning_rate": 0.00029489999999999996, "loss": 2.5, "step": 1966 }, { "epoch": 1.3026490066225165, "grad_norm": 0.9782879048981354, "learning_rate": 0.00029505, "loss": 2.4062, "step": 1967 }, { "epoch": 1.3033112582781456, "grad_norm": 1.056137184337501, "learning_rate": 0.00029519999999999997, "loss": 2.5938, "step": 1968 }, { "epoch": 1.3039735099337748, "grad_norm": 1.0941836673266787, "learning_rate": 0.00029535, "loss": 2.6562, "step": 1969 }, { "epoch": 1.304635761589404, "grad_norm": 1.1532094349044157, "learning_rate": 0.00029549999999999997, "loss": 2.6406, "step": 1970 }, { "epoch": 1.3052980132450331, "grad_norm": 1.0131475739399138, "learning_rate": 0.00029565, "loss": 2.1875, "step": 1971 }, { "epoch": 1.3059602649006623, "grad_norm": 1.147507884991115, "learning_rate": 0.0002958, "loss": 2.7188, "step": 1972 }, { "epoch": 1.3066225165562915, "grad_norm": 1.1045130325774761, "learning_rate": 0.00029595, "loss": 2.6094, "step": 1973 }, { "epoch": 1.3072847682119204, "grad_norm": 1.0895706669188263, "learning_rate": 0.0002961, "loss": 2.5938, "step": 1974 }, { "epoch": 1.3079470198675498, "grad_norm": 1.0713927376251628, "learning_rate": 0.00029624999999999996, "loss": 2.6875, "step": 1975 }, { "epoch": 1.3086092715231787, "grad_norm": 1.0336689741676732, "learning_rate": 0.0002964, "loss": 2.4844, "step": 1976 }, { "epoch": 1.309271523178808, "grad_norm": 1.0818326941357388, "learning_rate": 0.00029654999999999997, "loss": 2.2188, "step": 1977 }, { "epoch": 1.309933774834437, "grad_norm": 1.1595925386010197, "learning_rate": 0.00029669999999999995, "loss": 2.4375, "step": 1978 }, { "epoch": 1.3105960264900662, "grad_norm": 1.0076444919530783, "learning_rate": 0.00029685, "loss": 2.5469, "step": 1979 }, { "epoch": 1.3112582781456954, "grad_norm": 1.1021443681044725, "learning_rate": 0.00029699999999999996, "loss": 2.7188, "step": 1980 }, { "epoch": 1.3119205298013246, "grad_norm": 1.2742319205681256, "learning_rate": 0.00029715, "loss": 2.125, "step": 1981 }, { "epoch": 1.3125827814569537, "grad_norm": 1.1548730432150276, "learning_rate": 0.00029729999999999996, "loss": 2.4688, "step": 1982 }, { "epoch": 1.3132450331125827, "grad_norm": 1.0808407989829563, "learning_rate": 0.00029745, "loss": 2.1719, "step": 1983 }, { "epoch": 1.313907284768212, "grad_norm": 2.6683188367977073, "learning_rate": 0.00029759999999999997, "loss": 2.7969, "step": 1984 }, { "epoch": 1.314569536423841, "grad_norm": 1.1339643271225937, "learning_rate": 0.00029775, "loss": 2.4531, "step": 1985 }, { "epoch": 1.3152317880794702, "grad_norm": 1.1007110203489667, "learning_rate": 0.0002979, "loss": 2.4219, "step": 1986 }, { "epoch": 1.3158940397350993, "grad_norm": 1.050090083826493, "learning_rate": 0.00029805, "loss": 2.4688, "step": 1987 }, { "epoch": 1.3165562913907285, "grad_norm": 1.0532719772266244, "learning_rate": 0.0002982, "loss": 2.375, "step": 1988 }, { "epoch": 1.3172185430463577, "grad_norm": 1.0148735207987534, "learning_rate": 0.00029835, "loss": 2.5, "step": 1989 }, { "epoch": 1.3178807947019868, "grad_norm": 1.3468871824253617, "learning_rate": 0.0002985, "loss": 2.5156, "step": 1990 }, { "epoch": 1.318543046357616, "grad_norm": 1.0513339303068943, "learning_rate": 0.00029864999999999997, "loss": 2.4219, "step": 1991 }, { "epoch": 1.319205298013245, "grad_norm": 1.1553184355427268, "learning_rate": 0.0002988, "loss": 2.75, "step": 1992 }, { "epoch": 1.319867549668874, "grad_norm": 1.1858219448525618, "learning_rate": 0.00029895, "loss": 2.7812, "step": 1993 }, { "epoch": 1.3205298013245033, "grad_norm": 1.093980672612447, "learning_rate": 0.00029909999999999995, "loss": 1.9844, "step": 1994 }, { "epoch": 1.3211920529801324, "grad_norm": 1.1947924271549226, "learning_rate": 0.00029925, "loss": 2.5938, "step": 1995 }, { "epoch": 1.3218543046357616, "grad_norm": 1.0875267890028453, "learning_rate": 0.00029939999999999996, "loss": 2.5625, "step": 1996 }, { "epoch": 1.3225165562913908, "grad_norm": 1.2949422085707263, "learning_rate": 0.00029955, "loss": 2.6406, "step": 1997 }, { "epoch": 1.32317880794702, "grad_norm": 1.0739303521176613, "learning_rate": 0.00029969999999999997, "loss": 2.5938, "step": 1998 }, { "epoch": 1.323841059602649, "grad_norm": 1.1677531520886069, "learning_rate": 0.00029985, "loss": 2.4531, "step": 1999 }, { "epoch": 1.3245033112582782, "grad_norm": 0.9897328569294896, "learning_rate": 0.0003, "loss": 2.4688, "step": 2000 }, { "epoch": 1.3251655629139072, "grad_norm": 0.9106212163888883, "learning_rate": 0.0002999999851491401, "loss": 2.3438, "step": 2001 }, { "epoch": 1.3258278145695364, "grad_norm": 1.054385263828949, "learning_rate": 0.00029999994059656354, "loss": 2.6094, "step": 2002 }, { "epoch": 1.3264900662251655, "grad_norm": 0.9827580505608354, "learning_rate": 0.000299999866342279, "loss": 2.5156, "step": 2003 }, { "epoch": 1.3271523178807947, "grad_norm": 1.0135124480904556, "learning_rate": 0.00029999976238630124, "loss": 2.375, "step": 2004 }, { "epoch": 1.3278145695364238, "grad_norm": 1.0921632137293598, "learning_rate": 0.00029999962872865086, "loss": 2.4531, "step": 2005 }, { "epoch": 1.328476821192053, "grad_norm": 1.0284942774506889, "learning_rate": 0.00029999946536935423, "loss": 2.5469, "step": 2006 }, { "epoch": 1.3291390728476822, "grad_norm": 0.9660339299106807, "learning_rate": 0.0002999992723084438, "loss": 2.7188, "step": 2007 }, { "epoch": 1.3298013245033111, "grad_norm": 1.1351336909668122, "learning_rate": 0.00029999904954595784, "loss": 2.7969, "step": 2008 }, { "epoch": 1.3304635761589405, "grad_norm": 1.2793949440623427, "learning_rate": 0.0002999987970819403, "loss": 2.75, "step": 2009 }, { "epoch": 1.3311258278145695, "grad_norm": 1.0471921414360634, "learning_rate": 0.0002999985149164413, "loss": 2.0156, "step": 2010 }, { "epoch": 1.3317880794701986, "grad_norm": 0.9722668965549524, "learning_rate": 0.00029999820304951665, "loss": 2.5781, "step": 2011 }, { "epoch": 1.3324503311258278, "grad_norm": 1.1012710706926796, "learning_rate": 0.00029999786148122813, "loss": 2.5312, "step": 2012 }, { "epoch": 1.333112582781457, "grad_norm": 1.0511107087686171, "learning_rate": 0.00029999749021164334, "loss": 2.5156, "step": 2013 }, { "epoch": 1.333774834437086, "grad_norm": 1.0997450493383003, "learning_rate": 0.0002999970892408359, "loss": 2.3438, "step": 2014 }, { "epoch": 1.3344370860927153, "grad_norm": 0.9934301707366604, "learning_rate": 0.0002999966585688851, "loss": 2.4062, "step": 2015 }, { "epoch": 1.3350993377483444, "grad_norm": 1.0130746712865908, "learning_rate": 0.0002999961981958762, "loss": 2.6562, "step": 2016 }, { "epoch": 1.3357615894039734, "grad_norm": 0.979423502669621, "learning_rate": 0.0002999957081219005, "loss": 2.3906, "step": 2017 }, { "epoch": 1.3364238410596028, "grad_norm": 0.977531539824706, "learning_rate": 0.0002999951883470548, "loss": 2.5625, "step": 2018 }, { "epoch": 1.3370860927152317, "grad_norm": 1.0364533471056445, "learning_rate": 0.0002999946388714423, "loss": 2.4219, "step": 2019 }, { "epoch": 1.3377483443708609, "grad_norm": 1.0808251568241538, "learning_rate": 0.0002999940596951716, "loss": 2.5938, "step": 2020 }, { "epoch": 1.33841059602649, "grad_norm": 1.1719723977286878, "learning_rate": 0.0002999934508183575, "loss": 2.75, "step": 2021 }, { "epoch": 1.3390728476821192, "grad_norm": 1.1025455427458621, "learning_rate": 0.0002999928122411205, "loss": 2.0312, "step": 2022 }, { "epoch": 1.3397350993377484, "grad_norm": 0.9910353888871356, "learning_rate": 0.00029999214396358705, "loss": 2.0156, "step": 2023 }, { "epoch": 1.3403973509933775, "grad_norm": 0.9977347497991871, "learning_rate": 0.0002999914459858895, "loss": 2.5469, "step": 2024 }, { "epoch": 1.3410596026490067, "grad_norm": 1.0804623662503163, "learning_rate": 0.0002999907183081661, "loss": 2.5938, "step": 2025 }, { "epoch": 1.3417218543046356, "grad_norm": 1.40944556666985, "learning_rate": 0.0002999899609305608, "loss": 2.7031, "step": 2026 }, { "epoch": 1.342384105960265, "grad_norm": 0.9921557939340224, "learning_rate": 0.00029998917385322366, "loss": 2.5781, "step": 2027 }, { "epoch": 1.343046357615894, "grad_norm": 1.1239899099975554, "learning_rate": 0.00029998835707631063, "loss": 2.7188, "step": 2028 }, { "epoch": 1.3437086092715231, "grad_norm": 1.0518251072319211, "learning_rate": 0.00029998751059998323, "loss": 2.9062, "step": 2029 }, { "epoch": 1.3443708609271523, "grad_norm": 1.0585330315514934, "learning_rate": 0.00029998663442440924, "loss": 2.3125, "step": 2030 }, { "epoch": 1.3450331125827815, "grad_norm": 0.9025844197740192, "learning_rate": 0.0002999857285497621, "loss": 2.1875, "step": 2031 }, { "epoch": 1.3456953642384106, "grad_norm": 1.1210696515198164, "learning_rate": 0.00029998479297622114, "loss": 2.5625, "step": 2032 }, { "epoch": 1.3463576158940398, "grad_norm": 1.0952408376390268, "learning_rate": 0.00029998382770397164, "loss": 2.625, "step": 2033 }, { "epoch": 1.347019867549669, "grad_norm": 1.0601953619614632, "learning_rate": 0.0002999828327332048, "loss": 2.6562, "step": 2034 }, { "epoch": 1.347682119205298, "grad_norm": 1.056257914944039, "learning_rate": 0.0002999818080641175, "loss": 2.625, "step": 2035 }, { "epoch": 1.3483443708609273, "grad_norm": 1.002870581309078, "learning_rate": 0.0002999807536969128, "loss": 2.5938, "step": 2036 }, { "epoch": 1.3490066225165562, "grad_norm": 1.132751865514166, "learning_rate": 0.00029997966963179936, "loss": 2.5, "step": 2037 }, { "epoch": 1.3496688741721854, "grad_norm": 0.9273223497681905, "learning_rate": 0.00029997855586899185, "loss": 2.2344, "step": 2038 }, { "epoch": 1.3503311258278146, "grad_norm": 1.0183091582389598, "learning_rate": 0.00029997741240871085, "loss": 2.4062, "step": 2039 }, { "epoch": 1.3509933774834437, "grad_norm": 0.9648463263310283, "learning_rate": 0.0002999762392511828, "loss": 2.4688, "step": 2040 }, { "epoch": 1.351655629139073, "grad_norm": 1.0817590367134078, "learning_rate": 0.0002999750363966399, "loss": 2.4375, "step": 2041 }, { "epoch": 1.352317880794702, "grad_norm": 1.026529476359781, "learning_rate": 0.0002999738038453204, "loss": 2.7344, "step": 2042 }, { "epoch": 1.3529801324503312, "grad_norm": 0.9650344341618357, "learning_rate": 0.0002999725415974684, "loss": 2.4688, "step": 2043 }, { "epoch": 1.3536423841059602, "grad_norm": 0.9349776003181619, "learning_rate": 0.0002999712496533337, "loss": 2.375, "step": 2044 }, { "epoch": 1.3543046357615893, "grad_norm": 1.122369921650896, "learning_rate": 0.0002999699280131723, "loss": 2.6406, "step": 2045 }, { "epoch": 1.3549668874172185, "grad_norm": 0.9545700947631062, "learning_rate": 0.00029996857667724574, "loss": 2.3125, "step": 2046 }, { "epoch": 1.3556291390728477, "grad_norm": 1.104748855317723, "learning_rate": 0.00029996719564582164, "loss": 2.7656, "step": 2047 }, { "epoch": 1.3562913907284768, "grad_norm": 1.002662558031696, "learning_rate": 0.0002999657849191735, "loss": 2.375, "step": 2048 }, { "epoch": 1.356953642384106, "grad_norm": 1.0260077076481458, "learning_rate": 0.0002999643444975807, "loss": 2.4219, "step": 2049 }, { "epoch": 1.3576158940397351, "grad_norm": 0.9882912082533333, "learning_rate": 0.0002999628743813283, "loss": 2.1094, "step": 2050 }, { "epoch": 1.3582781456953643, "grad_norm": 1.1265108467549234, "learning_rate": 0.00029996137457070763, "loss": 2.5312, "step": 2051 }, { "epoch": 1.3589403973509935, "grad_norm": 1.3069110943498685, "learning_rate": 0.00029995984506601546, "loss": 2.8438, "step": 2052 }, { "epoch": 1.3596026490066224, "grad_norm": 1.1169943828318265, "learning_rate": 0.00029995828586755474, "loss": 2.5469, "step": 2053 }, { "epoch": 1.3602649006622516, "grad_norm": 1.24683348164736, "learning_rate": 0.0002999566969756342, "loss": 2.3906, "step": 2054 }, { "epoch": 1.3609271523178808, "grad_norm": 1.3264826271754002, "learning_rate": 0.00029995507839056853, "loss": 2.5625, "step": 2055 }, { "epoch": 1.36158940397351, "grad_norm": 0.9310865908412236, "learning_rate": 0.00029995343011267805, "loss": 2.5312, "step": 2056 }, { "epoch": 1.362251655629139, "grad_norm": 1.0657039094623508, "learning_rate": 0.0002999517521422894, "loss": 2.7031, "step": 2057 }, { "epoch": 1.3629139072847682, "grad_norm": 1.0238749959680067, "learning_rate": 0.00029995004447973457, "loss": 2.4375, "step": 2058 }, { "epoch": 1.3635761589403974, "grad_norm": 1.156791465048237, "learning_rate": 0.00029994830712535176, "loss": 2.6719, "step": 2059 }, { "epoch": 1.3642384105960264, "grad_norm": 0.9984995218019491, "learning_rate": 0.00029994654007948514, "loss": 2.3125, "step": 2060 }, { "epoch": 1.3649006622516557, "grad_norm": 1.0928268740664213, "learning_rate": 0.00029994474334248453, "loss": 2.5781, "step": 2061 }, { "epoch": 1.3655629139072847, "grad_norm": 1.0765424489806634, "learning_rate": 0.0002999429169147056, "loss": 2.6875, "step": 2062 }, { "epoch": 1.3662251655629138, "grad_norm": 0.9437256024088053, "learning_rate": 0.0002999410607965101, "loss": 2.4219, "step": 2063 }, { "epoch": 1.366887417218543, "grad_norm": 1.0676649742268547, "learning_rate": 0.0002999391749882655, "loss": 2.2656, "step": 2064 }, { "epoch": 1.3675496688741722, "grad_norm": 1.1699558002606203, "learning_rate": 0.00029993725949034535, "loss": 2.7031, "step": 2065 }, { "epoch": 1.3682119205298013, "grad_norm": 1.2963926717576646, "learning_rate": 0.0002999353143031288, "loss": 2.625, "step": 2066 }, { "epoch": 1.3688741721854305, "grad_norm": 0.9424376115189346, "learning_rate": 0.00029993333942700105, "loss": 2.6094, "step": 2067 }, { "epoch": 1.3695364238410597, "grad_norm": 1.0281093363124507, "learning_rate": 0.00029993133486235315, "loss": 2.6875, "step": 2068 }, { "epoch": 1.3701986754966886, "grad_norm": 0.9722275326151685, "learning_rate": 0.0002999293006095821, "loss": 2.5625, "step": 2069 }, { "epoch": 1.370860927152318, "grad_norm": 1.011360433036984, "learning_rate": 0.00029992723666909065, "loss": 2.75, "step": 2070 }, { "epoch": 1.371523178807947, "grad_norm": 1.1275751769331597, "learning_rate": 0.0002999251430412874, "loss": 2.4375, "step": 2071 }, { "epoch": 1.372185430463576, "grad_norm": 1.0648901210761295, "learning_rate": 0.000299923019726587, "loss": 2.5469, "step": 2072 }, { "epoch": 1.3728476821192053, "grad_norm": 1.042670875014918, "learning_rate": 0.00029992086672540994, "loss": 2.6719, "step": 2073 }, { "epoch": 1.3735099337748344, "grad_norm": 1.2108186499996882, "learning_rate": 0.0002999186840381825, "loss": 2.75, "step": 2074 }, { "epoch": 1.3741721854304636, "grad_norm": 1.0190419555836847, "learning_rate": 0.0002999164716653368, "loss": 2.6562, "step": 2075 }, { "epoch": 1.3748344370860928, "grad_norm": 0.9335672354134906, "learning_rate": 0.00029991422960731106, "loss": 2.375, "step": 2076 }, { "epoch": 1.375496688741722, "grad_norm": 0.9950280715301916, "learning_rate": 0.00029991195786454905, "loss": 2.4844, "step": 2077 }, { "epoch": 1.3761589403973509, "grad_norm": 1.0306797800037197, "learning_rate": 0.0002999096564375007, "loss": 2.625, "step": 2078 }, { "epoch": 1.3768211920529803, "grad_norm": 0.9315985494953134, "learning_rate": 0.0002999073253266218, "loss": 2.5938, "step": 2079 }, { "epoch": 1.3774834437086092, "grad_norm": 1.0285396108311404, "learning_rate": 0.00029990496453237376, "loss": 2.6094, "step": 2080 }, { "epoch": 1.3781456953642384, "grad_norm": 1.058219687594806, "learning_rate": 0.0002999025740552242, "loss": 2.0625, "step": 2081 }, { "epoch": 1.3788079470198675, "grad_norm": 1.108192494461978, "learning_rate": 0.00029990015389564636, "loss": 2.6719, "step": 2082 }, { "epoch": 1.3794701986754967, "grad_norm": 0.9928633586645363, "learning_rate": 0.00029989770405411956, "loss": 2.5625, "step": 2083 }, { "epoch": 1.3801324503311259, "grad_norm": 1.026506255561202, "learning_rate": 0.00029989522453112877, "loss": 2.4219, "step": 2084 }, { "epoch": 1.380794701986755, "grad_norm": 1.1097359745458428, "learning_rate": 0.000299892715327165, "loss": 2.6094, "step": 2085 }, { "epoch": 1.3814569536423842, "grad_norm": 1.197333947763301, "learning_rate": 0.0002998901764427252, "loss": 2.0625, "step": 2086 }, { "epoch": 1.3821192052980131, "grad_norm": 0.9703797561813698, "learning_rate": 0.00029988760787831207, "loss": 2.5, "step": 2087 }, { "epoch": 1.3827814569536425, "grad_norm": 1.0829678223300725, "learning_rate": 0.0002998850096344341, "loss": 2.4844, "step": 2088 }, { "epoch": 1.3834437086092715, "grad_norm": 0.9875225963662323, "learning_rate": 0.00029988238171160586, "loss": 2.5781, "step": 2089 }, { "epoch": 1.3841059602649006, "grad_norm": 0.9181551745807098, "learning_rate": 0.0002998797241103477, "loss": 2.1406, "step": 2090 }, { "epoch": 1.3847682119205298, "grad_norm": 1.0793421842263742, "learning_rate": 0.0002998770368311859, "loss": 2.625, "step": 2091 }, { "epoch": 1.385430463576159, "grad_norm": 1.0202697923864872, "learning_rate": 0.0002998743198746524, "loss": 2.3594, "step": 2092 }, { "epoch": 1.3860927152317881, "grad_norm": 1.0085313654720545, "learning_rate": 0.00029987157324128544, "loss": 2.6719, "step": 2093 }, { "epoch": 1.3867549668874173, "grad_norm": 0.9682317825838836, "learning_rate": 0.0002998687969316287, "loss": 2.5625, "step": 2094 }, { "epoch": 1.3874172185430464, "grad_norm": 1.0247628071903343, "learning_rate": 0.000299865990946232, "loss": 2.6719, "step": 2095 }, { "epoch": 1.3880794701986754, "grad_norm": 0.95291044958702, "learning_rate": 0.0002998631552856509, "loss": 2.4844, "step": 2096 }, { "epoch": 1.3887417218543046, "grad_norm": 1.635911671151028, "learning_rate": 0.000299860289950447, "loss": 2.625, "step": 2097 }, { "epoch": 1.3894039735099337, "grad_norm": 1.0380295247735245, "learning_rate": 0.0002998573949411875, "loss": 2.5, "step": 2098 }, { "epoch": 1.3900662251655629, "grad_norm": 1.0723657652729095, "learning_rate": 0.0002998544702584458, "loss": 2.6406, "step": 2099 }, { "epoch": 1.390728476821192, "grad_norm": 1.0909682795165, "learning_rate": 0.00029985151590280096, "loss": 2.7031, "step": 2100 }, { "epoch": 1.3913907284768212, "grad_norm": 1.132269107255355, "learning_rate": 0.000299848531874838, "loss": 2.5312, "step": 2101 }, { "epoch": 1.3920529801324504, "grad_norm": 1.0424865897875386, "learning_rate": 0.00029984551817514773, "loss": 2.5625, "step": 2102 }, { "epoch": 1.3927152317880795, "grad_norm": 1.215998969244023, "learning_rate": 0.0002998424748043269, "loss": 2.6094, "step": 2103 }, { "epoch": 1.3933774834437087, "grad_norm": 0.9827945155801082, "learning_rate": 0.0002998394017629783, "loss": 2.4375, "step": 2104 }, { "epoch": 1.3940397350993377, "grad_norm": 0.9744273230170389, "learning_rate": 0.00029983629905171015, "loss": 2.5469, "step": 2105 }, { "epoch": 1.3947019867549668, "grad_norm": 1.1271484555081328, "learning_rate": 0.00029983316667113707, "loss": 2.6094, "step": 2106 }, { "epoch": 1.395364238410596, "grad_norm": 1.096239640373542, "learning_rate": 0.00029983000462187917, "loss": 2.8281, "step": 2107 }, { "epoch": 1.3960264900662251, "grad_norm": 1.0393770138040104, "learning_rate": 0.0002998268129045626, "loss": 2.7812, "step": 2108 }, { "epoch": 1.3966887417218543, "grad_norm": 1.009495811360415, "learning_rate": 0.0002998235915198194, "loss": 2.7344, "step": 2109 }, { "epoch": 1.3973509933774835, "grad_norm": 1.024976689277755, "learning_rate": 0.0002998203404682874, "loss": 2.2812, "step": 2110 }, { "epoch": 1.3980132450331126, "grad_norm": 1.0709768509277813, "learning_rate": 0.00029981705975061035, "loss": 2.5625, "step": 2111 }, { "epoch": 1.3986754966887418, "grad_norm": 0.9189981798877973, "learning_rate": 0.00029981374936743785, "loss": 2.375, "step": 2112 }, { "epoch": 1.399337748344371, "grad_norm": 1.1019402390564017, "learning_rate": 0.00029981040931942543, "loss": 2.8125, "step": 2113 }, { "epoch": 1.4, "grad_norm": 0.9162996994941862, "learning_rate": 0.00029980703960723445, "loss": 2.3125, "step": 2114 }, { "epoch": 1.400662251655629, "grad_norm": 1.0926442260991014, "learning_rate": 0.0002998036402315322, "loss": 2.8594, "step": 2115 }, { "epoch": 1.4013245033112582, "grad_norm": 1.0555369624437276, "learning_rate": 0.00029980021119299167, "loss": 2.6562, "step": 2116 }, { "epoch": 1.4019867549668874, "grad_norm": 0.9092918659556374, "learning_rate": 0.0002997967524922919, "loss": 2.0, "step": 2117 }, { "epoch": 1.4026490066225166, "grad_norm": 1.0665436080451218, "learning_rate": 0.00029979326413011786, "loss": 2.5781, "step": 2118 }, { "epoch": 1.4033112582781457, "grad_norm": 1.011513565088108, "learning_rate": 0.0002997897461071601, "loss": 2.6562, "step": 2119 }, { "epoch": 1.403973509933775, "grad_norm": 1.0477597468168856, "learning_rate": 0.00029978619842411543, "loss": 2.7969, "step": 2120 }, { "epoch": 1.4046357615894038, "grad_norm": 0.88325777130976, "learning_rate": 0.00029978262108168615, "loss": 2.4219, "step": 2121 }, { "epoch": 1.4052980132450332, "grad_norm": 1.0496082092803243, "learning_rate": 0.00029977901408058074, "loss": 2.5938, "step": 2122 }, { "epoch": 1.4059602649006622, "grad_norm": 0.973819815876037, "learning_rate": 0.0002997753774215133, "loss": 2.7031, "step": 2123 }, { "epoch": 1.4066225165562913, "grad_norm": 0.9961371582182782, "learning_rate": 0.0002997717111052041, "loss": 2.4688, "step": 2124 }, { "epoch": 1.4072847682119205, "grad_norm": 0.8927419266461215, "learning_rate": 0.000299768015132379, "loss": 2.1875, "step": 2125 }, { "epoch": 1.4079470198675497, "grad_norm": 0.8993635239288216, "learning_rate": 0.00029976428950376986, "loss": 2.5156, "step": 2126 }, { "epoch": 1.4086092715231788, "grad_norm": 1.1536660343448664, "learning_rate": 0.0002997605342201144, "loss": 2.5625, "step": 2127 }, { "epoch": 1.409271523178808, "grad_norm": 0.9935687797992114, "learning_rate": 0.00029975674928215627, "loss": 2.4531, "step": 2128 }, { "epoch": 1.4099337748344372, "grad_norm": 1.1328804926151956, "learning_rate": 0.00029975293469064484, "loss": 2.8594, "step": 2129 }, { "epoch": 1.410596026490066, "grad_norm": 1.0656844295547943, "learning_rate": 0.00029974909044633546, "loss": 2.625, "step": 2130 }, { "epoch": 1.4112582781456955, "grad_norm": 0.9407947356703492, "learning_rate": 0.0002997452165499894, "loss": 2.3594, "step": 2131 }, { "epoch": 1.4119205298013244, "grad_norm": 0.9536151602500974, "learning_rate": 0.00029974131300237366, "loss": 2.4844, "step": 2132 }, { "epoch": 1.4125827814569536, "grad_norm": 0.9848676103028161, "learning_rate": 0.00029973737980426124, "loss": 2.3281, "step": 2133 }, { "epoch": 1.4132450331125828, "grad_norm": 1.2430168077390826, "learning_rate": 0.00029973341695643086, "loss": 2.7188, "step": 2134 }, { "epoch": 1.413907284768212, "grad_norm": 0.9449159280040796, "learning_rate": 0.00029972942445966733, "loss": 2.4531, "step": 2135 }, { "epoch": 1.414569536423841, "grad_norm": 1.1416375068990001, "learning_rate": 0.0002997254023147612, "loss": 2.6094, "step": 2136 }, { "epoch": 1.4152317880794703, "grad_norm": 1.6639698092751323, "learning_rate": 0.00029972135052250883, "loss": 2.4531, "step": 2137 }, { "epoch": 1.4158940397350994, "grad_norm": 1.0859096917199904, "learning_rate": 0.0002997172690837126, "loss": 2.2344, "step": 2138 }, { "epoch": 1.4165562913907284, "grad_norm": 1.0517869458832836, "learning_rate": 0.0002997131579991806, "loss": 2.4375, "step": 2139 }, { "epoch": 1.4172185430463577, "grad_norm": 0.984439846038069, "learning_rate": 0.00029970901726972694, "loss": 2.3438, "step": 2140 }, { "epoch": 1.4178807947019867, "grad_norm": 1.0841803994312005, "learning_rate": 0.00029970484689617144, "loss": 2.6719, "step": 2141 }, { "epoch": 1.4185430463576159, "grad_norm": 0.9571427054971211, "learning_rate": 0.00029970064687934, "loss": 2.4062, "step": 2142 }, { "epoch": 1.419205298013245, "grad_norm": 0.9968538494134072, "learning_rate": 0.00029969641722006423, "loss": 2.5312, "step": 2143 }, { "epoch": 1.4198675496688742, "grad_norm": 0.9838747613924839, "learning_rate": 0.00029969215791918153, "loss": 2.2812, "step": 2144 }, { "epoch": 1.4205298013245033, "grad_norm": 1.0002782704685769, "learning_rate": 0.0002996878689775355, "loss": 2.7188, "step": 2145 }, { "epoch": 1.4211920529801325, "grad_norm": 1.0772060173754583, "learning_rate": 0.0002996835503959753, "loss": 2.7031, "step": 2146 }, { "epoch": 1.4218543046357617, "grad_norm": 0.9145045826150184, "learning_rate": 0.000299679202175356, "loss": 2.625, "step": 2147 }, { "epoch": 1.4225165562913906, "grad_norm": 1.0217006279292333, "learning_rate": 0.0002996748243165387, "loss": 2.4219, "step": 2148 }, { "epoch": 1.4231788079470198, "grad_norm": 0.9962119284023744, "learning_rate": 0.0002996704168203902, "loss": 2.5312, "step": 2149 }, { "epoch": 1.423841059602649, "grad_norm": 0.9462382168409187, "learning_rate": 0.0002996659796877833, "loss": 2.6562, "step": 2150 }, { "epoch": 1.4245033112582781, "grad_norm": 1.0113280370470705, "learning_rate": 0.0002996615129195965, "loss": 2.5781, "step": 2151 }, { "epoch": 1.4251655629139073, "grad_norm": 0.9215109975379985, "learning_rate": 0.0002996570165167143, "loss": 2.2656, "step": 2152 }, { "epoch": 1.4258278145695364, "grad_norm": 1.12200714133647, "learning_rate": 0.0002996524904800271, "loss": 2.5312, "step": 2153 }, { "epoch": 1.4264900662251656, "grad_norm": 0.9472627848008581, "learning_rate": 0.0002996479348104311, "loss": 2.625, "step": 2154 }, { "epoch": 1.4271523178807948, "grad_norm": 1.0065261669622478, "learning_rate": 0.00029964334950882834, "loss": 2.5781, "step": 2155 }, { "epoch": 1.427814569536424, "grad_norm": 1.0467617191929983, "learning_rate": 0.0002996387345761268, "loss": 2.0625, "step": 2156 }, { "epoch": 1.4284768211920529, "grad_norm": 0.9684265857571045, "learning_rate": 0.00029963409001324015, "loss": 2.4062, "step": 2157 }, { "epoch": 1.429139072847682, "grad_norm": 0.8781180766063381, "learning_rate": 0.0002996294158210883, "loss": 2.125, "step": 2158 }, { "epoch": 1.4298013245033112, "grad_norm": 1.0049911770768698, "learning_rate": 0.0002996247120005966, "loss": 2.6875, "step": 2159 }, { "epoch": 1.4304635761589404, "grad_norm": 0.926129318755258, "learning_rate": 0.0002996199785526966, "loss": 2.125, "step": 2160 }, { "epoch": 1.4311258278145695, "grad_norm": 0.7898323276246039, "learning_rate": 0.0002996152154783255, "loss": 1.8984, "step": 2161 }, { "epoch": 1.4317880794701987, "grad_norm": 0.9626418487070505, "learning_rate": 0.0002996104227784264, "loss": 2.625, "step": 2162 }, { "epoch": 1.4324503311258279, "grad_norm": 0.9225346212743132, "learning_rate": 0.0002996056004539483, "loss": 2.5625, "step": 2163 }, { "epoch": 1.433112582781457, "grad_norm": 1.054888544536959, "learning_rate": 0.0002996007485058463, "loss": 2.8594, "step": 2164 }, { "epoch": 1.4337748344370862, "grad_norm": 0.8805679398921958, "learning_rate": 0.0002995958669350809, "loss": 2.125, "step": 2165 }, { "epoch": 1.4344370860927151, "grad_norm": 0.9921058010312066, "learning_rate": 0.0002995909557426188, "loss": 2.6094, "step": 2166 }, { "epoch": 1.4350993377483443, "grad_norm": 0.9666739059398567, "learning_rate": 0.00029958601492943243, "loss": 2.2188, "step": 2167 }, { "epoch": 1.4357615894039735, "grad_norm": 1.0461201495503443, "learning_rate": 0.00029958104449650017, "loss": 2.6875, "step": 2168 }, { "epoch": 1.4364238410596026, "grad_norm": 2.3315355821867647, "learning_rate": 0.00029957604444480625, "loss": 2.5, "step": 2169 }, { "epoch": 1.4370860927152318, "grad_norm": 0.9654622286173267, "learning_rate": 0.00029957101477534067, "loss": 2.6719, "step": 2170 }, { "epoch": 1.437748344370861, "grad_norm": 1.0580376279345556, "learning_rate": 0.0002995659554890993, "loss": 2.5469, "step": 2171 }, { "epoch": 1.4384105960264901, "grad_norm": 0.9972337421662041, "learning_rate": 0.00029956086658708414, "loss": 2.3906, "step": 2172 }, { "epoch": 1.439072847682119, "grad_norm": 1.1547474700803408, "learning_rate": 0.0002995557480703027, "loss": 2.5625, "step": 2173 }, { "epoch": 1.4397350993377485, "grad_norm": 1.0690611728683848, "learning_rate": 0.00029955059993976854, "loss": 2.2969, "step": 2174 }, { "epoch": 1.4403973509933774, "grad_norm": 1.0262170423749843, "learning_rate": 0.00029954542219650103, "loss": 2.7344, "step": 2175 }, { "epoch": 1.4410596026490066, "grad_norm": 1.0424042469909958, "learning_rate": 0.0002995402148415255, "loss": 2.2344, "step": 2176 }, { "epoch": 1.4417218543046357, "grad_norm": 0.9853496474894284, "learning_rate": 0.00029953497787587295, "loss": 2.4844, "step": 2177 }, { "epoch": 1.442384105960265, "grad_norm": 1.050554866514043, "learning_rate": 0.00029952971130058043, "loss": 2.6094, "step": 2178 }, { "epoch": 1.443046357615894, "grad_norm": 0.9362634108301414, "learning_rate": 0.0002995244151166908, "loss": 2.1875, "step": 2179 }, { "epoch": 1.4437086092715232, "grad_norm": 0.8963364612115888, "learning_rate": 0.0002995190893252527, "loss": 2.2812, "step": 2180 }, { "epoch": 1.4443708609271524, "grad_norm": 0.9531468696034419, "learning_rate": 0.0002995137339273208, "loss": 2.5781, "step": 2181 }, { "epoch": 1.4450331125827813, "grad_norm": 1.030754419163838, "learning_rate": 0.00029950834892395535, "loss": 2.5625, "step": 2182 }, { "epoch": 1.4456953642384107, "grad_norm": 1.0200434739291868, "learning_rate": 0.00029950293431622285, "loss": 2.4844, "step": 2183 }, { "epoch": 1.4463576158940397, "grad_norm": 0.9745848250975345, "learning_rate": 0.00029949749010519533, "loss": 2.4531, "step": 2184 }, { "epoch": 1.4470198675496688, "grad_norm": 0.9992595477958197, "learning_rate": 0.0002994920162919508, "loss": 2.2812, "step": 2185 }, { "epoch": 1.447682119205298, "grad_norm": 1.0011650865411004, "learning_rate": 0.00029948651287757327, "loss": 2.1094, "step": 2186 }, { "epoch": 1.4483443708609272, "grad_norm": 1.0626348387906883, "learning_rate": 0.00029948097986315234, "loss": 2.5781, "step": 2187 }, { "epoch": 1.4490066225165563, "grad_norm": 1.0959646330787183, "learning_rate": 0.0002994754172497836, "loss": 2.4688, "step": 2188 }, { "epoch": 1.4496688741721855, "grad_norm": 0.9306188761457541, "learning_rate": 0.00029946982503856857, "loss": 2.6719, "step": 2189 }, { "epoch": 1.4503311258278146, "grad_norm": 0.9073547574130615, "learning_rate": 0.0002994642032306146, "loss": 2.3594, "step": 2190 }, { "epoch": 1.4509933774834436, "grad_norm": 0.9797658272770757, "learning_rate": 0.00029945855182703486, "loss": 2.7188, "step": 2191 }, { "epoch": 1.451655629139073, "grad_norm": 0.9212378264447347, "learning_rate": 0.00029945287082894836, "loss": 2.5938, "step": 2192 }, { "epoch": 1.452317880794702, "grad_norm": 1.0151534078778723, "learning_rate": 0.00029944716023748, "loss": 2.4531, "step": 2193 }, { "epoch": 1.452980132450331, "grad_norm": 1.0209988612900802, "learning_rate": 0.00029944142005376055, "loss": 2.4062, "step": 2194 }, { "epoch": 1.4536423841059603, "grad_norm": 1.0569536280321712, "learning_rate": 0.00029943565027892665, "loss": 2.4688, "step": 2195 }, { "epoch": 1.4543046357615894, "grad_norm": 0.973073077517115, "learning_rate": 0.0002994298509141208, "loss": 2.625, "step": 2196 }, { "epoch": 1.4549668874172186, "grad_norm": 0.8472445519570411, "learning_rate": 0.00029942402196049127, "loss": 2.3125, "step": 2197 }, { "epoch": 1.4556291390728477, "grad_norm": 0.9250924754416165, "learning_rate": 0.0002994181634191923, "loss": 2.4062, "step": 2198 }, { "epoch": 1.456291390728477, "grad_norm": 0.9972108791238056, "learning_rate": 0.00029941227529138397, "loss": 2.75, "step": 2199 }, { "epoch": 1.4569536423841059, "grad_norm": 1.1005013481880201, "learning_rate": 0.00029940635757823216, "loss": 2.5156, "step": 2200 }, { "epoch": 1.457615894039735, "grad_norm": 0.90795962645883, "learning_rate": 0.0002994004102809087, "loss": 2.5781, "step": 2201 }, { "epoch": 1.4582781456953642, "grad_norm": 1.1582081424160675, "learning_rate": 0.00029939443340059116, "loss": 2.5469, "step": 2202 }, { "epoch": 1.4589403973509933, "grad_norm": 0.914456740134419, "learning_rate": 0.00029938842693846303, "loss": 2.3906, "step": 2203 }, { "epoch": 1.4596026490066225, "grad_norm": 0.9160385265915898, "learning_rate": 0.0002993823908957137, "loss": 2.0, "step": 2204 }, { "epoch": 1.4602649006622517, "grad_norm": 1.1101265255685904, "learning_rate": 0.00029937632527353837, "loss": 2.4688, "step": 2205 }, { "epoch": 1.4609271523178808, "grad_norm": 1.1247032209702812, "learning_rate": 0.0002993702300731381, "loss": 2.8906, "step": 2206 }, { "epoch": 1.46158940397351, "grad_norm": 0.9182031063560402, "learning_rate": 0.0002993641052957198, "loss": 2.8906, "step": 2207 }, { "epoch": 1.4622516556291392, "grad_norm": 0.9884498179904752, "learning_rate": 0.00029935795094249623, "loss": 2.3281, "step": 2208 }, { "epoch": 1.4629139072847681, "grad_norm": 0.9904464978923531, "learning_rate": 0.00029935176701468603, "loss": 2.4531, "step": 2209 }, { "epoch": 1.4635761589403973, "grad_norm": 0.8934308664586316, "learning_rate": 0.0002993455535135137, "loss": 2.2656, "step": 2210 }, { "epoch": 1.4642384105960264, "grad_norm": 0.9311145650141537, "learning_rate": 0.0002993393104402096, "loss": 2.4844, "step": 2211 }, { "epoch": 1.4649006622516556, "grad_norm": 0.89757965722654, "learning_rate": 0.0002993330377960099, "loss": 2.4375, "step": 2212 }, { "epoch": 1.4655629139072848, "grad_norm": 0.8820427296521648, "learning_rate": 0.0002993267355821567, "loss": 2.3125, "step": 2213 }, { "epoch": 1.466225165562914, "grad_norm": 1.0339670667228942, "learning_rate": 0.00029932040379989787, "loss": 2.7344, "step": 2214 }, { "epoch": 1.466887417218543, "grad_norm": 1.0508568588045355, "learning_rate": 0.00029931404245048715, "loss": 1.875, "step": 2215 }, { "epoch": 1.4675496688741723, "grad_norm": 0.9309277007384041, "learning_rate": 0.0002993076515351842, "loss": 2.6875, "step": 2216 }, { "epoch": 1.4682119205298014, "grad_norm": 0.8326487364228112, "learning_rate": 0.00029930123105525455, "loss": 2.4062, "step": 2217 }, { "epoch": 1.4688741721854304, "grad_norm": 0.9733888966128189, "learning_rate": 0.0002992947810119694, "loss": 2.6562, "step": 2218 }, { "epoch": 1.4695364238410595, "grad_norm": 0.9247161309578887, "learning_rate": 0.00029928830140660604, "loss": 2.4531, "step": 2219 }, { "epoch": 1.4701986754966887, "grad_norm": 0.9310946953839669, "learning_rate": 0.0002992817922404475, "loss": 2.5625, "step": 2220 }, { "epoch": 1.4708609271523179, "grad_norm": 1.0249183840394278, "learning_rate": 0.00029927525351478257, "loss": 2.4844, "step": 2221 }, { "epoch": 1.471523178807947, "grad_norm": 0.9056492402580734, "learning_rate": 0.0002992686852309061, "loss": 2.4531, "step": 2222 }, { "epoch": 1.4721854304635762, "grad_norm": 1.0069941009306147, "learning_rate": 0.0002992620873901186, "loss": 2.125, "step": 2223 }, { "epoch": 1.4728476821192054, "grad_norm": 0.9080274376842974, "learning_rate": 0.0002992554599937266, "loss": 2.375, "step": 2224 }, { "epoch": 1.4735099337748343, "grad_norm": 0.9574691269621646, "learning_rate": 0.0002992488030430424, "loss": 2.4844, "step": 2225 }, { "epoch": 1.4741721854304637, "grad_norm": 0.8857828960475895, "learning_rate": 0.000299242116539384, "loss": 2.0469, "step": 2226 }, { "epoch": 1.4748344370860926, "grad_norm": 0.9392572199148592, "learning_rate": 0.0002992354004840756, "loss": 2.4844, "step": 2227 }, { "epoch": 1.4754966887417218, "grad_norm": 0.9645189684553113, "learning_rate": 0.000299228654878447, "loss": 2.6562, "step": 2228 }, { "epoch": 1.476158940397351, "grad_norm": 1.0249253285767028, "learning_rate": 0.00029922187972383383, "loss": 2.625, "step": 2229 }, { "epoch": 1.4768211920529801, "grad_norm": 0.8895718191214521, "learning_rate": 0.00029921507502157774, "loss": 2.5312, "step": 2230 }, { "epoch": 1.4774834437086093, "grad_norm": 0.9697110142340752, "learning_rate": 0.00029920824077302605, "loss": 2.3906, "step": 2231 }, { "epoch": 1.4781456953642385, "grad_norm": 0.945257059297628, "learning_rate": 0.00029920137697953214, "loss": 2.4375, "step": 2232 }, { "epoch": 1.4788079470198676, "grad_norm": 0.9839439850869712, "learning_rate": 0.000299194483642455, "loss": 2.6094, "step": 2233 }, { "epoch": 1.4794701986754966, "grad_norm": 1.026555664998305, "learning_rate": 0.0002991875607631596, "loss": 2.5312, "step": 2234 }, { "epoch": 1.480132450331126, "grad_norm": 1.0400850075677455, "learning_rate": 0.0002991806083430169, "loss": 2.8594, "step": 2235 }, { "epoch": 1.480794701986755, "grad_norm": 1.6812644678260764, "learning_rate": 0.0002991736263834034, "loss": 2.4531, "step": 2236 }, { "epoch": 1.481456953642384, "grad_norm": 1.0481246874714307, "learning_rate": 0.0002991666148857016, "loss": 2.7031, "step": 2237 }, { "epoch": 1.4821192052980132, "grad_norm": 1.0080732245522337, "learning_rate": 0.0002991595738513, "loss": 2.3594, "step": 2238 }, { "epoch": 1.4827814569536424, "grad_norm": 0.9414461677030689, "learning_rate": 0.00029915250328159263, "loss": 2.5312, "step": 2239 }, { "epoch": 1.4834437086092715, "grad_norm": 0.9626123170038507, "learning_rate": 0.00029914540317797973, "loss": 2.5469, "step": 2240 }, { "epoch": 1.4841059602649007, "grad_norm": 0.96942463610998, "learning_rate": 0.000299138273541867, "loss": 1.9297, "step": 2241 }, { "epoch": 1.4847682119205299, "grad_norm": 1.040747242865704, "learning_rate": 0.0002991311143746664, "loss": 2.5781, "step": 2242 }, { "epoch": 1.4854304635761588, "grad_norm": 1.1787401948161933, "learning_rate": 0.00029912392567779536, "loss": 2.875, "step": 2243 }, { "epoch": 1.4860927152317882, "grad_norm": 0.9710683383118753, "learning_rate": 0.0002991167074526774, "loss": 2.5156, "step": 2244 }, { "epoch": 1.4867549668874172, "grad_norm": 0.8931900777410339, "learning_rate": 0.00029910945970074176, "loss": 2.4219, "step": 2245 }, { "epoch": 1.4874172185430463, "grad_norm": 1.032200369726091, "learning_rate": 0.0002991021824234236, "loss": 2.4844, "step": 2246 }, { "epoch": 1.4880794701986755, "grad_norm": 1.0916729537974976, "learning_rate": 0.000299094875622164, "loss": 2.5156, "step": 2247 }, { "epoch": 1.4887417218543046, "grad_norm": 0.900077158911175, "learning_rate": 0.0002990875392984097, "loss": 2.0312, "step": 2248 }, { "epoch": 1.4894039735099338, "grad_norm": 0.89775513470307, "learning_rate": 0.00029908017345361336, "loss": 2.3906, "step": 2249 }, { "epoch": 1.490066225165563, "grad_norm": 0.9965560385657952, "learning_rate": 0.00029907277808923356, "loss": 2.4688, "step": 2250 }, { "epoch": 1.4907284768211921, "grad_norm": 1.109496676069234, "learning_rate": 0.0002990653532067346, "loss": 2.8125, "step": 2251 }, { "epoch": 1.491390728476821, "grad_norm": 0.9651585987199373, "learning_rate": 0.0002990578988075867, "loss": 2.6562, "step": 2252 }, { "epoch": 1.4920529801324505, "grad_norm": 0.9516042390966882, "learning_rate": 0.000299050414893266, "loss": 2.125, "step": 2253 }, { "epoch": 1.4927152317880794, "grad_norm": 1.0959892438052803, "learning_rate": 0.0002990429014652543, "loss": 2.4844, "step": 2254 }, { "epoch": 1.4933774834437086, "grad_norm": 0.9295136835936418, "learning_rate": 0.0002990353585250394, "loss": 2.7031, "step": 2255 }, { "epoch": 1.4940397350993377, "grad_norm": 0.9764504972134843, "learning_rate": 0.0002990277860741149, "loss": 2.7969, "step": 2256 }, { "epoch": 1.494701986754967, "grad_norm": 0.9694103062956547, "learning_rate": 0.0002990201841139802, "loss": 2.5625, "step": 2257 }, { "epoch": 1.495364238410596, "grad_norm": 1.0119943119577604, "learning_rate": 0.0002990125526461405, "loss": 2.5312, "step": 2258 }, { "epoch": 1.4960264900662252, "grad_norm": 0.9893357839726827, "learning_rate": 0.00029900489167210715, "loss": 2.0312, "step": 2259 }, { "epoch": 1.4966887417218544, "grad_norm": 0.8753220294517673, "learning_rate": 0.00029899720119339686, "loss": 1.9844, "step": 2260 }, { "epoch": 1.4973509933774833, "grad_norm": 1.066527330356536, "learning_rate": 0.0002989894812115326, "loss": 2.7031, "step": 2261 }, { "epoch": 1.4980132450331125, "grad_norm": 0.9629555378366176, "learning_rate": 0.00029898173172804293, "loss": 2.6094, "step": 2262 }, { "epoch": 1.4986754966887417, "grad_norm": 0.8913995764898816, "learning_rate": 0.0002989739527444624, "loss": 2.1719, "step": 2263 }, { "epoch": 1.4993377483443708, "grad_norm": 0.9658175403720054, "learning_rate": 0.0002989661442623312, "loss": 2.4844, "step": 2264 }, { "epoch": 1.5, "grad_norm": 0.9734340932988587, "learning_rate": 0.00029895830628319563, "loss": 2.4844, "step": 2265 }, { "epoch": 1.5006622516556292, "grad_norm": 0.9816524994313781, "learning_rate": 0.0002989504388086077, "loss": 2.5625, "step": 2266 }, { "epoch": 1.5013245033112583, "grad_norm": 0.9929745950744212, "learning_rate": 0.0002989425418401252, "loss": 2.6875, "step": 2267 }, { "epoch": 1.5019867549668873, "grad_norm": 0.9783998967602694, "learning_rate": 0.00029893461537931185, "loss": 2.7188, "step": 2268 }, { "epoch": 1.5026490066225167, "grad_norm": 0.9472676338288144, "learning_rate": 0.00029892665942773716, "loss": 2.4531, "step": 2269 }, { "epoch": 1.5033112582781456, "grad_norm": 1.0038027271668695, "learning_rate": 0.0002989186739869765, "loss": 2.8438, "step": 2270 }, { "epoch": 1.503973509933775, "grad_norm": 0.880262084566376, "learning_rate": 0.0002989106590586111, "loss": 2.5938, "step": 2271 }, { "epoch": 1.504635761589404, "grad_norm": 0.9546595433612308, "learning_rate": 0.00029890261464422807, "loss": 2.2031, "step": 2272 }, { "epoch": 1.505298013245033, "grad_norm": 1.0304215914545412, "learning_rate": 0.0002988945407454202, "loss": 2.0469, "step": 2273 }, { "epoch": 1.5059602649006623, "grad_norm": 1.0082173313359755, "learning_rate": 0.0002988864373637862, "loss": 2.4375, "step": 2274 }, { "epoch": 1.5066225165562914, "grad_norm": 0.9174620352802345, "learning_rate": 0.0002988783045009307, "loss": 2.4219, "step": 2275 }, { "epoch": 1.5072847682119206, "grad_norm": 0.9889198017995099, "learning_rate": 0.00029887014215846405, "loss": 2.625, "step": 2276 }, { "epoch": 1.5079470198675495, "grad_norm": 0.9571846479884542, "learning_rate": 0.00029886195033800256, "loss": 2.4062, "step": 2277 }, { "epoch": 1.508609271523179, "grad_norm": 0.9545811897127711, "learning_rate": 0.0002988537290411682, "loss": 2.2969, "step": 2278 }, { "epoch": 1.5092715231788079, "grad_norm": 1.0962799388965638, "learning_rate": 0.000298845478269589, "loss": 2.6094, "step": 2279 }, { "epoch": 1.5099337748344372, "grad_norm": 1.0133772617585435, "learning_rate": 0.0002988371980248986, "loss": 2.5781, "step": 2280 }, { "epoch": 1.5105960264900662, "grad_norm": 0.9128192630021276, "learning_rate": 0.00029882888830873663, "loss": 2.5469, "step": 2281 }, { "epoch": 1.5112582781456954, "grad_norm": 0.8833398705599705, "learning_rate": 0.0002988205491227485, "loss": 2.5156, "step": 2282 }, { "epoch": 1.5119205298013245, "grad_norm": 0.8314339604795769, "learning_rate": 0.0002988121804685855, "loss": 2.4062, "step": 2283 }, { "epoch": 1.5125827814569537, "grad_norm": 0.9287077315177573, "learning_rate": 0.0002988037823479047, "loss": 2.1406, "step": 2284 }, { "epoch": 1.5132450331125828, "grad_norm": 0.9627672760179488, "learning_rate": 0.00029879535476236903, "loss": 2.6094, "step": 2285 }, { "epoch": 1.5139072847682118, "grad_norm": 0.8508930409776032, "learning_rate": 0.0002987868977136472, "loss": 2.4375, "step": 2286 }, { "epoch": 1.5145695364238412, "grad_norm": 0.8712854269686106, "learning_rate": 0.0002987784112034138, "loss": 2.5469, "step": 2287 }, { "epoch": 1.5152317880794701, "grad_norm": 1.0499644764796048, "learning_rate": 0.00029876989523334936, "loss": 2.7969, "step": 2288 }, { "epoch": 1.5158940397350993, "grad_norm": 0.940808725635407, "learning_rate": 0.00029876134980514, "loss": 2.2031, "step": 2289 }, { "epoch": 1.5165562913907285, "grad_norm": 0.7610166684769152, "learning_rate": 0.00029875277492047794, "loss": 2.125, "step": 2290 }, { "epoch": 1.5172185430463576, "grad_norm": 0.8540399164426568, "learning_rate": 0.00029874417058106103, "loss": 2.375, "step": 2291 }, { "epoch": 1.5178807947019868, "grad_norm": 0.9202743303114995, "learning_rate": 0.00029873553678859307, "loss": 2.625, "step": 2292 }, { "epoch": 1.518543046357616, "grad_norm": 1.003214411076348, "learning_rate": 0.00029872687354478357, "loss": 2.4219, "step": 2293 }, { "epoch": 1.519205298013245, "grad_norm": 0.8953183449789073, "learning_rate": 0.0002987181808513481, "loss": 2.4531, "step": 2294 }, { "epoch": 1.519867549668874, "grad_norm": 0.8983376888905227, "learning_rate": 0.0002987094587100078, "loss": 2.3438, "step": 2295 }, { "epoch": 1.5205298013245034, "grad_norm": 0.8689615262058188, "learning_rate": 0.00029870070712248974, "loss": 1.9688, "step": 2296 }, { "epoch": 1.5211920529801324, "grad_norm": 0.8401751857227879, "learning_rate": 0.0002986919260905269, "loss": 2.375, "step": 2297 }, { "epoch": 1.5218543046357615, "grad_norm": 1.0394392811345194, "learning_rate": 0.000298683115615858, "loss": 2.5625, "step": 2298 }, { "epoch": 1.5225165562913907, "grad_norm": 1.0124725877938936, "learning_rate": 0.0002986742757002276, "loss": 2.5938, "step": 2299 }, { "epoch": 1.5231788079470199, "grad_norm": 1.0393730476410066, "learning_rate": 0.0002986654063453861, "loss": 2.4062, "step": 2300 }, { "epoch": 1.523841059602649, "grad_norm": 0.9114001830138886, "learning_rate": 0.00029865650755308973, "loss": 2.5781, "step": 2301 }, { "epoch": 1.524503311258278, "grad_norm": 0.9910707700753598, "learning_rate": 0.0002986475793251006, "loss": 2.4844, "step": 2302 }, { "epoch": 1.5251655629139074, "grad_norm": 0.9974585673477396, "learning_rate": 0.0002986386216631866, "loss": 2.8125, "step": 2303 }, { "epoch": 1.5258278145695363, "grad_norm": 0.9410577666642724, "learning_rate": 0.0002986296345691214, "loss": 2.1406, "step": 2304 }, { "epoch": 1.5264900662251657, "grad_norm": 0.8831950557754421, "learning_rate": 0.0002986206180446846, "loss": 2.0469, "step": 2305 }, { "epoch": 1.5271523178807946, "grad_norm": 0.8589849832315861, "learning_rate": 0.00029861157209166154, "loss": 2.4219, "step": 2306 }, { "epoch": 1.5278145695364238, "grad_norm": 0.8912659034075949, "learning_rate": 0.0002986024967118434, "loss": 2.5781, "step": 2307 }, { "epoch": 1.528476821192053, "grad_norm": 1.1690949523673406, "learning_rate": 0.0002985933919070272, "loss": 2.4688, "step": 2308 }, { "epoch": 1.5291390728476821, "grad_norm": 1.0646105778195296, "learning_rate": 0.00029858425767901594, "loss": 2.4688, "step": 2309 }, { "epoch": 1.5298013245033113, "grad_norm": 0.8374765050660513, "learning_rate": 0.0002985750940296181, "loss": 2.3594, "step": 2310 }, { "epoch": 1.5304635761589402, "grad_norm": 0.9532256370755269, "learning_rate": 0.00029856590096064836, "loss": 2.5312, "step": 2311 }, { "epoch": 1.5311258278145696, "grad_norm": 0.8775748988864893, "learning_rate": 0.00029855667847392695, "loss": 2.2969, "step": 2312 }, { "epoch": 1.5317880794701986, "grad_norm": 0.9907184930039408, "learning_rate": 0.00029854742657128006, "loss": 2.4219, "step": 2313 }, { "epoch": 1.532450331125828, "grad_norm": 0.956373136861473, "learning_rate": 0.00029853814525453967, "loss": 2.3906, "step": 2314 }, { "epoch": 1.533112582781457, "grad_norm": 0.9685820840848379, "learning_rate": 0.00029852883452554356, "loss": 2.5625, "step": 2315 }, { "epoch": 1.533774834437086, "grad_norm": 0.932313468229487, "learning_rate": 0.0002985194943861354, "loss": 2.5312, "step": 2316 }, { "epoch": 1.5344370860927152, "grad_norm": 1.0244475744412036, "learning_rate": 0.0002985101248381646, "loss": 2.8125, "step": 2317 }, { "epoch": 1.5350993377483444, "grad_norm": 0.9502117724836681, "learning_rate": 0.0002985007258834865, "loss": 2.4688, "step": 2318 }, { "epoch": 1.5357615894039736, "grad_norm": 1.0646374754798642, "learning_rate": 0.00029849129752396224, "loss": 2.4531, "step": 2319 }, { "epoch": 1.5364238410596025, "grad_norm": 0.994271909670904, "learning_rate": 0.0002984818397614586, "loss": 2.5312, "step": 2320 }, { "epoch": 1.5370860927152319, "grad_norm": 1.3409726468281091, "learning_rate": 0.00029847235259784836, "loss": 2.6094, "step": 2321 }, { "epoch": 1.5377483443708608, "grad_norm": 0.9340548772315902, "learning_rate": 0.0002984628360350102, "loss": 2.3438, "step": 2322 }, { "epoch": 1.5384105960264902, "grad_norm": 0.97459115939099, "learning_rate": 0.00029845329007482837, "loss": 2.625, "step": 2323 }, { "epoch": 1.5390728476821192, "grad_norm": 0.9239066205749943, "learning_rate": 0.0002984437147191932, "loss": 2.4062, "step": 2324 }, { "epoch": 1.5397350993377483, "grad_norm": 0.9450340327131521, "learning_rate": 0.0002984341099700006, "loss": 2.2969, "step": 2325 }, { "epoch": 1.5403973509933775, "grad_norm": 0.9880422459754131, "learning_rate": 0.00029842447582915254, "loss": 2.6094, "step": 2326 }, { "epoch": 1.5410596026490067, "grad_norm": 0.9353465623155267, "learning_rate": 0.00029841481229855664, "loss": 2.6094, "step": 2327 }, { "epoch": 1.5417218543046358, "grad_norm": 0.9505862196628337, "learning_rate": 0.00029840511938012635, "loss": 2.7656, "step": 2328 }, { "epoch": 1.5423841059602648, "grad_norm": 0.96840165754237, "learning_rate": 0.000298395397075781, "loss": 2.5469, "step": 2329 }, { "epoch": 1.5430463576158941, "grad_norm": 0.9471417742023834, "learning_rate": 0.0002983856453874458, "loss": 2.5781, "step": 2330 }, { "epoch": 1.543708609271523, "grad_norm": 1.0361705525948484, "learning_rate": 0.0002983758643170516, "loss": 2.3438, "step": 2331 }, { "epoch": 1.5443708609271525, "grad_norm": 0.9711235007772678, "learning_rate": 0.00029836605386653514, "loss": 2.1406, "step": 2332 }, { "epoch": 1.5450331125827814, "grad_norm": 0.8907324349231668, "learning_rate": 0.0002983562140378391, "loss": 2.3594, "step": 2333 }, { "epoch": 1.5456953642384106, "grad_norm": 1.0357534061322606, "learning_rate": 0.0002983463448329118, "loss": 2.4375, "step": 2334 }, { "epoch": 1.5463576158940397, "grad_norm": 0.9708299997851642, "learning_rate": 0.00029833644625370755, "loss": 2.2969, "step": 2335 }, { "epoch": 1.547019867549669, "grad_norm": 0.858879059028281, "learning_rate": 0.0002983265183021863, "loss": 2.3906, "step": 2336 }, { "epoch": 1.547682119205298, "grad_norm": 0.9739831016371527, "learning_rate": 0.00029831656098031387, "loss": 2.5469, "step": 2337 }, { "epoch": 1.548344370860927, "grad_norm": 0.9697674657167231, "learning_rate": 0.00029830657429006204, "loss": 2.4844, "step": 2338 }, { "epoch": 1.5490066225165564, "grad_norm": 0.9673812037816254, "learning_rate": 0.00029829655823340815, "loss": 2.4062, "step": 2339 }, { "epoch": 1.5496688741721854, "grad_norm": 0.9158567071596156, "learning_rate": 0.0002982865128123356, "loss": 2.5625, "step": 2340 }, { "epoch": 1.5503311258278145, "grad_norm": 0.9920372217043345, "learning_rate": 0.00029827643802883346, "loss": 2.75, "step": 2341 }, { "epoch": 1.5509933774834437, "grad_norm": 0.9817012933590009, "learning_rate": 0.00029826633388489667, "loss": 2.8281, "step": 2342 }, { "epoch": 1.5516556291390728, "grad_norm": 0.9503736626809571, "learning_rate": 0.0002982562003825259, "loss": 2.5625, "step": 2343 }, { "epoch": 1.552317880794702, "grad_norm": 1.0030852686766019, "learning_rate": 0.00029824603752372784, "loss": 2.4375, "step": 2344 }, { "epoch": 1.5529801324503312, "grad_norm": 0.9344704733189892, "learning_rate": 0.0002982358453105147, "loss": 2.4375, "step": 2345 }, { "epoch": 1.5536423841059603, "grad_norm": 0.9332399013724976, "learning_rate": 0.00029822562374490474, "loss": 2.0469, "step": 2346 }, { "epoch": 1.5543046357615893, "grad_norm": 1.0400771581708228, "learning_rate": 0.0002982153728289219, "loss": 2.6406, "step": 2347 }, { "epoch": 1.5549668874172187, "grad_norm": 0.9299631193745552, "learning_rate": 0.0002982050925645961, "loss": 2.5625, "step": 2348 }, { "epoch": 1.5556291390728476, "grad_norm": 1.0543976538834634, "learning_rate": 0.00029819478295396273, "loss": 2.5625, "step": 2349 }, { "epoch": 1.5562913907284768, "grad_norm": 0.8048431251298995, "learning_rate": 0.00029818444399906343, "loss": 2.4062, "step": 2350 }, { "epoch": 1.556953642384106, "grad_norm": 1.0602584321964825, "learning_rate": 0.00029817407570194526, "loss": 2.3906, "step": 2351 }, { "epoch": 1.557615894039735, "grad_norm": 0.9291421931039523, "learning_rate": 0.00029816367806466144, "loss": 2.5312, "step": 2352 }, { "epoch": 1.5582781456953643, "grad_norm": 0.992995606800549, "learning_rate": 0.00029815325108927063, "loss": 2.375, "step": 2353 }, { "epoch": 1.5589403973509932, "grad_norm": 0.9741620788160839, "learning_rate": 0.0002981427947778376, "loss": 2.625, "step": 2354 }, { "epoch": 1.5596026490066226, "grad_norm": 0.8615299422277266, "learning_rate": 0.00029813230913243287, "loss": 2.0781, "step": 2355 }, { "epoch": 1.5602649006622515, "grad_norm": 0.8798929319777776, "learning_rate": 0.0002981217941551326, "loss": 2.5156, "step": 2356 }, { "epoch": 1.560927152317881, "grad_norm": 0.8279006252632788, "learning_rate": 0.0002981112498480189, "loss": 2.4688, "step": 2357 }, { "epoch": 1.5615894039735099, "grad_norm": 1.002257797790237, "learning_rate": 0.0002981006762131797, "loss": 2.5, "step": 2358 }, { "epoch": 1.562251655629139, "grad_norm": 0.9171342633827861, "learning_rate": 0.00029809007325270874, "loss": 2.5625, "step": 2359 }, { "epoch": 1.5629139072847682, "grad_norm": 0.941713190640774, "learning_rate": 0.00029807944096870543, "loss": 2.5312, "step": 2360 }, { "epoch": 1.5635761589403974, "grad_norm": 1.0245051698378338, "learning_rate": 0.00029806877936327516, "loss": 2.5625, "step": 2361 }, { "epoch": 1.5642384105960265, "grad_norm": 0.9750012959043385, "learning_rate": 0.000298058088438529, "loss": 2.5938, "step": 2362 }, { "epoch": 1.5649006622516555, "grad_norm": 0.9564782676316433, "learning_rate": 0.0002980473681965839, "loss": 2.3438, "step": 2363 }, { "epoch": 1.5655629139072849, "grad_norm": 1.0372406776313476, "learning_rate": 0.0002980366186395626, "loss": 1.9844, "step": 2364 }, { "epoch": 1.5662251655629138, "grad_norm": 0.8622284588621415, "learning_rate": 0.00029802583976959365, "loss": 2.3438, "step": 2365 }, { "epoch": 1.5668874172185432, "grad_norm": 1.0890346987948818, "learning_rate": 0.0002980150315888113, "loss": 2.6719, "step": 2366 }, { "epoch": 1.5675496688741721, "grad_norm": 0.9009961808051479, "learning_rate": 0.0002980041940993558, "loss": 2.4062, "step": 2367 }, { "epoch": 1.5682119205298013, "grad_norm": 1.0873686512594303, "learning_rate": 0.0002979933273033731, "loss": 2.7344, "step": 2368 }, { "epoch": 1.5688741721854305, "grad_norm": 0.9468863401231004, "learning_rate": 0.00029798243120301483, "loss": 2.6406, "step": 2369 }, { "epoch": 1.5695364238410596, "grad_norm": 0.9154868169855724, "learning_rate": 0.00029797150580043865, "loss": 2.4062, "step": 2370 }, { "epoch": 1.5701986754966888, "grad_norm": 0.9990317802731059, "learning_rate": 0.0002979605510978079, "loss": 2.5156, "step": 2371 }, { "epoch": 1.5708609271523177, "grad_norm": 0.9572739789961483, "learning_rate": 0.0002979495670972917, "loss": 2.6562, "step": 2372 }, { "epoch": 1.5715231788079471, "grad_norm": 1.0905066247282007, "learning_rate": 0.00029793855380106504, "loss": 2.7031, "step": 2373 }, { "epoch": 1.572185430463576, "grad_norm": 0.8891378730523425, "learning_rate": 0.00029792751121130866, "loss": 1.9062, "step": 2374 }, { "epoch": 1.5728476821192054, "grad_norm": 0.9267373929256784, "learning_rate": 0.0002979164393302091, "loss": 2.375, "step": 2375 }, { "epoch": 1.5735099337748344, "grad_norm": 1.012734501282315, "learning_rate": 0.0002979053381599588, "loss": 2.7344, "step": 2376 }, { "epoch": 1.5741721854304636, "grad_norm": 0.8562492228571245, "learning_rate": 0.0002978942077027558, "loss": 2.5312, "step": 2377 }, { "epoch": 1.5748344370860927, "grad_norm": 0.8732027729215307, "learning_rate": 0.0002978830479608042, "loss": 2.5156, "step": 2378 }, { "epoch": 1.5754966887417219, "grad_norm": 0.8641806821922278, "learning_rate": 0.00029787185893631363, "loss": 2.3906, "step": 2379 }, { "epoch": 1.576158940397351, "grad_norm": 0.8640118854531463, "learning_rate": 0.00029786064063149967, "loss": 2.4375, "step": 2380 }, { "epoch": 1.57682119205298, "grad_norm": 0.9513393906552332, "learning_rate": 0.00029784939304858374, "loss": 2.5312, "step": 2381 }, { "epoch": 1.5774834437086094, "grad_norm": 1.020123235678892, "learning_rate": 0.00029783811618979295, "loss": 2.6875, "step": 2382 }, { "epoch": 1.5781456953642383, "grad_norm": 0.9272453328144052, "learning_rate": 0.0002978268100573602, "loss": 2.5469, "step": 2383 }, { "epoch": 1.5788079470198677, "grad_norm": 0.9623133678816124, "learning_rate": 0.0002978154746535244, "loss": 2.4375, "step": 2384 }, { "epoch": 1.5794701986754967, "grad_norm": 0.9659305387567694, "learning_rate": 0.0002978041099805299, "loss": 2.3594, "step": 2385 }, { "epoch": 1.5801324503311258, "grad_norm": 1.0379222943849518, "learning_rate": 0.00029779271604062703, "loss": 2.4531, "step": 2386 }, { "epoch": 1.580794701986755, "grad_norm": 0.8737672293738501, "learning_rate": 0.00029778129283607207, "loss": 2.4844, "step": 2387 }, { "epoch": 1.5814569536423841, "grad_norm": 0.8325813173900654, "learning_rate": 0.00029776984036912685, "loss": 2.2031, "step": 2388 }, { "epoch": 1.5821192052980133, "grad_norm": 0.8755006523790766, "learning_rate": 0.0002977583586420591, "loss": 2.375, "step": 2389 }, { "epoch": 1.5827814569536423, "grad_norm": 1.0033656813564527, "learning_rate": 0.0002977468476571424, "loss": 2.5781, "step": 2390 }, { "epoch": 1.5834437086092716, "grad_norm": 0.9334592130908209, "learning_rate": 0.0002977353074166559, "loss": 2.4062, "step": 2391 }, { "epoch": 1.5841059602649006, "grad_norm": 0.8954430600009667, "learning_rate": 0.0002977237379228849, "loss": 2.6719, "step": 2392 }, { "epoch": 1.58476821192053, "grad_norm": 0.8210776148421538, "learning_rate": 0.00029771213917812014, "loss": 2.4531, "step": 2393 }, { "epoch": 1.585430463576159, "grad_norm": 0.9484909865010892, "learning_rate": 0.00029770051118465843, "loss": 2.3906, "step": 2394 }, { "epoch": 1.586092715231788, "grad_norm": 0.9987138274395734, "learning_rate": 0.0002976888539448021, "loss": 2.6719, "step": 2395 }, { "epoch": 1.5867549668874172, "grad_norm": 0.8930728065901855, "learning_rate": 0.0002976771674608595, "loss": 2.4688, "step": 2396 }, { "epoch": 1.5874172185430464, "grad_norm": 0.8237134434385337, "learning_rate": 0.0002976654517351447, "loss": 2.5156, "step": 2397 }, { "epoch": 1.5880794701986756, "grad_norm": 0.8619875144250574, "learning_rate": 0.00029765370676997755, "loss": 2.4531, "step": 2398 }, { "epoch": 1.5887417218543045, "grad_norm": 0.9294276874983126, "learning_rate": 0.00029764193256768363, "loss": 2.4531, "step": 2399 }, { "epoch": 1.589403973509934, "grad_norm": 0.8831402325195679, "learning_rate": 0.0002976301291305944, "loss": 2.6406, "step": 2400 }, { "epoch": 1.5900662251655628, "grad_norm": 1.0230412118662695, "learning_rate": 0.00029761829646104705, "loss": 2.4375, "step": 2401 }, { "epoch": 1.590728476821192, "grad_norm": 0.8894562258071816, "learning_rate": 0.00029760643456138466, "loss": 2.2656, "step": 2402 }, { "epoch": 1.5913907284768212, "grad_norm": 0.8078294510479886, "learning_rate": 0.00029759454343395597, "loss": 2.25, "step": 2403 }, { "epoch": 1.5920529801324503, "grad_norm": 1.0209886475711418, "learning_rate": 0.0002975826230811156, "loss": 2.5469, "step": 2404 }, { "epoch": 1.5927152317880795, "grad_norm": 1.0808786751764354, "learning_rate": 0.00029757067350522374, "loss": 2.7031, "step": 2405 }, { "epoch": 1.5933774834437087, "grad_norm": 0.8428675741475674, "learning_rate": 0.00029755869470864685, "loss": 2.4062, "step": 2406 }, { "epoch": 1.5940397350993378, "grad_norm": 1.0010016669089024, "learning_rate": 0.0002975466866937566, "loss": 2.7812, "step": 2407 }, { "epoch": 1.5947019867549668, "grad_norm": 0.9569858602738587, "learning_rate": 0.0002975346494629308, "loss": 2.4531, "step": 2408 }, { "epoch": 1.5953642384105962, "grad_norm": 1.0268137863826086, "learning_rate": 0.0002975225830185531, "loss": 2.5469, "step": 2409 }, { "epoch": 1.596026490066225, "grad_norm": 0.9053607000346373, "learning_rate": 0.0002975104873630126, "loss": 2.4375, "step": 2410 }, { "epoch": 1.5966887417218543, "grad_norm": 1.0184609758465082, "learning_rate": 0.00029749836249870444, "loss": 2.5, "step": 2411 }, { "epoch": 1.5973509933774834, "grad_norm": 0.9042872727381593, "learning_rate": 0.00029748620842802946, "loss": 2.0781, "step": 2412 }, { "epoch": 1.5980132450331126, "grad_norm": 0.8591852312806941, "learning_rate": 0.0002974740251533944, "loss": 2.5, "step": 2413 }, { "epoch": 1.5986754966887418, "grad_norm": 0.934462510995766, "learning_rate": 0.0002974618126772116, "loss": 2.7656, "step": 2414 }, { "epoch": 1.5993377483443707, "grad_norm": 0.8730359068258811, "learning_rate": 0.0002974495710018993, "loss": 2.4375, "step": 2415 }, { "epoch": 1.6, "grad_norm": 0.9872681835620981, "learning_rate": 0.0002974373001298815, "loss": 2.4844, "step": 2416 }, { "epoch": 1.600662251655629, "grad_norm": 0.8919496174672944, "learning_rate": 0.00029742500006358795, "loss": 2.5, "step": 2417 }, { "epoch": 1.6013245033112584, "grad_norm": 0.9487790343811981, "learning_rate": 0.0002974126708054542, "loss": 2.3281, "step": 2418 }, { "epoch": 1.6019867549668874, "grad_norm": 0.8546058049536142, "learning_rate": 0.0002974003123579216, "loss": 2.5, "step": 2419 }, { "epoch": 1.6026490066225165, "grad_norm": 0.859949829192009, "learning_rate": 0.0002973879247234373, "loss": 2.5625, "step": 2420 }, { "epoch": 1.6033112582781457, "grad_norm": 0.7985351918342742, "learning_rate": 0.00029737550790445407, "loss": 2.0625, "step": 2421 }, { "epoch": 1.6039735099337749, "grad_norm": 0.8857349917786531, "learning_rate": 0.0002973630619034307, "loss": 2.3438, "step": 2422 }, { "epoch": 1.604635761589404, "grad_norm": 0.8285877575852945, "learning_rate": 0.00029735058672283166, "loss": 2.4844, "step": 2423 }, { "epoch": 1.605298013245033, "grad_norm": 0.9846045595173922, "learning_rate": 0.00029733808236512716, "loss": 2.5781, "step": 2424 }, { "epoch": 1.6059602649006623, "grad_norm": 0.9801182541446352, "learning_rate": 0.0002973255488327931, "loss": 2.4844, "step": 2425 }, { "epoch": 1.6066225165562913, "grad_norm": 0.9868258291042384, "learning_rate": 0.0002973129861283114, "loss": 2.6562, "step": 2426 }, { "epoch": 1.6072847682119207, "grad_norm": 0.8602606548908338, "learning_rate": 0.0002973003942541695, "loss": 2.5156, "step": 2427 }, { "epoch": 1.6079470198675496, "grad_norm": 0.924995734651, "learning_rate": 0.0002972877732128608, "loss": 2.4375, "step": 2428 }, { "epoch": 1.6086092715231788, "grad_norm": 0.9904999208179349, "learning_rate": 0.0002972751230068844, "loss": 2.5156, "step": 2429 }, { "epoch": 1.609271523178808, "grad_norm": 0.9204157229376017, "learning_rate": 0.0002972624436387452, "loss": 2.5938, "step": 2430 }, { "epoch": 1.6099337748344371, "grad_norm": 0.9287160542684919, "learning_rate": 0.0002972497351109539, "loss": 2.4531, "step": 2431 }, { "epoch": 1.6105960264900663, "grad_norm": 0.9651980681652861, "learning_rate": 0.00029723699742602685, "loss": 2.7031, "step": 2432 }, { "epoch": 1.6112582781456952, "grad_norm": 0.9202741001296313, "learning_rate": 0.0002972242305864863, "loss": 2.1094, "step": 2433 }, { "epoch": 1.6119205298013246, "grad_norm": 0.9875269864386703, "learning_rate": 0.0002972114345948602, "loss": 2.6562, "step": 2434 }, { "epoch": 1.6125827814569536, "grad_norm": 0.9186820809571713, "learning_rate": 0.00029719860945368233, "loss": 2.5781, "step": 2435 }, { "epoch": 1.613245033112583, "grad_norm": 0.8523000304138898, "learning_rate": 0.0002971857551654922, "loss": 2.2344, "step": 2436 }, { "epoch": 1.6139072847682119, "grad_norm": 0.956453137167526, "learning_rate": 0.0002971728717328352, "loss": 2.6406, "step": 2437 }, { "epoch": 1.614569536423841, "grad_norm": 0.9050015882918971, "learning_rate": 0.0002971599591582622, "loss": 2.7969, "step": 2438 }, { "epoch": 1.6152317880794702, "grad_norm": 0.8637140999603633, "learning_rate": 0.0002971470174443302, "loss": 2.4219, "step": 2439 }, { "epoch": 1.6158940397350994, "grad_norm": 0.9863673427417546, "learning_rate": 0.00029713404659360176, "loss": 2.4531, "step": 2440 }, { "epoch": 1.6165562913907285, "grad_norm": 0.8955056107363167, "learning_rate": 0.0002971210466086453, "loss": 2.5, "step": 2441 }, { "epoch": 1.6172185430463575, "grad_norm": 0.8879588175380839, "learning_rate": 0.0002971080174920348, "loss": 2.2188, "step": 2442 }, { "epoch": 1.6178807947019869, "grad_norm": 1.028000790993832, "learning_rate": 0.00029709495924635045, "loss": 2.4844, "step": 2443 }, { "epoch": 1.6185430463576158, "grad_norm": 0.8637714928622464, "learning_rate": 0.0002970818718741777, "loss": 2.3125, "step": 2444 }, { "epoch": 1.6192052980132452, "grad_norm": 0.8635138525369889, "learning_rate": 0.0002970687553781081, "loss": 2.0, "step": 2445 }, { "epoch": 1.6198675496688741, "grad_norm": 0.9575566593010487, "learning_rate": 0.0002970556097607388, "loss": 2.8125, "step": 2446 }, { "epoch": 1.6205298013245033, "grad_norm": 0.8954183814866251, "learning_rate": 0.00029704243502467293, "loss": 2.5, "step": 2447 }, { "epoch": 1.6211920529801325, "grad_norm": 0.9744444378487812, "learning_rate": 0.0002970292311725191, "loss": 2.7031, "step": 2448 }, { "epoch": 1.6218543046357616, "grad_norm": 0.8441910576985993, "learning_rate": 0.0002970159982068918, "loss": 2.3125, "step": 2449 }, { "epoch": 1.6225165562913908, "grad_norm": 0.8350917558959503, "learning_rate": 0.0002970027361304114, "loss": 2.1094, "step": 2450 }, { "epoch": 1.6231788079470197, "grad_norm": 0.9557612191133796, "learning_rate": 0.0002969894449457039, "loss": 2.5781, "step": 2451 }, { "epoch": 1.6238410596026491, "grad_norm": 0.8798993955553475, "learning_rate": 0.0002969761246554012, "loss": 2.4688, "step": 2452 }, { "epoch": 1.624503311258278, "grad_norm": 0.9249936981912198, "learning_rate": 0.00029696277526214073, "loss": 2.5625, "step": 2453 }, { "epoch": 1.6251655629139072, "grad_norm": 0.9011911893529734, "learning_rate": 0.0002969493967685659, "loss": 2.5156, "step": 2454 }, { "epoch": 1.6258278145695364, "grad_norm": 0.8126041197100745, "learning_rate": 0.0002969359891773258, "loss": 2.0469, "step": 2455 }, { "epoch": 1.6264900662251656, "grad_norm": 1.0140357982383312, "learning_rate": 0.00029692255249107527, "loss": 2.5781, "step": 2456 }, { "epoch": 1.6271523178807947, "grad_norm": 0.8957509478966947, "learning_rate": 0.0002969090867124749, "loss": 2.4688, "step": 2457 }, { "epoch": 1.627814569536424, "grad_norm": 0.8271319658991183, "learning_rate": 0.00029689559184419115, "loss": 2.3906, "step": 2458 }, { "epoch": 1.628476821192053, "grad_norm": 0.884732262071767, "learning_rate": 0.0002968820678888961, "loss": 2.2969, "step": 2459 }, { "epoch": 1.629139072847682, "grad_norm": 0.8943569482564087, "learning_rate": 0.00029686851484926765, "loss": 2.5312, "step": 2460 }, { "epoch": 1.6298013245033114, "grad_norm": 0.925214913149083, "learning_rate": 0.00029685493272798945, "loss": 2.4688, "step": 2461 }, { "epoch": 1.6304635761589403, "grad_norm": 0.9227541809368459, "learning_rate": 0.000296841321527751, "loss": 2.2812, "step": 2462 }, { "epoch": 1.6311258278145695, "grad_norm": 0.8909742394212812, "learning_rate": 0.00029682768125124736, "loss": 2.625, "step": 2463 }, { "epoch": 1.6317880794701987, "grad_norm": 0.8858356984496005, "learning_rate": 0.0002968140119011795, "loss": 2.2656, "step": 2464 }, { "epoch": 1.6324503311258278, "grad_norm": 1.006105203066916, "learning_rate": 0.00029680031348025413, "loss": 2.4688, "step": 2465 }, { "epoch": 1.633112582781457, "grad_norm": 0.8962316846275129, "learning_rate": 0.0002967865859911837, "loss": 2.3906, "step": 2466 }, { "epoch": 1.633774834437086, "grad_norm": 0.8756714418443138, "learning_rate": 0.0002967728294366863, "loss": 1.9844, "step": 2467 }, { "epoch": 1.6344370860927153, "grad_norm": 0.882409504515992, "learning_rate": 0.0002967590438194861, "loss": 2.3438, "step": 2468 }, { "epoch": 1.6350993377483443, "grad_norm": 1.0424492258171814, "learning_rate": 0.0002967452291423126, "loss": 2.4062, "step": 2469 }, { "epoch": 1.6357615894039736, "grad_norm": 0.9904991551817648, "learning_rate": 0.0002967313854079014, "loss": 2.6875, "step": 2470 }, { "epoch": 1.6364238410596026, "grad_norm": 0.8835217164067358, "learning_rate": 0.00029671751261899364, "loss": 2.4062, "step": 2471 }, { "epoch": 1.6370860927152318, "grad_norm": 0.8926008196475873, "learning_rate": 0.0002967036107783363, "loss": 2.2188, "step": 2472 }, { "epoch": 1.637748344370861, "grad_norm": 0.8575907785237484, "learning_rate": 0.00029668967988868213, "loss": 2.4062, "step": 2473 }, { "epoch": 1.63841059602649, "grad_norm": 0.9965891846674804, "learning_rate": 0.00029667571995278965, "loss": 2.2656, "step": 2474 }, { "epoch": 1.6390728476821192, "grad_norm": 0.9171955389767135, "learning_rate": 0.000296661730973423, "loss": 2.3281, "step": 2475 }, { "epoch": 1.6397350993377482, "grad_norm": 0.8940498241238249, "learning_rate": 0.0002966477129533522, "loss": 2.625, "step": 2476 }, { "epoch": 1.6403973509933776, "grad_norm": 1.0076348590343014, "learning_rate": 0.000296633665895353, "loss": 2.7031, "step": 2477 }, { "epoch": 1.6410596026490065, "grad_norm": 0.9670535478522838, "learning_rate": 0.00029661958980220683, "loss": 2.5781, "step": 2478 }, { "epoch": 1.641721854304636, "grad_norm": 0.9169111806738729, "learning_rate": 0.000296605484676701, "loss": 2.4844, "step": 2479 }, { "epoch": 1.6423841059602649, "grad_norm": 0.9061594441033957, "learning_rate": 0.0002965913505216283, "loss": 2.4844, "step": 2480 }, { "epoch": 1.643046357615894, "grad_norm": 0.8807957931037431, "learning_rate": 0.00029657718733978763, "loss": 2.7344, "step": 2481 }, { "epoch": 1.6437086092715232, "grad_norm": 0.9516746326153281, "learning_rate": 0.00029656299513398346, "loss": 2.4375, "step": 2482 }, { "epoch": 1.6443708609271523, "grad_norm": 0.885504056530061, "learning_rate": 0.0002965487739070259, "loss": 2.6562, "step": 2483 }, { "epoch": 1.6450331125827815, "grad_norm": 0.9207784940011139, "learning_rate": 0.00029653452366173105, "loss": 2.5, "step": 2484 }, { "epoch": 1.6456953642384105, "grad_norm": 0.8724305614966632, "learning_rate": 0.00029652024440092055, "loss": 2.4062, "step": 2485 }, { "epoch": 1.6463576158940398, "grad_norm": 0.9588223423779472, "learning_rate": 0.00029650593612742175, "loss": 2.0781, "step": 2486 }, { "epoch": 1.6470198675496688, "grad_norm": 0.7856607897252336, "learning_rate": 0.00029649159884406805, "loss": 2.0312, "step": 2487 }, { "epoch": 1.6476821192052982, "grad_norm": 0.9393493811563344, "learning_rate": 0.0002964772325536983, "loss": 2.5938, "step": 2488 }, { "epoch": 1.648344370860927, "grad_norm": 0.8342894854101043, "learning_rate": 0.00029646283725915724, "loss": 2.4531, "step": 2489 }, { "epoch": 1.6490066225165563, "grad_norm": 0.875487361416563, "learning_rate": 0.00029644841296329517, "loss": 2.2188, "step": 2490 }, { "epoch": 1.6496688741721854, "grad_norm": 0.8719427346322601, "learning_rate": 0.0002964339596689684, "loss": 2.3125, "step": 2491 }, { "epoch": 1.6503311258278146, "grad_norm": 0.7819528258197547, "learning_rate": 0.00029641947737903883, "loss": 2.2812, "step": 2492 }, { "epoch": 1.6509933774834438, "grad_norm": 0.9539859238537944, "learning_rate": 0.0002964049660963741, "loss": 2.5938, "step": 2493 }, { "epoch": 1.6516556291390727, "grad_norm": 0.8887884747205, "learning_rate": 0.00029639042582384754, "loss": 2.5938, "step": 2494 }, { "epoch": 1.652317880794702, "grad_norm": 0.7918565196093121, "learning_rate": 0.0002963758565643384, "loss": 2.1562, "step": 2495 }, { "epoch": 1.652980132450331, "grad_norm": 0.8455026625817008, "learning_rate": 0.0002963612583207315, "loss": 2.0156, "step": 2496 }, { "epoch": 1.6536423841059604, "grad_norm": 0.9137528803305601, "learning_rate": 0.00029634663109591745, "loss": 2.375, "step": 2497 }, { "epoch": 1.6543046357615894, "grad_norm": 1.0820947024216907, "learning_rate": 0.0002963319748927927, "loss": 2.3906, "step": 2498 }, { "epoch": 1.6549668874172185, "grad_norm": 0.8570299091131258, "learning_rate": 0.00029631728971425924, "loss": 2.3906, "step": 2499 }, { "epoch": 1.6556291390728477, "grad_norm": 0.9557455477475454, "learning_rate": 0.0002963025755632249, "loss": 2.5938, "step": 2500 }, { "epoch": 1.6562913907284769, "grad_norm": 0.962988600969181, "learning_rate": 0.0002962878324426034, "loss": 2.5938, "step": 2501 }, { "epoch": 1.656953642384106, "grad_norm": 0.9142053875905668, "learning_rate": 0.0002962730603553139, "loss": 2.3594, "step": 2502 }, { "epoch": 1.657615894039735, "grad_norm": 0.8832372421562507, "learning_rate": 0.0002962582593042814, "loss": 2.4688, "step": 2503 }, { "epoch": 1.6582781456953644, "grad_norm": 1.0021957273667554, "learning_rate": 0.00029624342929243685, "loss": 2.3594, "step": 2504 }, { "epoch": 1.6589403973509933, "grad_norm": 1.0082572135692691, "learning_rate": 0.0002962285703227167, "loss": 2.6094, "step": 2505 }, { "epoch": 1.6596026490066225, "grad_norm": 0.9189166003149819, "learning_rate": 0.00029621368239806306, "loss": 2.5938, "step": 2506 }, { "epoch": 1.6602649006622516, "grad_norm": 1.4262735074300428, "learning_rate": 0.0002961987655214241, "loss": 2.2344, "step": 2507 }, { "epoch": 1.6609271523178808, "grad_norm": 0.9321857571152382, "learning_rate": 0.00029618381969575343, "loss": 2.8125, "step": 2508 }, { "epoch": 1.66158940397351, "grad_norm": 0.9378131613914461, "learning_rate": 0.00029616884492401056, "loss": 2.6562, "step": 2509 }, { "epoch": 1.6622516556291391, "grad_norm": 0.9207033235213784, "learning_rate": 0.00029615384120916055, "loss": 2.6562, "step": 2510 }, { "epoch": 1.6629139072847683, "grad_norm": 0.9891487030342996, "learning_rate": 0.00029613880855417445, "loss": 2.4219, "step": 2511 }, { "epoch": 1.6635761589403972, "grad_norm": 0.9565369105191688, "learning_rate": 0.00029612374696202877, "loss": 2.5781, "step": 2512 }, { "epoch": 1.6642384105960266, "grad_norm": 1.0412300250695123, "learning_rate": 0.000296108656435706, "loss": 2.4219, "step": 2513 }, { "epoch": 1.6649006622516556, "grad_norm": 0.812507220229497, "learning_rate": 0.00029609353697819415, "loss": 2.25, "step": 2514 }, { "epoch": 1.6655629139072847, "grad_norm": 0.9664473875187014, "learning_rate": 0.00029607838859248704, "loss": 2.5469, "step": 2515 }, { "epoch": 1.666225165562914, "grad_norm": 0.9288071014102777, "learning_rate": 0.00029606321128158424, "loss": 2.4531, "step": 2516 }, { "epoch": 1.666887417218543, "grad_norm": 0.8402553885271143, "learning_rate": 0.00029604800504849107, "loss": 2.3438, "step": 2517 }, { "epoch": 1.6675496688741722, "grad_norm": 0.9270230842027883, "learning_rate": 0.00029603276989621856, "loss": 2.5469, "step": 2518 }, { "epoch": 1.6682119205298012, "grad_norm": 0.7845918121844384, "learning_rate": 0.0002960175058277834, "loss": 2.25, "step": 2519 }, { "epoch": 1.6688741721854305, "grad_norm": 1.1314153668884515, "learning_rate": 0.00029600221284620796, "loss": 2.5, "step": 2520 }, { "epoch": 1.6695364238410595, "grad_norm": 0.9142033066283387, "learning_rate": 0.0002959868909545206, "loss": 2.3906, "step": 2521 }, { "epoch": 1.6701986754966889, "grad_norm": 0.7461381558359854, "learning_rate": 0.0002959715401557551, "loss": 2.3906, "step": 2522 }, { "epoch": 1.6708609271523178, "grad_norm": 0.8737013975995181, "learning_rate": 0.00029595616045295116, "loss": 2.4531, "step": 2523 }, { "epoch": 1.671523178807947, "grad_norm": 1.0407656323481111, "learning_rate": 0.00029594075184915413, "loss": 2.7344, "step": 2524 }, { "epoch": 1.6721854304635762, "grad_norm": 0.9087627064755572, "learning_rate": 0.00029592531434741507, "loss": 2.3281, "step": 2525 }, { "epoch": 1.6728476821192053, "grad_norm": 0.9507041330774911, "learning_rate": 0.00029590984795079076, "loss": 2.7031, "step": 2526 }, { "epoch": 1.6735099337748345, "grad_norm": 0.8800413602214542, "learning_rate": 0.00029589435266234377, "loss": 2.3281, "step": 2527 }, { "epoch": 1.6741721854304634, "grad_norm": 0.9529742091132241, "learning_rate": 0.0002958788284851424, "loss": 2.4062, "step": 2528 }, { "epoch": 1.6748344370860928, "grad_norm": 0.8583137509860099, "learning_rate": 0.00029586327542226046, "loss": 2.6406, "step": 2529 }, { "epoch": 1.6754966887417218, "grad_norm": 0.8543336615175251, "learning_rate": 0.0002958476934767777, "loss": 2.4531, "step": 2530 }, { "epoch": 1.6761589403973511, "grad_norm": 0.8105857849733314, "learning_rate": 0.0002958320826517796, "loss": 2.4844, "step": 2531 }, { "epoch": 1.67682119205298, "grad_norm": 0.8728057328419205, "learning_rate": 0.0002958164429503572, "loss": 2.375, "step": 2532 }, { "epoch": 1.6774834437086092, "grad_norm": 0.9172841279358025, "learning_rate": 0.00029580077437560735, "loss": 2.5781, "step": 2533 }, { "epoch": 1.6781456953642384, "grad_norm": 0.9290784498546605, "learning_rate": 0.00029578507693063267, "loss": 2.375, "step": 2534 }, { "epoch": 1.6788079470198676, "grad_norm": 0.828707997520692, "learning_rate": 0.00029576935061854134, "loss": 2.1094, "step": 2535 }, { "epoch": 1.6794701986754967, "grad_norm": 0.8279893603198488, "learning_rate": 0.0002957535954424474, "loss": 2.5469, "step": 2536 }, { "epoch": 1.6801324503311257, "grad_norm": 0.9090676639471604, "learning_rate": 0.00029573781140547055, "loss": 2.375, "step": 2537 }, { "epoch": 1.680794701986755, "grad_norm": 1.0400049925891237, "learning_rate": 0.00029572199851073626, "loss": 2.4062, "step": 2538 }, { "epoch": 1.681456953642384, "grad_norm": 0.8678711591981842, "learning_rate": 0.00029570615676137555, "loss": 2.5312, "step": 2539 }, { "epoch": 1.6821192052980134, "grad_norm": 0.8957696337410335, "learning_rate": 0.0002956902861605254, "loss": 2.4219, "step": 2540 }, { "epoch": 1.6827814569536423, "grad_norm": 0.8155419036151653, "learning_rate": 0.00029567438671132827, "loss": 2.2031, "step": 2541 }, { "epoch": 1.6834437086092715, "grad_norm": 0.887347998499952, "learning_rate": 0.0002956584584169325, "loss": 2.5312, "step": 2542 }, { "epoch": 1.6841059602649007, "grad_norm": 0.8282048220679504, "learning_rate": 0.000295642501280492, "loss": 2.3594, "step": 2543 }, { "epoch": 1.6847682119205298, "grad_norm": 2.3942236716724477, "learning_rate": 0.00029562651530516654, "loss": 2.3906, "step": 2544 }, { "epoch": 1.685430463576159, "grad_norm": 0.7996151923135151, "learning_rate": 0.0002956105004941215, "loss": 2.4062, "step": 2545 }, { "epoch": 1.686092715231788, "grad_norm": 0.8299602241433127, "learning_rate": 0.00029559445685052794, "loss": 2.4531, "step": 2546 }, { "epoch": 1.6867549668874173, "grad_norm": 1.4081350342925683, "learning_rate": 0.0002955783843775628, "loss": 2.4531, "step": 2547 }, { "epoch": 1.6874172185430463, "grad_norm": 0.9081043676704833, "learning_rate": 0.0002955622830784085, "loss": 2.5312, "step": 2548 }, { "epoch": 1.6880794701986757, "grad_norm": 0.9831379797706636, "learning_rate": 0.00029554615295625334, "loss": 2.6719, "step": 2549 }, { "epoch": 1.6887417218543046, "grad_norm": 0.8479943026568344, "learning_rate": 0.0002955299940142913, "loss": 2.5156, "step": 2550 }, { "epoch": 1.6894039735099338, "grad_norm": 0.9874858672005594, "learning_rate": 0.000295513806255722, "loss": 2.5781, "step": 2551 }, { "epoch": 1.690066225165563, "grad_norm": 0.9233420361431135, "learning_rate": 0.00029549758968375076, "loss": 2.6406, "step": 2552 }, { "epoch": 1.690728476821192, "grad_norm": 0.8932505653663981, "learning_rate": 0.0002954813443015887, "loss": 2.6406, "step": 2553 }, { "epoch": 1.6913907284768213, "grad_norm": 0.8876311546012673, "learning_rate": 0.0002954650701124526, "loss": 2.3906, "step": 2554 }, { "epoch": 1.6920529801324502, "grad_norm": 0.852746015599343, "learning_rate": 0.0002954487671195649, "loss": 2.5312, "step": 2555 }, { "epoch": 1.6927152317880796, "grad_norm": 0.9468494986205876, "learning_rate": 0.0002954324353261538, "loss": 2.375, "step": 2556 }, { "epoch": 1.6933774834437085, "grad_norm": 0.813245238450774, "learning_rate": 0.00029541607473545313, "loss": 2.5156, "step": 2557 }, { "epoch": 1.6940397350993377, "grad_norm": 0.8858025214658736, "learning_rate": 0.0002953996853507026, "loss": 2.4688, "step": 2558 }, { "epoch": 1.6947019867549669, "grad_norm": 0.8959918089730343, "learning_rate": 0.0002953832671751474, "loss": 2.2812, "step": 2559 }, { "epoch": 1.695364238410596, "grad_norm": 0.8531950629710732, "learning_rate": 0.00029536682021203846, "loss": 2.375, "step": 2560 }, { "epoch": 1.6960264900662252, "grad_norm": 0.8729907720406065, "learning_rate": 0.00029535034446463265, "loss": 2.2031, "step": 2561 }, { "epoch": 1.6966887417218544, "grad_norm": 0.8239848429446279, "learning_rate": 0.00029533383993619216, "loss": 2.2031, "step": 2562 }, { "epoch": 1.6973509933774835, "grad_norm": 0.8489352105649699, "learning_rate": 0.0002953173066299852, "loss": 1.9062, "step": 2563 }, { "epoch": 1.6980132450331125, "grad_norm": 1.0663965885204487, "learning_rate": 0.00029530074454928553, "loss": 2.6094, "step": 2564 }, { "epoch": 1.6986754966887418, "grad_norm": 0.8208280592398177, "learning_rate": 0.00029528415369737263, "loss": 2.5156, "step": 2565 }, { "epoch": 1.6993377483443708, "grad_norm": 0.9004009129093324, "learning_rate": 0.0002952675340775316, "loss": 2.5469, "step": 2566 }, { "epoch": 1.7, "grad_norm": 0.9865877868444369, "learning_rate": 0.00029525088569305345, "loss": 2.3906, "step": 2567 }, { "epoch": 1.7006622516556291, "grad_norm": 0.9815568120361647, "learning_rate": 0.00029523420854723467, "loss": 2.5781, "step": 2568 }, { "epoch": 1.7013245033112583, "grad_norm": 0.8749133725247288, "learning_rate": 0.0002952175026433775, "loss": 2.1094, "step": 2569 }, { "epoch": 1.7019867549668874, "grad_norm": 0.8185752254637615, "learning_rate": 0.00029520076798479004, "loss": 2.3906, "step": 2570 }, { "epoch": 1.7026490066225164, "grad_norm": 0.8725437517479909, "learning_rate": 0.0002951840045747858, "loss": 2.7031, "step": 2571 }, { "epoch": 1.7033112582781458, "grad_norm": 1.0448924896854326, "learning_rate": 0.0002951672124166841, "loss": 2.5469, "step": 2572 }, { "epoch": 1.7039735099337747, "grad_norm": 0.9038091239346047, "learning_rate": 0.00029515039151381013, "loss": 2.4375, "step": 2573 }, { "epoch": 1.704635761589404, "grad_norm": 0.8547174134056884, "learning_rate": 0.0002951335418694946, "loss": 2.3594, "step": 2574 }, { "epoch": 1.705298013245033, "grad_norm": 0.9438213909168077, "learning_rate": 0.00029511666348707377, "loss": 2.625, "step": 2575 }, { "epoch": 1.7059602649006622, "grad_norm": 0.9022703357036153, "learning_rate": 0.0002950997563698899, "loss": 2.6719, "step": 2576 }, { "epoch": 1.7066225165562914, "grad_norm": 0.8758259625307955, "learning_rate": 0.0002950828205212908, "loss": 2.4688, "step": 2577 }, { "epoch": 1.7072847682119205, "grad_norm": 0.7597435384810322, "learning_rate": 0.00029506585594462985, "loss": 2.0, "step": 2578 }, { "epoch": 1.7079470198675497, "grad_norm": 0.9595066863042475, "learning_rate": 0.00029504886264326635, "loss": 2.3438, "step": 2579 }, { "epoch": 1.7086092715231787, "grad_norm": 0.8278902823425804, "learning_rate": 0.00029503184062056503, "loss": 2.4375, "step": 2580 }, { "epoch": 1.709271523178808, "grad_norm": 0.9532073703761917, "learning_rate": 0.0002950147898798966, "loss": 2.4844, "step": 2581 }, { "epoch": 1.709933774834437, "grad_norm": 0.9130402306698728, "learning_rate": 0.0002949977104246372, "loss": 2.4375, "step": 2582 }, { "epoch": 1.7105960264900664, "grad_norm": 0.8643657542637402, "learning_rate": 0.0002949806022581688, "loss": 2.5, "step": 2583 }, { "epoch": 1.7112582781456953, "grad_norm": 0.869608509106683, "learning_rate": 0.00029496346538387903, "loss": 2.125, "step": 2584 }, { "epoch": 1.7119205298013245, "grad_norm": 0.9011635701158911, "learning_rate": 0.00029494629980516114, "loss": 2.7344, "step": 2585 }, { "epoch": 1.7125827814569536, "grad_norm": 0.8917203130369887, "learning_rate": 0.0002949291055254142, "loss": 2.1406, "step": 2586 }, { "epoch": 1.7132450331125828, "grad_norm": 0.815564886098106, "learning_rate": 0.0002949118825480427, "loss": 2.3125, "step": 2587 }, { "epoch": 1.713907284768212, "grad_norm": 0.8388413079792099, "learning_rate": 0.00029489463087645717, "loss": 2.0938, "step": 2588 }, { "epoch": 1.714569536423841, "grad_norm": 0.8938948816351111, "learning_rate": 0.0002948773505140735, "loss": 2.5156, "step": 2589 }, { "epoch": 1.7152317880794703, "grad_norm": 1.4641219330235307, "learning_rate": 0.0002948600414643135, "loss": 2.5469, "step": 2590 }, { "epoch": 1.7158940397350992, "grad_norm": 0.9428062940583675, "learning_rate": 0.00029484270373060457, "loss": 2.5156, "step": 2591 }, { "epoch": 1.7165562913907286, "grad_norm": 1.0349241727680316, "learning_rate": 0.00029482533731637966, "loss": 2.5469, "step": 2592 }, { "epoch": 1.7172185430463576, "grad_norm": 0.924445677813016, "learning_rate": 0.0002948079422250776, "loss": 2.4531, "step": 2593 }, { "epoch": 1.7178807947019867, "grad_norm": 0.8739211016556457, "learning_rate": 0.0002947905184601429, "loss": 2.625, "step": 2594 }, { "epoch": 1.718543046357616, "grad_norm": 0.905093205886452, "learning_rate": 0.00029477306602502547, "loss": 2.3125, "step": 2595 }, { "epoch": 1.719205298013245, "grad_norm": 1.1800893862975632, "learning_rate": 0.0002947555849231813, "loss": 2.5625, "step": 2596 }, { "epoch": 1.7198675496688742, "grad_norm": 0.876992072911533, "learning_rate": 0.0002947380751580716, "loss": 2.6875, "step": 2597 }, { "epoch": 1.7205298013245032, "grad_norm": 0.9045128789329269, "learning_rate": 0.00029472053673316377, "loss": 2.4531, "step": 2598 }, { "epoch": 1.7211920529801326, "grad_norm": 1.0076668952204506, "learning_rate": 0.00029470296965193046, "loss": 2.5312, "step": 2599 }, { "epoch": 1.7218543046357615, "grad_norm": 0.8658080755026943, "learning_rate": 0.00029468537391785023, "loss": 2.1094, "step": 2600 }, { "epoch": 1.7225165562913909, "grad_norm": 0.9112029606928851, "learning_rate": 0.00029466774953440717, "loss": 2.2656, "step": 2601 }, { "epoch": 1.7231788079470198, "grad_norm": 0.9508769476165537, "learning_rate": 0.0002946500965050911, "loss": 2.5469, "step": 2602 }, { "epoch": 1.723841059602649, "grad_norm": 0.8880323273655687, "learning_rate": 0.00029463241483339763, "loss": 2.4844, "step": 2603 }, { "epoch": 1.7245033112582782, "grad_norm": 1.0959616783922201, "learning_rate": 0.0002946147045228278, "loss": 2.5312, "step": 2604 }, { "epoch": 1.7251655629139073, "grad_norm": 0.8380873258396921, "learning_rate": 0.00029459696557688857, "loss": 2.2969, "step": 2605 }, { "epoch": 1.7258278145695365, "grad_norm": 0.9496119050145493, "learning_rate": 0.00029457919799909244, "loss": 2.5625, "step": 2606 }, { "epoch": 1.7264900662251654, "grad_norm": 0.8997829824758405, "learning_rate": 0.0002945614017929575, "loss": 2.5469, "step": 2607 }, { "epoch": 1.7271523178807948, "grad_norm": 0.8013575242660411, "learning_rate": 0.00029454357696200766, "loss": 2.5156, "step": 2608 }, { "epoch": 1.7278145695364238, "grad_norm": 0.7601464676044001, "learning_rate": 0.00029452572350977246, "loss": 2.2812, "step": 2609 }, { "epoch": 1.728476821192053, "grad_norm": 0.8357018195397167, "learning_rate": 0.00029450784143978705, "loss": 2.2812, "step": 2610 }, { "epoch": 1.729139072847682, "grad_norm": 0.8779253565084463, "learning_rate": 0.0002944899307555924, "loss": 2.4219, "step": 2611 }, { "epoch": 1.7298013245033113, "grad_norm": 0.8128595901308762, "learning_rate": 0.00029447199146073483, "loss": 2.2344, "step": 2612 }, { "epoch": 1.7304635761589404, "grad_norm": 0.8914357203209446, "learning_rate": 0.0002944540235587667, "loss": 2.5, "step": 2613 }, { "epoch": 1.7311258278145696, "grad_norm": 0.7938264710319881, "learning_rate": 0.00029443602705324574, "loss": 2.0781, "step": 2614 }, { "epoch": 1.7317880794701987, "grad_norm": 0.9561369119063774, "learning_rate": 0.00029441800194773555, "loss": 2.7031, "step": 2615 }, { "epoch": 1.7324503311258277, "grad_norm": 0.7912879176153828, "learning_rate": 0.00029439994824580527, "loss": 2.0469, "step": 2616 }, { "epoch": 1.733112582781457, "grad_norm": 0.9485230747633093, "learning_rate": 0.0002943818659510297, "loss": 2.375, "step": 2617 }, { "epoch": 1.733774834437086, "grad_norm": 0.8938223942657839, "learning_rate": 0.00029436375506698947, "loss": 2.5625, "step": 2618 }, { "epoch": 1.7344370860927152, "grad_norm": 0.8014321293308624, "learning_rate": 0.0002943456155972706, "loss": 2.5, "step": 2619 }, { "epoch": 1.7350993377483444, "grad_norm": 0.776173143016004, "learning_rate": 0.000294327447545465, "loss": 2.4844, "step": 2620 }, { "epoch": 1.7357615894039735, "grad_norm": 0.8492281761739533, "learning_rate": 0.00029430925091517013, "loss": 2.5781, "step": 2621 }, { "epoch": 1.7364238410596027, "grad_norm": 0.9099304754853278, "learning_rate": 0.0002942910257099891, "loss": 2.125, "step": 2622 }, { "epoch": 1.7370860927152316, "grad_norm": 0.8052824648123044, "learning_rate": 0.00029427277193353076, "loss": 2.4688, "step": 2623 }, { "epoch": 1.737748344370861, "grad_norm": 0.851870556251229, "learning_rate": 0.0002942544895894096, "loss": 2.5469, "step": 2624 }, { "epoch": 1.73841059602649, "grad_norm": 0.913991095015386, "learning_rate": 0.0002942361786812456, "loss": 2.4219, "step": 2625 }, { "epoch": 1.7390728476821193, "grad_norm": 0.9821603424791842, "learning_rate": 0.0002942178392126646, "loss": 2.5938, "step": 2626 }, { "epoch": 1.7397350993377483, "grad_norm": 0.7811066994666566, "learning_rate": 0.00029419947118729803, "loss": 2.3125, "step": 2627 }, { "epoch": 1.7403973509933774, "grad_norm": 0.8660721352089412, "learning_rate": 0.000294181074608783, "loss": 2.4844, "step": 2628 }, { "epoch": 1.7410596026490066, "grad_norm": 0.8990310938208633, "learning_rate": 0.0002941626494807622, "loss": 2.4844, "step": 2629 }, { "epoch": 1.7417218543046358, "grad_norm": 0.8567746170510693, "learning_rate": 0.0002941441958068841, "loss": 2.6562, "step": 2630 }, { "epoch": 1.742384105960265, "grad_norm": 0.8568844723559527, "learning_rate": 0.00029412571359080256, "loss": 2.3594, "step": 2631 }, { "epoch": 1.7430463576158939, "grad_norm": 0.8932848395155568, "learning_rate": 0.0002941072028361775, "loss": 2.4219, "step": 2632 }, { "epoch": 1.7437086092715233, "grad_norm": 0.9534666479369729, "learning_rate": 0.00029408866354667406, "loss": 2.6875, "step": 2633 }, { "epoch": 1.7443708609271522, "grad_norm": 0.8953637745281912, "learning_rate": 0.0002940700957259633, "loss": 2.5312, "step": 2634 }, { "epoch": 1.7450331125827816, "grad_norm": 0.8451000041389838, "learning_rate": 0.00029405149937772193, "loss": 2.5156, "step": 2635 }, { "epoch": 1.7456953642384105, "grad_norm": 0.8940103418471532, "learning_rate": 0.00029403287450563214, "loss": 2.5469, "step": 2636 }, { "epoch": 1.7463576158940397, "grad_norm": 0.9426672271862322, "learning_rate": 0.000294014221113382, "loss": 2.4531, "step": 2637 }, { "epoch": 1.7470198675496689, "grad_norm": 0.9385047914825806, "learning_rate": 0.00029399553920466493, "loss": 2.6406, "step": 2638 }, { "epoch": 1.747682119205298, "grad_norm": 0.9116378443931781, "learning_rate": 0.00029397682878318026, "loss": 2.4062, "step": 2639 }, { "epoch": 1.7483443708609272, "grad_norm": 0.8516199524262862, "learning_rate": 0.00029395808985263286, "loss": 2.1719, "step": 2640 }, { "epoch": 1.7490066225165561, "grad_norm": 0.8570796980818687, "learning_rate": 0.00029393932241673324, "loss": 2.6406, "step": 2641 }, { "epoch": 1.7496688741721855, "grad_norm": 0.8343818904281359, "learning_rate": 0.00029392052647919756, "loss": 2.2969, "step": 2642 }, { "epoch": 1.7503311258278145, "grad_norm": 0.8471568027021015, "learning_rate": 0.00029390170204374763, "loss": 2.2812, "step": 2643 }, { "epoch": 1.7509933774834439, "grad_norm": 0.7638677553252173, "learning_rate": 0.000293882849114111, "loss": 1.9766, "step": 2644 }, { "epoch": 1.7516556291390728, "grad_norm": 0.9641437644247945, "learning_rate": 0.00029386396769402057, "loss": 2.3906, "step": 2645 }, { "epoch": 1.752317880794702, "grad_norm": 0.7798963545236832, "learning_rate": 0.0002938450577872153, "loss": 1.9219, "step": 2646 }, { "epoch": 1.7529801324503311, "grad_norm": 0.8140688030377, "learning_rate": 0.0002938261193974394, "loss": 2.3594, "step": 2647 }, { "epoch": 1.7536423841059603, "grad_norm": 0.8572115492263485, "learning_rate": 0.00029380715252844293, "loss": 2.5625, "step": 2648 }, { "epoch": 1.7543046357615895, "grad_norm": 0.9073779247739722, "learning_rate": 0.00029378815718398156, "loss": 2.5625, "step": 2649 }, { "epoch": 1.7549668874172184, "grad_norm": 0.8386030844272973, "learning_rate": 0.00029376913336781667, "loss": 2.6562, "step": 2650 }, { "epoch": 1.7556291390728478, "grad_norm": 0.7525949989711908, "learning_rate": 0.000293750081083715, "loss": 2.3281, "step": 2651 }, { "epoch": 1.7562913907284767, "grad_norm": 0.8514203681062066, "learning_rate": 0.0002937310003354493, "loss": 2.4062, "step": 2652 }, { "epoch": 1.7569536423841061, "grad_norm": 0.8720789337643873, "learning_rate": 0.00029371189112679767, "loss": 2.4062, "step": 2653 }, { "epoch": 1.757615894039735, "grad_norm": 0.8443915840001723, "learning_rate": 0.0002936927534615441, "loss": 2.4219, "step": 2654 }, { "epoch": 1.7582781456953642, "grad_norm": 0.9770159526010178, "learning_rate": 0.00029367358734347783, "loss": 2.6719, "step": 2655 }, { "epoch": 1.7589403973509934, "grad_norm": 0.9070314453618926, "learning_rate": 0.00029365439277639416, "loss": 2.4219, "step": 2656 }, { "epoch": 1.7596026490066226, "grad_norm": 0.8394463706631946, "learning_rate": 0.0002936351697640938, "loss": 2.0469, "step": 2657 }, { "epoch": 1.7602649006622517, "grad_norm": 0.8333935428662533, "learning_rate": 0.0002936159183103831, "loss": 2.4531, "step": 2658 }, { "epoch": 1.7609271523178807, "grad_norm": 0.8649218055828674, "learning_rate": 0.00029359663841907403, "loss": 2.1406, "step": 2659 }, { "epoch": 1.76158940397351, "grad_norm": 0.8941759011656215, "learning_rate": 0.00029357733009398426, "loss": 2.4062, "step": 2660 }, { "epoch": 1.762251655629139, "grad_norm": 1.0074338571514996, "learning_rate": 0.0002935579933389371, "loss": 2.5, "step": 2661 }, { "epoch": 1.7629139072847684, "grad_norm": 0.9041812272556902, "learning_rate": 0.00029353862815776146, "loss": 2.1094, "step": 2662 }, { "epoch": 1.7635761589403973, "grad_norm": 0.8879486594626026, "learning_rate": 0.0002935192345542918, "loss": 2.3594, "step": 2663 }, { "epoch": 1.7642384105960265, "grad_norm": 0.868098083827403, "learning_rate": 0.0002934998125323683, "loss": 2.3594, "step": 2664 }, { "epoch": 1.7649006622516556, "grad_norm": 0.9695870541360891, "learning_rate": 0.00029348036209583676, "loss": 2.5781, "step": 2665 }, { "epoch": 1.7655629139072848, "grad_norm": 0.8948715250795777, "learning_rate": 0.0002934608832485485, "loss": 2.4531, "step": 2666 }, { "epoch": 1.766225165562914, "grad_norm": 0.8598146439782913, "learning_rate": 0.00029344137599436067, "loss": 2.7188, "step": 2667 }, { "epoch": 1.766887417218543, "grad_norm": 0.8241243856675451, "learning_rate": 0.00029342184033713596, "loss": 2.4375, "step": 2668 }, { "epoch": 1.7675496688741723, "grad_norm": 0.8831663029574784, "learning_rate": 0.00029340227628074254, "loss": 2.5625, "step": 2669 }, { "epoch": 1.7682119205298013, "grad_norm": 0.853652526509957, "learning_rate": 0.0002933826838290543, "loss": 2.25, "step": 2670 }, { "epoch": 1.7688741721854304, "grad_norm": 0.8797026123943295, "learning_rate": 0.0002933630629859509, "loss": 2.7188, "step": 2671 }, { "epoch": 1.7695364238410596, "grad_norm": 0.8237761136968011, "learning_rate": 0.00029334341375531744, "loss": 2.5469, "step": 2672 }, { "epoch": 1.7701986754966887, "grad_norm": 0.9808214391095679, "learning_rate": 0.00029332373614104467, "loss": 2.3125, "step": 2673 }, { "epoch": 1.770860927152318, "grad_norm": 0.8866948571482459, "learning_rate": 0.00029330403014702895, "loss": 2.2969, "step": 2674 }, { "epoch": 1.771523178807947, "grad_norm": 0.8469823823469748, "learning_rate": 0.0002932842957771724, "loss": 2.375, "step": 2675 }, { "epoch": 1.7721854304635762, "grad_norm": 0.7621893699495792, "learning_rate": 0.0002932645330353825, "loss": 1.8047, "step": 2676 }, { "epoch": 1.7728476821192052, "grad_norm": 0.8964418884166597, "learning_rate": 0.00029324474192557264, "loss": 2.5156, "step": 2677 }, { "epoch": 1.7735099337748346, "grad_norm": 0.9008802138384604, "learning_rate": 0.00029322492245166163, "loss": 2.4219, "step": 2678 }, { "epoch": 1.7741721854304635, "grad_norm": 0.9406056613369347, "learning_rate": 0.00029320507461757394, "loss": 2.5312, "step": 2679 }, { "epoch": 1.7748344370860927, "grad_norm": 0.8856409432493761, "learning_rate": 0.0002931851984272397, "loss": 2.4375, "step": 2680 }, { "epoch": 1.7754966887417218, "grad_norm": 0.7864502570816322, "learning_rate": 0.0002931652938845946, "loss": 1.9922, "step": 2681 }, { "epoch": 1.776158940397351, "grad_norm": 0.8648185757329448, "learning_rate": 0.00029314536099357994, "loss": 2.5625, "step": 2682 }, { "epoch": 1.7768211920529802, "grad_norm": 0.8730847341985265, "learning_rate": 0.00029312539975814275, "loss": 2.4688, "step": 2683 }, { "epoch": 1.7774834437086091, "grad_norm": 0.9300643598384762, "learning_rate": 0.0002931054101822355, "loss": 2.2969, "step": 2684 }, { "epoch": 1.7781456953642385, "grad_norm": 1.0270446961806259, "learning_rate": 0.0002930853922698164, "loss": 2.625, "step": 2685 }, { "epoch": 1.7788079470198674, "grad_norm": 0.8484477793427707, "learning_rate": 0.0002930653460248492, "loss": 2.4688, "step": 2686 }, { "epoch": 1.7794701986754968, "grad_norm": 0.8728264532974931, "learning_rate": 0.0002930452714513033, "loss": 2.5312, "step": 2687 }, { "epoch": 1.7801324503311258, "grad_norm": 0.896251269940751, "learning_rate": 0.0002930251685531537, "loss": 2.6719, "step": 2688 }, { "epoch": 1.780794701986755, "grad_norm": 0.8964313104850266, "learning_rate": 0.00029300503733438097, "loss": 2.6875, "step": 2689 }, { "epoch": 1.781456953642384, "grad_norm": 0.923668422300788, "learning_rate": 0.00029298487779897135, "loss": 2.3438, "step": 2690 }, { "epoch": 1.7821192052980133, "grad_norm": 0.9455761584563044, "learning_rate": 0.00029296468995091665, "loss": 2.7969, "step": 2691 }, { "epoch": 1.7827814569536424, "grad_norm": 0.7411616804629328, "learning_rate": 0.0002929444737942143, "loss": 2.375, "step": 2692 }, { "epoch": 1.7834437086092714, "grad_norm": 0.8047181363746244, "learning_rate": 0.0002929242293328674, "loss": 1.9844, "step": 2693 }, { "epoch": 1.7841059602649008, "grad_norm": 0.8629123621638246, "learning_rate": 0.00029290395657088447, "loss": 2.3906, "step": 2694 }, { "epoch": 1.7847682119205297, "grad_norm": 1.0018639065643204, "learning_rate": 0.0002928836555122798, "loss": 2.375, "step": 2695 }, { "epoch": 1.785430463576159, "grad_norm": 0.8634908173056695, "learning_rate": 0.00029286332616107317, "loss": 2.5312, "step": 2696 }, { "epoch": 1.786092715231788, "grad_norm": 0.7548402617398009, "learning_rate": 0.0002928429685212902, "loss": 1.9453, "step": 2697 }, { "epoch": 1.7867549668874172, "grad_norm": 0.8374726248547408, "learning_rate": 0.0002928225825969618, "loss": 2.4062, "step": 2698 }, { "epoch": 1.7874172185430464, "grad_norm": 0.9762234618447262, "learning_rate": 0.00029280216839212456, "loss": 2.6562, "step": 2699 }, { "epoch": 1.7880794701986755, "grad_norm": 0.8263970942745864, "learning_rate": 0.00029278172591082086, "loss": 2.4219, "step": 2700 }, { "epoch": 1.7887417218543047, "grad_norm": 0.8033080088515382, "learning_rate": 0.0002927612551570985, "loss": 2.4844, "step": 2701 }, { "epoch": 1.7894039735099336, "grad_norm": 0.8995662993787829, "learning_rate": 0.00029274075613501093, "loss": 2.5312, "step": 2702 }, { "epoch": 1.790066225165563, "grad_norm": 0.7901400735653719, "learning_rate": 0.0002927202288486171, "loss": 1.9688, "step": 2703 }, { "epoch": 1.790728476821192, "grad_norm": 0.8241622085419643, "learning_rate": 0.00029269967330198177, "loss": 2.6875, "step": 2704 }, { "epoch": 1.7913907284768213, "grad_norm": 0.841941526994542, "learning_rate": 0.00029267908949917517, "loss": 2.5625, "step": 2705 }, { "epoch": 1.7920529801324503, "grad_norm": 0.9325690770359905, "learning_rate": 0.00029265847744427303, "loss": 2.0938, "step": 2706 }, { "epoch": 1.7927152317880795, "grad_norm": 0.9036236428832487, "learning_rate": 0.0002926378371413568, "loss": 2.5312, "step": 2707 }, { "epoch": 1.7933774834437086, "grad_norm": 0.8048091259406812, "learning_rate": 0.0002926171685945136, "loss": 2.2188, "step": 2708 }, { "epoch": 1.7940397350993378, "grad_norm": 0.8784277433348321, "learning_rate": 0.0002925964718078359, "loss": 2.6875, "step": 2709 }, { "epoch": 1.794701986754967, "grad_norm": 0.7946439670008398, "learning_rate": 0.000292575746785422, "loss": 2.3438, "step": 2710 }, { "epoch": 1.795364238410596, "grad_norm": 0.9717756382285718, "learning_rate": 0.0002925549935313757, "loss": 2.7344, "step": 2711 }, { "epoch": 1.7960264900662253, "grad_norm": 0.8252984118908527, "learning_rate": 0.0002925342120498063, "loss": 2.6562, "step": 2712 }, { "epoch": 1.7966887417218542, "grad_norm": 0.7426479975844986, "learning_rate": 0.0002925134023448288, "loss": 2.2344, "step": 2713 }, { "epoch": 1.7973509933774836, "grad_norm": 0.8283589163709337, "learning_rate": 0.0002924925644205638, "loss": 2.6094, "step": 2714 }, { "epoch": 1.7980132450331126, "grad_norm": 0.9166237132175525, "learning_rate": 0.0002924716982811374, "loss": 2.2812, "step": 2715 }, { "epoch": 1.7986754966887417, "grad_norm": 0.7898337290824426, "learning_rate": 0.0002924508039306814, "loss": 2.4375, "step": 2716 }, { "epoch": 1.7993377483443709, "grad_norm": 0.816796393783639, "learning_rate": 0.000292429881373333, "loss": 2.4219, "step": 2717 }, { "epoch": 1.8, "grad_norm": 0.8725703709079965, "learning_rate": 0.00029240893061323526, "loss": 2.6094, "step": 2718 }, { "epoch": 1.8006622516556292, "grad_norm": 0.8162546934214674, "learning_rate": 0.00029238795165453655, "loss": 2.6406, "step": 2719 }, { "epoch": 1.8013245033112582, "grad_norm": 0.8564171661760277, "learning_rate": 0.000292366944501391, "loss": 2.375, "step": 2720 }, { "epoch": 1.8019867549668875, "grad_norm": 0.7311155492659066, "learning_rate": 0.00029234590915795823, "loss": 2.3125, "step": 2721 }, { "epoch": 1.8026490066225165, "grad_norm": 0.9270211585795599, "learning_rate": 0.00029232484562840357, "loss": 2.5625, "step": 2722 }, { "epoch": 1.8033112582781456, "grad_norm": 0.822207187609188, "learning_rate": 0.0002923037539168977, "loss": 2.4688, "step": 2723 }, { "epoch": 1.8039735099337748, "grad_norm": 0.8312581112364031, "learning_rate": 0.00029228263402761713, "loss": 2.4531, "step": 2724 }, { "epoch": 1.804635761589404, "grad_norm": 0.8295052516776861, "learning_rate": 0.0002922614859647438, "loss": 2.2812, "step": 2725 }, { "epoch": 1.8052980132450331, "grad_norm": 0.8563373302409606, "learning_rate": 0.00029224030973246524, "loss": 2.4844, "step": 2726 }, { "epoch": 1.8059602649006623, "grad_norm": 0.9192641508105985, "learning_rate": 0.00029221910533497466, "loss": 2.2656, "step": 2727 }, { "epoch": 1.8066225165562915, "grad_norm": 0.8400966643882242, "learning_rate": 0.00029219787277647073, "loss": 2.3281, "step": 2728 }, { "epoch": 1.8072847682119204, "grad_norm": 0.8111631747747882, "learning_rate": 0.0002921766120611578, "loss": 2.4062, "step": 2729 }, { "epoch": 1.8079470198675498, "grad_norm": 0.8757351986689719, "learning_rate": 0.00029215532319324555, "loss": 1.9844, "step": 2730 }, { "epoch": 1.8086092715231787, "grad_norm": 0.8410628802959517, "learning_rate": 0.0002921340061769497, "loss": 2.5156, "step": 2731 }, { "epoch": 1.809271523178808, "grad_norm": 0.774696143741158, "learning_rate": 0.000292112661016491, "loss": 1.9688, "step": 2732 }, { "epoch": 1.809933774834437, "grad_norm": 0.8145399891170766, "learning_rate": 0.0002920912877160962, "loss": 2.375, "step": 2733 }, { "epoch": 1.8105960264900662, "grad_norm": 0.7943576265657718, "learning_rate": 0.00029206988627999736, "loss": 2.3906, "step": 2734 }, { "epoch": 1.8112582781456954, "grad_norm": 0.9874660056277332, "learning_rate": 0.0002920484567124323, "loss": 2.5, "step": 2735 }, { "epoch": 1.8119205298013243, "grad_norm": 0.7744255413369399, "learning_rate": 0.0002920269990176443, "loss": 2.2812, "step": 2736 }, { "epoch": 1.8125827814569537, "grad_norm": 0.8994562161872767, "learning_rate": 0.00029200551319988213, "loss": 2.2188, "step": 2737 }, { "epoch": 1.8132450331125827, "grad_norm": 0.7936106952254149, "learning_rate": 0.0002919839992634004, "loss": 2.2031, "step": 2738 }, { "epoch": 1.813907284768212, "grad_norm": 0.9217683813124031, "learning_rate": 0.00029196245721245897, "loss": 2.625, "step": 2739 }, { "epoch": 1.814569536423841, "grad_norm": 0.7938510823013709, "learning_rate": 0.0002919408870513235, "loss": 2.2812, "step": 2740 }, { "epoch": 1.8152317880794702, "grad_norm": 0.7857139120627535, "learning_rate": 0.0002919192887842651, "loss": 2.3125, "step": 2741 }, { "epoch": 1.8158940397350993, "grad_norm": 0.8964090478965278, "learning_rate": 0.0002918976624155604, "loss": 2.4219, "step": 2742 }, { "epoch": 1.8165562913907285, "grad_norm": 0.8341573851550185, "learning_rate": 0.00029187600794949175, "loss": 1.9062, "step": 2743 }, { "epoch": 1.8172185430463577, "grad_norm": 0.7623273457320935, "learning_rate": 0.000291854325390347, "loss": 2.0781, "step": 2744 }, { "epoch": 1.8178807947019866, "grad_norm": 0.8184260042359358, "learning_rate": 0.0002918326147424195, "loss": 2.3906, "step": 2745 }, { "epoch": 1.818543046357616, "grad_norm": 0.8538983463350875, "learning_rate": 0.0002918108760100083, "loss": 2.2812, "step": 2746 }, { "epoch": 1.819205298013245, "grad_norm": 0.9150232774235978, "learning_rate": 0.00029178910919741774, "loss": 2.2031, "step": 2747 }, { "epoch": 1.8198675496688743, "grad_norm": 0.9499602497781707, "learning_rate": 0.00029176731430895805, "loss": 2.6406, "step": 2748 }, { "epoch": 1.8205298013245033, "grad_norm": 0.8641877906862377, "learning_rate": 0.0002917454913489448, "loss": 2.3594, "step": 2749 }, { "epoch": 1.8211920529801324, "grad_norm": 0.9318545964926954, "learning_rate": 0.0002917236403216992, "loss": 2.4375, "step": 2750 }, { "epoch": 1.8218543046357616, "grad_norm": 0.9360453402692742, "learning_rate": 0.000291701761231548, "loss": 2.4062, "step": 2751 }, { "epoch": 1.8225165562913908, "grad_norm": 0.8732893664657312, "learning_rate": 0.0002916798540828236, "loss": 2.2656, "step": 2752 }, { "epoch": 1.82317880794702, "grad_norm": 0.9136496531802694, "learning_rate": 0.00029165791887986365, "loss": 2.4062, "step": 2753 }, { "epoch": 1.8238410596026489, "grad_norm": 0.9638747997005405, "learning_rate": 0.0002916359556270118, "loss": 2.5, "step": 2754 }, { "epoch": 1.8245033112582782, "grad_norm": 0.8489194087304295, "learning_rate": 0.00029161396432861693, "loss": 2.6406, "step": 2755 }, { "epoch": 1.8251655629139072, "grad_norm": 0.7840490609945583, "learning_rate": 0.0002915919449890336, "loss": 2.25, "step": 2756 }, { "epoch": 1.8258278145695366, "grad_norm": 0.8587547721510731, "learning_rate": 0.0002915698976126218, "loss": 2.2969, "step": 2757 }, { "epoch": 1.8264900662251655, "grad_norm": 0.8312472589776129, "learning_rate": 0.0002915478222037472, "loss": 2.5781, "step": 2758 }, { "epoch": 1.8271523178807947, "grad_norm": 0.8353897774162931, "learning_rate": 0.00029152571876678106, "loss": 1.9531, "step": 2759 }, { "epoch": 1.8278145695364238, "grad_norm": 0.8586402708690111, "learning_rate": 0.0002915035873061001, "loss": 2.375, "step": 2760 }, { "epoch": 1.828476821192053, "grad_norm": 0.8391416286542711, "learning_rate": 0.00029148142782608647, "loss": 2.5938, "step": 2761 }, { "epoch": 1.8291390728476822, "grad_norm": 0.8248537601060804, "learning_rate": 0.0002914592403311282, "loss": 2.3125, "step": 2762 }, { "epoch": 1.8298013245033111, "grad_norm": 0.8553108737350404, "learning_rate": 0.0002914370248256185, "loss": 2.3281, "step": 2763 }, { "epoch": 1.8304635761589405, "grad_norm": 0.8439215284704801, "learning_rate": 0.0002914147813139564, "loss": 2.4688, "step": 2764 }, { "epoch": 1.8311258278145695, "grad_norm": 0.8965129175885984, "learning_rate": 0.0002913925098005463, "loss": 1.9609, "step": 2765 }, { "epoch": 1.8317880794701988, "grad_norm": 0.8446869236957127, "learning_rate": 0.00029137021028979827, "loss": 2.3125, "step": 2766 }, { "epoch": 1.8324503311258278, "grad_norm": 0.8929966131445781, "learning_rate": 0.00029134788278612785, "loss": 2.6406, "step": 2767 }, { "epoch": 1.833112582781457, "grad_norm": 0.8111008079530692, "learning_rate": 0.0002913255272939561, "loss": 2.4062, "step": 2768 }, { "epoch": 1.833774834437086, "grad_norm": 0.7794080271198303, "learning_rate": 0.0002913031438177098, "loss": 2.2656, "step": 2769 }, { "epoch": 1.8344370860927153, "grad_norm": 0.7882965872198856, "learning_rate": 0.0002912807323618209, "loss": 2.1562, "step": 2770 }, { "epoch": 1.8350993377483444, "grad_norm": 0.8327533274924267, "learning_rate": 0.00029125829293072733, "loss": 2.3906, "step": 2771 }, { "epoch": 1.8357615894039734, "grad_norm": 0.8071806382650956, "learning_rate": 0.0002912358255288723, "loss": 2.4844, "step": 2772 }, { "epoch": 1.8364238410596028, "grad_norm": 0.8507806116229872, "learning_rate": 0.00029121333016070457, "loss": 2.8125, "step": 2773 }, { "epoch": 1.8370860927152317, "grad_norm": 0.866661720911409, "learning_rate": 0.00029119080683067853, "loss": 2.3906, "step": 2774 }, { "epoch": 1.8377483443708609, "grad_norm": 0.857101530741126, "learning_rate": 0.000291168255543254, "loss": 2.3594, "step": 2775 }, { "epoch": 1.83841059602649, "grad_norm": 0.8738788626617696, "learning_rate": 0.0002911456763028965, "loss": 2.5312, "step": 2776 }, { "epoch": 1.8390728476821192, "grad_norm": 0.7948734270727188, "learning_rate": 0.0002911230691140769, "loss": 2.5625, "step": 2777 }, { "epoch": 1.8397350993377484, "grad_norm": 0.9133355256492385, "learning_rate": 0.00029110043398127163, "loss": 2.4531, "step": 2778 }, { "epoch": 1.8403973509933775, "grad_norm": 0.8398510864845652, "learning_rate": 0.0002910777709089628, "loss": 2.2812, "step": 2779 }, { "epoch": 1.8410596026490067, "grad_norm": 0.805385989057586, "learning_rate": 0.00029105507990163796, "loss": 2.375, "step": 2780 }, { "epoch": 1.8417218543046356, "grad_norm": 0.900340262713152, "learning_rate": 0.0002910323609637901, "loss": 2.5156, "step": 2781 }, { "epoch": 1.842384105960265, "grad_norm": 0.8896297431090129, "learning_rate": 0.0002910096140999179, "loss": 2.3906, "step": 2782 }, { "epoch": 1.843046357615894, "grad_norm": 0.7898760945016374, "learning_rate": 0.0002909868393145255, "loss": 2.4219, "step": 2783 }, { "epoch": 1.8437086092715231, "grad_norm": 0.8305570326198095, "learning_rate": 0.0002909640366121225, "loss": 2.5312, "step": 2784 }, { "epoch": 1.8443708609271523, "grad_norm": 0.753343651461002, "learning_rate": 0.00029094120599722423, "loss": 2.4219, "step": 2785 }, { "epoch": 1.8450331125827815, "grad_norm": 0.7727513400864662, "learning_rate": 0.0002909183474743513, "loss": 2.625, "step": 2786 }, { "epoch": 1.8456953642384106, "grad_norm": 0.8796627379549826, "learning_rate": 0.00029089546104803005, "loss": 2.5469, "step": 2787 }, { "epoch": 1.8463576158940396, "grad_norm": 0.7529846631125316, "learning_rate": 0.0002908725467227921, "loss": 2.375, "step": 2788 }, { "epoch": 1.847019867549669, "grad_norm": 0.7336721636830027, "learning_rate": 0.00029084960450317487, "loss": 2.4219, "step": 2789 }, { "epoch": 1.847682119205298, "grad_norm": 0.8028387550156288, "learning_rate": 0.00029082663439372125, "loss": 2.4062, "step": 2790 }, { "epoch": 1.8483443708609273, "grad_norm": 0.8345093726243478, "learning_rate": 0.00029080363639897937, "loss": 2.3125, "step": 2791 }, { "epoch": 1.8490066225165562, "grad_norm": 0.7227866765451151, "learning_rate": 0.00029078061052350337, "loss": 1.9297, "step": 2792 }, { "epoch": 1.8496688741721854, "grad_norm": 0.8835945420530996, "learning_rate": 0.0002907575567718523, "loss": 2.5469, "step": 2793 }, { "epoch": 1.8503311258278146, "grad_norm": 0.7774418813521291, "learning_rate": 0.00029073447514859144, "loss": 2.3281, "step": 2794 }, { "epoch": 1.8509933774834437, "grad_norm": 0.797442563911051, "learning_rate": 0.0002907113656582909, "loss": 2.3438, "step": 2795 }, { "epoch": 1.851655629139073, "grad_norm": 0.7637663790956345, "learning_rate": 0.0002906882283055268, "loss": 2.5156, "step": 2796 }, { "epoch": 1.8523178807947018, "grad_norm": 0.8338452761091721, "learning_rate": 0.00029066506309488057, "loss": 2.3281, "step": 2797 }, { "epoch": 1.8529801324503312, "grad_norm": 0.8432857484155026, "learning_rate": 0.00029064187003093923, "loss": 2.4375, "step": 2798 }, { "epoch": 1.8536423841059602, "grad_norm": 0.7904930686909447, "learning_rate": 0.0002906186491182951, "loss": 2.4688, "step": 2799 }, { "epoch": 1.8543046357615895, "grad_norm": 0.7115944270237506, "learning_rate": 0.00029059540036154637, "loss": 2.2344, "step": 2800 }, { "epoch": 1.8549668874172185, "grad_norm": 0.8457780209832122, "learning_rate": 0.0002905721237652965, "loss": 2.6094, "step": 2801 }, { "epoch": 1.8556291390728477, "grad_norm": 0.8419772186843811, "learning_rate": 0.0002905488193341545, "loss": 2.0312, "step": 2802 }, { "epoch": 1.8562913907284768, "grad_norm": 0.8015757635982855, "learning_rate": 0.00029052548707273493, "loss": 2.4219, "step": 2803 }, { "epoch": 1.856953642384106, "grad_norm": 0.829282490360113, "learning_rate": 0.0002905021269856579, "loss": 2.3594, "step": 2804 }, { "epoch": 1.8576158940397351, "grad_norm": 0.8012979939392302, "learning_rate": 0.00029047873907754883, "loss": 2.3438, "step": 2805 }, { "epoch": 1.858278145695364, "grad_norm": 0.8157340250427647, "learning_rate": 0.00029045532335303896, "loss": 2.375, "step": 2806 }, { "epoch": 1.8589403973509935, "grad_norm": 0.7695781997447553, "learning_rate": 0.00029043187981676476, "loss": 2.4375, "step": 2807 }, { "epoch": 1.8596026490066224, "grad_norm": 0.9213249435618963, "learning_rate": 0.00029040840847336833, "loss": 2.6094, "step": 2808 }, { "epoch": 1.8602649006622518, "grad_norm": 0.8969288540602731, "learning_rate": 0.00029038490932749734, "loss": 2.0, "step": 2809 }, { "epoch": 1.8609271523178808, "grad_norm": 0.8788859216430591, "learning_rate": 0.00029036138238380484, "loss": 2.6719, "step": 2810 }, { "epoch": 1.86158940397351, "grad_norm": 0.8576044046619875, "learning_rate": 0.00029033782764694945, "loss": 2.5469, "step": 2811 }, { "epoch": 1.862251655629139, "grad_norm": 0.7872100352690468, "learning_rate": 0.00029031424512159523, "loss": 2.4219, "step": 2812 }, { "epoch": 1.8629139072847682, "grad_norm": 0.8703829713620044, "learning_rate": 0.0002902906348124119, "loss": 2.5469, "step": 2813 }, { "epoch": 1.8635761589403974, "grad_norm": 0.8403564396988256, "learning_rate": 0.0002902669967240744, "loss": 2.3281, "step": 2814 }, { "epoch": 1.8642384105960264, "grad_norm": 0.8660958068289178, "learning_rate": 0.00029024333086126344, "loss": 2.8438, "step": 2815 }, { "epoch": 1.8649006622516557, "grad_norm": 0.786049768033528, "learning_rate": 0.0002902196372286652, "loss": 2.375, "step": 2816 }, { "epoch": 1.8655629139072847, "grad_norm": 0.7491479307835897, "learning_rate": 0.0002901959158309712, "loss": 2.3281, "step": 2817 }, { "epoch": 1.866225165562914, "grad_norm": 0.836145636419037, "learning_rate": 0.00029017216667287857, "loss": 2.5938, "step": 2818 }, { "epoch": 1.866887417218543, "grad_norm": 0.8387925072746757, "learning_rate": 0.0002901483897590899, "loss": 2.3125, "step": 2819 }, { "epoch": 1.8675496688741722, "grad_norm": 0.917939447487194, "learning_rate": 0.00029012458509431333, "loss": 2.4844, "step": 2820 }, { "epoch": 1.8682119205298013, "grad_norm": 0.7844220582433, "learning_rate": 0.00029010075268326246, "loss": 2.3125, "step": 2821 }, { "epoch": 1.8688741721854305, "grad_norm": 0.8885516729095283, "learning_rate": 0.00029007689253065627, "loss": 2.5469, "step": 2822 }, { "epoch": 1.8695364238410597, "grad_norm": 0.8341378403258463, "learning_rate": 0.0002900530046412195, "loss": 2.3438, "step": 2823 }, { "epoch": 1.8701986754966886, "grad_norm": 0.7612362035962441, "learning_rate": 0.00029002908901968217, "loss": 2.3906, "step": 2824 }, { "epoch": 1.870860927152318, "grad_norm": 0.821570814231991, "learning_rate": 0.00029000514567077977, "loss": 2.4531, "step": 2825 }, { "epoch": 1.871523178807947, "grad_norm": 0.8354198413224346, "learning_rate": 0.0002899811745992535, "loss": 2.375, "step": 2826 }, { "epoch": 1.872185430463576, "grad_norm": 0.8935152760523608, "learning_rate": 0.0002899571758098497, "loss": 2.2656, "step": 2827 }, { "epoch": 1.8728476821192053, "grad_norm": 0.787576569676333, "learning_rate": 0.00028993314930732066, "loss": 2.3438, "step": 2828 }, { "epoch": 1.8735099337748344, "grad_norm": 0.7720794518137264, "learning_rate": 0.00028990909509642377, "loss": 2.5156, "step": 2829 }, { "epoch": 1.8741721854304636, "grad_norm": 0.904941294970232, "learning_rate": 0.00028988501318192195, "loss": 2.375, "step": 2830 }, { "epoch": 1.8748344370860928, "grad_norm": 0.8154539387482661, "learning_rate": 0.00028986090356858385, "loss": 2.4375, "step": 2831 }, { "epoch": 1.875496688741722, "grad_norm": 0.8130472182366187, "learning_rate": 0.0002898367662611834, "loss": 2.4844, "step": 2832 }, { "epoch": 1.8761589403973509, "grad_norm": 0.8462639729876643, "learning_rate": 0.0002898126012645, "loss": 2.4688, "step": 2833 }, { "epoch": 1.8768211920529803, "grad_norm": 0.9254018964305748, "learning_rate": 0.00028978840858331873, "loss": 2.5469, "step": 2834 }, { "epoch": 1.8774834437086092, "grad_norm": 0.8757856654895679, "learning_rate": 0.0002897641882224299, "loss": 2.5, "step": 2835 }, { "epoch": 1.8781456953642384, "grad_norm": 0.8511256546954842, "learning_rate": 0.00028973994018662946, "loss": 2.5156, "step": 2836 }, { "epoch": 1.8788079470198675, "grad_norm": 0.8519454121459769, "learning_rate": 0.0002897156644807188, "loss": 2.5469, "step": 2837 }, { "epoch": 1.8794701986754967, "grad_norm": 0.8098819660902985, "learning_rate": 0.0002896913611095047, "loss": 2.6094, "step": 2838 }, { "epoch": 1.8801324503311259, "grad_norm": 0.909954702721027, "learning_rate": 0.00028966703007779975, "loss": 2.5625, "step": 2839 }, { "epoch": 1.8807947019867548, "grad_norm": 0.865144621946133, "learning_rate": 0.0002896426713904215, "loss": 2.4219, "step": 2840 }, { "epoch": 1.8814569536423842, "grad_norm": 0.8324759325972422, "learning_rate": 0.00028961828505219344, "loss": 2.4531, "step": 2841 }, { "epoch": 1.8821192052980131, "grad_norm": 0.8400855033350236, "learning_rate": 0.00028959387106794416, "loss": 1.8984, "step": 2842 }, { "epoch": 1.8827814569536425, "grad_norm": 0.9236805260648048, "learning_rate": 0.00028956942944250813, "loss": 2.6406, "step": 2843 }, { "epoch": 1.8834437086092715, "grad_norm": 0.8034672820521745, "learning_rate": 0.0002895449601807249, "loss": 2.5156, "step": 2844 }, { "epoch": 1.8841059602649006, "grad_norm": 0.768152749955267, "learning_rate": 0.00028952046328743975, "loss": 2.0625, "step": 2845 }, { "epoch": 1.8847682119205298, "grad_norm": 0.815051255854314, "learning_rate": 0.0002894959387675033, "loss": 2.5156, "step": 2846 }, { "epoch": 1.885430463576159, "grad_norm": 0.9056361646954801, "learning_rate": 0.0002894713866257717, "loss": 2.4844, "step": 2847 }, { "epoch": 1.8860927152317881, "grad_norm": 0.7612968134501811, "learning_rate": 0.00028944680686710656, "loss": 2.3281, "step": 2848 }, { "epoch": 1.886754966887417, "grad_norm": 0.8240153449282337, "learning_rate": 0.000289422199496375, "loss": 2.4844, "step": 2849 }, { "epoch": 1.8874172185430464, "grad_norm": 0.803792293248741, "learning_rate": 0.00028939756451844945, "loss": 2.375, "step": 2850 }, { "epoch": 1.8880794701986754, "grad_norm": 0.8728967799364742, "learning_rate": 0.000289372901938208, "loss": 2.6094, "step": 2851 }, { "epoch": 1.8887417218543048, "grad_norm": 0.816174957864966, "learning_rate": 0.00028934821176053417, "loss": 2.5, "step": 2852 }, { "epoch": 1.8894039735099337, "grad_norm": 0.8149326571798456, "learning_rate": 0.00028932349399031677, "loss": 2.4219, "step": 2853 }, { "epoch": 1.8900662251655629, "grad_norm": 0.8505409004253704, "learning_rate": 0.0002892987486324503, "loss": 2.3281, "step": 2854 }, { "epoch": 1.890728476821192, "grad_norm": 0.8331251102621244, "learning_rate": 0.00028927397569183456, "loss": 2.3438, "step": 2855 }, { "epoch": 1.8913907284768212, "grad_norm": 0.811574080937041, "learning_rate": 0.00028924917517337493, "loss": 2.0156, "step": 2856 }, { "epoch": 1.8920529801324504, "grad_norm": 0.8767633493848389, "learning_rate": 0.00028922434708198217, "loss": 2.4844, "step": 2857 }, { "epoch": 1.8927152317880793, "grad_norm": 1.0272918206818542, "learning_rate": 0.0002891994914225725, "loss": 2.4688, "step": 2858 }, { "epoch": 1.8933774834437087, "grad_norm": 0.8361052106068869, "learning_rate": 0.0002891746082000678, "loss": 2.3281, "step": 2859 }, { "epoch": 1.8940397350993377, "grad_norm": 0.8234535062647853, "learning_rate": 0.0002891496974193949, "loss": 2.2344, "step": 2860 }, { "epoch": 1.894701986754967, "grad_norm": 0.8175272727413886, "learning_rate": 0.0002891247590854867, "loss": 2.0938, "step": 2861 }, { "epoch": 1.895364238410596, "grad_norm": 0.8563851871664755, "learning_rate": 0.00028909979320328123, "loss": 2.3594, "step": 2862 }, { "epoch": 1.8960264900662251, "grad_norm": 1.0408293894653124, "learning_rate": 0.0002890747997777219, "loss": 2.5156, "step": 2863 }, { "epoch": 1.8966887417218543, "grad_norm": 0.890811526080028, "learning_rate": 0.0002890497788137579, "loss": 2.3125, "step": 2864 }, { "epoch": 1.8973509933774835, "grad_norm": 0.774309852144894, "learning_rate": 0.0002890247303163434, "loss": 2.3438, "step": 2865 }, { "epoch": 1.8980132450331126, "grad_norm": 0.7604358930556171, "learning_rate": 0.00028899965429043847, "loss": 1.8516, "step": 2866 }, { "epoch": 1.8986754966887416, "grad_norm": 0.764006807882833, "learning_rate": 0.0002889745507410084, "loss": 2.3281, "step": 2867 }, { "epoch": 1.899337748344371, "grad_norm": 0.7948723753423214, "learning_rate": 0.0002889494196730241, "loss": 2.375, "step": 2868 }, { "epoch": 1.9, "grad_norm": 0.8105348976309922, "learning_rate": 0.0002889242610914615, "loss": 2.2812, "step": 2869 }, { "epoch": 1.9006622516556293, "grad_norm": 0.8412456084894522, "learning_rate": 0.0002888990750013027, "loss": 2.4688, "step": 2870 }, { "epoch": 1.9013245033112582, "grad_norm": 0.8386192721830606, "learning_rate": 0.0002888738614075345, "loss": 2.3906, "step": 2871 }, { "epoch": 1.9019867549668874, "grad_norm": 0.8824798288314648, "learning_rate": 0.00028884862031514964, "loss": 2.3125, "step": 2872 }, { "epoch": 1.9026490066225166, "grad_norm": 0.9463314200858433, "learning_rate": 0.0002888233517291461, "loss": 2.25, "step": 2873 }, { "epoch": 1.9033112582781457, "grad_norm": 0.7923747005374712, "learning_rate": 0.00028879805565452737, "loss": 2.4219, "step": 2874 }, { "epoch": 1.903973509933775, "grad_norm": 0.8535817109477793, "learning_rate": 0.00028877273209630233, "loss": 2.6875, "step": 2875 }, { "epoch": 1.9046357615894038, "grad_norm": 0.7949300347093511, "learning_rate": 0.0002887473810594854, "loss": 2.4375, "step": 2876 }, { "epoch": 1.9052980132450332, "grad_norm": 0.9116518927897763, "learning_rate": 0.0002887220025490963, "loss": 2.4375, "step": 2877 }, { "epoch": 1.9059602649006622, "grad_norm": 0.7514631649350063, "learning_rate": 0.0002886965965701603, "loss": 2.375, "step": 2878 }, { "epoch": 1.9066225165562913, "grad_norm": 0.8610009397095162, "learning_rate": 0.0002886711631277081, "loss": 2.4688, "step": 2879 }, { "epoch": 1.9072847682119205, "grad_norm": 0.7808018642883163, "learning_rate": 0.0002886457022267758, "loss": 2.3438, "step": 2880 }, { "epoch": 1.9079470198675497, "grad_norm": 0.8880085559979206, "learning_rate": 0.0002886202138724049, "loss": 2.5938, "step": 2881 }, { "epoch": 1.9086092715231788, "grad_norm": 0.7366438862000244, "learning_rate": 0.00028859469806964244, "loss": 2.0156, "step": 2882 }, { "epoch": 1.909271523178808, "grad_norm": 0.8104614526582509, "learning_rate": 0.00028856915482354073, "loss": 2.2969, "step": 2883 }, { "epoch": 1.9099337748344372, "grad_norm": 0.7776768693878011, "learning_rate": 0.0002885435841391578, "loss": 2.4219, "step": 2884 }, { "epoch": 1.910596026490066, "grad_norm": 0.7814699169651934, "learning_rate": 0.0002885179860215569, "loss": 2.3281, "step": 2885 }, { "epoch": 1.9112582781456955, "grad_norm": 0.7608477787307424, "learning_rate": 0.0002884923604758067, "loss": 2.3281, "step": 2886 }, { "epoch": 1.9119205298013244, "grad_norm": 0.8600599481103779, "learning_rate": 0.00028846670750698136, "loss": 2.4531, "step": 2887 }, { "epoch": 1.9125827814569536, "grad_norm": 0.8704771604695623, "learning_rate": 0.00028844102712016046, "loss": 2.5, "step": 2888 }, { "epoch": 1.9132450331125828, "grad_norm": 0.9179862719575775, "learning_rate": 0.00028841531932042897, "loss": 2.4844, "step": 2889 }, { "epoch": 1.913907284768212, "grad_norm": 0.7515594568735481, "learning_rate": 0.00028838958411287743, "loss": 2.0312, "step": 2890 }, { "epoch": 1.914569536423841, "grad_norm": 0.8444074129764717, "learning_rate": 0.0002883638215026016, "loss": 2.3594, "step": 2891 }, { "epoch": 1.91523178807947, "grad_norm": 1.8578889189855137, "learning_rate": 0.00028833803149470286, "loss": 2.3906, "step": 2892 }, { "epoch": 1.9158940397350994, "grad_norm": 0.754080134399696, "learning_rate": 0.0002883122140942879, "loss": 2.375, "step": 2893 }, { "epoch": 1.9165562913907284, "grad_norm": 0.833267882887863, "learning_rate": 0.0002882863693064688, "loss": 2.4531, "step": 2894 }, { "epoch": 1.9172185430463577, "grad_norm": 0.809603732360834, "learning_rate": 0.0002882604971363632, "loss": 2.3594, "step": 2895 }, { "epoch": 1.9178807947019867, "grad_norm": 0.8152211413011785, "learning_rate": 0.0002882345975890941, "loss": 2.4062, "step": 2896 }, { "epoch": 1.9185430463576159, "grad_norm": 0.8859369407881263, "learning_rate": 0.0002882086706697898, "loss": 2.4219, "step": 2897 }, { "epoch": 1.919205298013245, "grad_norm": 0.8529278961926933, "learning_rate": 0.0002881827163835843, "loss": 2.5312, "step": 2898 }, { "epoch": 1.9198675496688742, "grad_norm": 0.72315158333853, "learning_rate": 0.00028815673473561666, "loss": 2.125, "step": 2899 }, { "epoch": 1.9205298013245033, "grad_norm": 0.7726036155339066, "learning_rate": 0.00028813072573103163, "loss": 2.4688, "step": 2900 }, { "epoch": 1.9211920529801323, "grad_norm": 0.773743587470732, "learning_rate": 0.0002881046893749793, "loss": 2.2812, "step": 2901 }, { "epoch": 1.9218543046357617, "grad_norm": 0.8666089123457624, "learning_rate": 0.0002880786256726152, "loss": 2.5156, "step": 2902 }, { "epoch": 1.9225165562913906, "grad_norm": 0.8189555334416532, "learning_rate": 0.00028805253462910017, "loss": 2.6719, "step": 2903 }, { "epoch": 1.92317880794702, "grad_norm": 0.7428493848005827, "learning_rate": 0.0002880264162496005, "loss": 2.0469, "step": 2904 }, { "epoch": 1.923841059602649, "grad_norm": 0.7791561407549328, "learning_rate": 0.0002880002705392881, "loss": 2.0312, "step": 2905 }, { "epoch": 1.9245033112582781, "grad_norm": 0.791653762097023, "learning_rate": 0.0002879740975033399, "loss": 2.625, "step": 2906 }, { "epoch": 1.9251655629139073, "grad_norm": 0.8372867842182472, "learning_rate": 0.00028794789714693865, "loss": 2.625, "step": 2907 }, { "epoch": 1.9258278145695364, "grad_norm": 0.7281281423461923, "learning_rate": 0.0002879216694752722, "loss": 2.2344, "step": 2908 }, { "epoch": 1.9264900662251656, "grad_norm": 0.8437208508313229, "learning_rate": 0.000287895414493534, "loss": 2.0625, "step": 2909 }, { "epoch": 1.9271523178807946, "grad_norm": 0.7947660352935073, "learning_rate": 0.0002878691322069228, "loss": 2.125, "step": 2910 }, { "epoch": 1.927814569536424, "grad_norm": 0.8518070873263343, "learning_rate": 0.00028784282262064284, "loss": 2.4688, "step": 2911 }, { "epoch": 1.9284768211920529, "grad_norm": 0.8296905768915149, "learning_rate": 0.00028781648573990363, "loss": 2.4531, "step": 2912 }, { "epoch": 1.9291390728476823, "grad_norm": 0.8523138674206684, "learning_rate": 0.00028779012156992026, "loss": 2.625, "step": 2913 }, { "epoch": 1.9298013245033112, "grad_norm": 0.8118583335329524, "learning_rate": 0.0002877637301159131, "loss": 2.3125, "step": 2914 }, { "epoch": 1.9304635761589404, "grad_norm": 0.795400929278512, "learning_rate": 0.000287737311383108, "loss": 1.9922, "step": 2915 }, { "epoch": 1.9311258278145695, "grad_norm": 0.7856147758158786, "learning_rate": 0.0002877108653767361, "loss": 2.375, "step": 2916 }, { "epoch": 1.9317880794701987, "grad_norm": 0.7584332076625354, "learning_rate": 0.000287684392102034, "loss": 2.0312, "step": 2917 }, { "epoch": 1.9324503311258279, "grad_norm": 0.7853546435614522, "learning_rate": 0.0002876578915642439, "loss": 2.2812, "step": 2918 }, { "epoch": 1.9331125827814568, "grad_norm": 0.8100688802467343, "learning_rate": 0.000287631363768613, "loss": 2.4688, "step": 2919 }, { "epoch": 1.9337748344370862, "grad_norm": 0.8100790259363666, "learning_rate": 0.00028760480872039415, "loss": 2.5, "step": 2920 }, { "epoch": 1.9344370860927151, "grad_norm": 0.8053350334196528, "learning_rate": 0.0002875782264248456, "loss": 1.9922, "step": 2921 }, { "epoch": 1.9350993377483445, "grad_norm": 0.7718928961820292, "learning_rate": 0.00028755161688723095, "loss": 2.3438, "step": 2922 }, { "epoch": 1.9357615894039735, "grad_norm": 0.8266627231019702, "learning_rate": 0.0002875249801128192, "loss": 2.4219, "step": 2923 }, { "epoch": 1.9364238410596026, "grad_norm": 0.7908267174812844, "learning_rate": 0.00028749831610688466, "loss": 2.5156, "step": 2924 }, { "epoch": 1.9370860927152318, "grad_norm": 0.7784876997881196, "learning_rate": 0.0002874716248747072, "loss": 2.4219, "step": 2925 }, { "epoch": 1.937748344370861, "grad_norm": 0.9127165762173451, "learning_rate": 0.00028744490642157195, "loss": 2.5781, "step": 2926 }, { "epoch": 1.9384105960264901, "grad_norm": 0.81527355940955, "learning_rate": 0.0002874181607527695, "loss": 2.4844, "step": 2927 }, { "epoch": 1.939072847682119, "grad_norm": 0.8122161127053907, "learning_rate": 0.0002873913878735957, "loss": 2.25, "step": 2928 }, { "epoch": 1.9397350993377485, "grad_norm": 0.8595757228408031, "learning_rate": 0.00028736458778935204, "loss": 2.4062, "step": 2929 }, { "epoch": 1.9403973509933774, "grad_norm": 0.8285629984323323, "learning_rate": 0.0002873377605053451, "loss": 2.6875, "step": 2930 }, { "epoch": 1.9410596026490068, "grad_norm": 0.8238410643402478, "learning_rate": 0.0002873109060268871, "loss": 2.3438, "step": 2931 }, { "epoch": 1.9417218543046357, "grad_norm": 0.7108613583027408, "learning_rate": 0.00028728402435929546, "loss": 1.9688, "step": 2932 }, { "epoch": 1.942384105960265, "grad_norm": 1.0030386758670962, "learning_rate": 0.00028725711550789314, "loss": 2.5, "step": 2933 }, { "epoch": 1.943046357615894, "grad_norm": 1.0054361190941763, "learning_rate": 0.00028723017947800833, "loss": 2.7812, "step": 2934 }, { "epoch": 1.9437086092715232, "grad_norm": 0.8081275454921016, "learning_rate": 0.00028720321627497464, "loss": 2.3594, "step": 2935 }, { "epoch": 1.9443708609271524, "grad_norm": 0.7314984359958204, "learning_rate": 0.00028717622590413117, "loss": 2.2656, "step": 2936 }, { "epoch": 1.9450331125827813, "grad_norm": 0.7893275317431592, "learning_rate": 0.00028714920837082233, "loss": 2.4219, "step": 2937 }, { "epoch": 1.9456953642384107, "grad_norm": 0.9377042950172152, "learning_rate": 0.00028712216368039786, "loss": 2.5312, "step": 2938 }, { "epoch": 1.9463576158940397, "grad_norm": 0.829180220396996, "learning_rate": 0.0002870950918382129, "loss": 2.3594, "step": 2939 }, { "epoch": 1.9470198675496688, "grad_norm": 0.8748911234973903, "learning_rate": 0.000287067992849628, "loss": 2.25, "step": 2940 }, { "epoch": 1.947682119205298, "grad_norm": 0.8143908515718336, "learning_rate": 0.0002870408667200091, "loss": 1.9688, "step": 2941 }, { "epoch": 1.9483443708609272, "grad_norm": 0.7179007167088912, "learning_rate": 0.00028701371345472745, "loss": 2.1406, "step": 2942 }, { "epoch": 1.9490066225165563, "grad_norm": 0.9522570125357471, "learning_rate": 0.0002869865330591597, "loss": 2.7188, "step": 2943 }, { "epoch": 1.9496688741721855, "grad_norm": 0.7736543496263136, "learning_rate": 0.000286959325538688, "loss": 2.25, "step": 2944 }, { "epoch": 1.9503311258278146, "grad_norm": 0.9017343418215134, "learning_rate": 0.0002869320908986996, "loss": 2.4062, "step": 2945 }, { "epoch": 1.9509933774834436, "grad_norm": 0.9145655620579923, "learning_rate": 0.00028690482914458734, "loss": 2.4844, "step": 2946 }, { "epoch": 1.951655629139073, "grad_norm": 0.7629984312078882, "learning_rate": 0.00028687754028174935, "loss": 2.0312, "step": 2947 }, { "epoch": 1.952317880794702, "grad_norm": 0.8170916030967584, "learning_rate": 0.0002868502243155891, "loss": 2.375, "step": 2948 }, { "epoch": 1.952980132450331, "grad_norm": 0.7616419278151214, "learning_rate": 0.00028682288125151555, "loss": 1.9062, "step": 2949 }, { "epoch": 1.9536423841059603, "grad_norm": 0.8016584807303868, "learning_rate": 0.00028679551109494285, "loss": 2.4219, "step": 2950 }, { "epoch": 1.9543046357615894, "grad_norm": 0.7761216387124626, "learning_rate": 0.0002867681138512906, "loss": 2.3125, "step": 2951 }, { "epoch": 1.9549668874172186, "grad_norm": 0.7599671820765932, "learning_rate": 0.0002867406895259839, "loss": 2.3906, "step": 2952 }, { "epoch": 1.9556291390728475, "grad_norm": 0.8752925787182948, "learning_rate": 0.00028671323812445295, "loss": 2.3281, "step": 2953 }, { "epoch": 1.956291390728477, "grad_norm": 1.011514721219095, "learning_rate": 0.0002866857596521335, "loss": 2.3906, "step": 2954 }, { "epoch": 1.9569536423841059, "grad_norm": 0.8570594366666575, "learning_rate": 0.0002866582541144666, "loss": 2.2031, "step": 2955 }, { "epoch": 1.9576158940397352, "grad_norm": 0.8834455888493055, "learning_rate": 0.0002866307215168986, "loss": 2.5, "step": 2956 }, { "epoch": 1.9582781456953642, "grad_norm": 0.8042688311374435, "learning_rate": 0.0002866031618648814, "loss": 2.4844, "step": 2957 }, { "epoch": 1.9589403973509933, "grad_norm": 0.7930786158157322, "learning_rate": 0.00028657557516387195, "loss": 2.4062, "step": 2958 }, { "epoch": 1.9596026490066225, "grad_norm": 0.9466550603963453, "learning_rate": 0.00028654796141933293, "loss": 2.3438, "step": 2959 }, { "epoch": 1.9602649006622517, "grad_norm": 0.8094797870077459, "learning_rate": 0.00028652032063673197, "loss": 2.4531, "step": 2960 }, { "epoch": 1.9609271523178808, "grad_norm": 0.970401809793331, "learning_rate": 0.00028649265282154244, "loss": 2.4688, "step": 2961 }, { "epoch": 1.9615894039735098, "grad_norm": 0.6907469776593262, "learning_rate": 0.0002864649579792428, "loss": 2.0, "step": 2962 }, { "epoch": 1.9622516556291392, "grad_norm": 0.7883117431588597, "learning_rate": 0.0002864372361153169, "loss": 2.4688, "step": 2963 }, { "epoch": 1.9629139072847681, "grad_norm": 0.8262485500633999, "learning_rate": 0.00028640948723525415, "loss": 2.4688, "step": 2964 }, { "epoch": 1.9635761589403975, "grad_norm": 0.7804466498829276, "learning_rate": 0.000286381711344549, "loss": 2.2812, "step": 2965 }, { "epoch": 1.9642384105960264, "grad_norm": 0.8306677166668551, "learning_rate": 0.00028635390844870145, "loss": 2.4531, "step": 2966 }, { "epoch": 1.9649006622516556, "grad_norm": 0.7664082317708758, "learning_rate": 0.00028632607855321677, "loss": 2.3125, "step": 2967 }, { "epoch": 1.9655629139072848, "grad_norm": 0.7605676466756678, "learning_rate": 0.0002862982216636056, "loss": 2.3281, "step": 2968 }, { "epoch": 1.966225165562914, "grad_norm": 0.8373908271093768, "learning_rate": 0.00028627033778538396, "loss": 2.5781, "step": 2969 }, { "epoch": 1.966887417218543, "grad_norm": 0.8913532349662862, "learning_rate": 0.0002862424269240731, "loss": 2.4062, "step": 2970 }, { "epoch": 1.967549668874172, "grad_norm": 0.8062896553220953, "learning_rate": 0.00028621448908519977, "loss": 2.5156, "step": 2971 }, { "epoch": 1.9682119205298014, "grad_norm": 0.8537440183720169, "learning_rate": 0.000286186524274296, "loss": 2.2969, "step": 2972 }, { "epoch": 1.9688741721854304, "grad_norm": 0.6659432156567302, "learning_rate": 0.000286158532496899, "loss": 1.7422, "step": 2973 }, { "epoch": 1.9695364238410598, "grad_norm": 0.7637334146342779, "learning_rate": 0.00028613051375855165, "loss": 1.9844, "step": 2974 }, { "epoch": 1.9701986754966887, "grad_norm": 0.8661636695192214, "learning_rate": 0.00028610246806480186, "loss": 2.5469, "step": 2975 }, { "epoch": 1.9708609271523179, "grad_norm": 0.9010976718916557, "learning_rate": 0.000286074395421203, "loss": 2.5938, "step": 2976 }, { "epoch": 1.971523178807947, "grad_norm": 0.8404657917573464, "learning_rate": 0.0002860462958333138, "loss": 2.3906, "step": 2977 }, { "epoch": 1.9721854304635762, "grad_norm": 0.8256838659364901, "learning_rate": 0.00028601816930669835, "loss": 2.7031, "step": 2978 }, { "epoch": 1.9728476821192054, "grad_norm": 0.9506167543755507, "learning_rate": 0.00028599001584692595, "loss": 2.25, "step": 2979 }, { "epoch": 1.9735099337748343, "grad_norm": 0.7787009350264543, "learning_rate": 0.00028596183545957137, "loss": 1.875, "step": 2980 }, { "epoch": 1.9741721854304637, "grad_norm": 0.8541601514385739, "learning_rate": 0.0002859336281502146, "loss": 2.5938, "step": 2981 }, { "epoch": 1.9748344370860926, "grad_norm": 0.8227465942645088, "learning_rate": 0.000285905393924441, "loss": 2.4219, "step": 2982 }, { "epoch": 1.975496688741722, "grad_norm": 0.7358629404976218, "learning_rate": 0.0002858771327878413, "loss": 2.0312, "step": 2983 }, { "epoch": 1.976158940397351, "grad_norm": 0.8552340727553472, "learning_rate": 0.00028584884474601154, "loss": 2.5625, "step": 2984 }, { "epoch": 1.9768211920529801, "grad_norm": 0.7856535421710622, "learning_rate": 0.0002858205298045531, "loss": 2.3281, "step": 2985 }, { "epoch": 1.9774834437086093, "grad_norm": 0.7367140394663255, "learning_rate": 0.0002857921879690726, "loss": 2.4375, "step": 2986 }, { "epoch": 1.9781456953642385, "grad_norm": 0.8007103401880254, "learning_rate": 0.0002857638192451821, "loss": 2.2812, "step": 2987 }, { "epoch": 1.9788079470198676, "grad_norm": 0.8067678255289292, "learning_rate": 0.00028573542363849886, "loss": 2.4219, "step": 2988 }, { "epoch": 1.9794701986754966, "grad_norm": 0.7206682472996087, "learning_rate": 0.00028570700115464556, "loss": 2.0, "step": 2989 }, { "epoch": 1.980132450331126, "grad_norm": 0.79344168752213, "learning_rate": 0.0002856785517992503, "loss": 2.2969, "step": 2990 }, { "epoch": 1.980794701986755, "grad_norm": 0.8411552127274997, "learning_rate": 0.00028565007557794617, "loss": 2.4844, "step": 2991 }, { "epoch": 1.981456953642384, "grad_norm": 0.8069420007135865, "learning_rate": 0.000285621572496372, "loss": 2.4219, "step": 2992 }, { "epoch": 1.9821192052980132, "grad_norm": 0.7578209686554758, "learning_rate": 0.0002855930425601716, "loss": 2.3125, "step": 2993 }, { "epoch": 1.9827814569536424, "grad_norm": 0.7980714684551018, "learning_rate": 0.0002855644857749942, "loss": 2.4062, "step": 2994 }, { "epoch": 1.9834437086092715, "grad_norm": 0.7957242808934297, "learning_rate": 0.00028553590214649445, "loss": 2.25, "step": 2995 }, { "epoch": 1.9841059602649007, "grad_norm": 0.7906121709016272, "learning_rate": 0.0002855072916803322, "loss": 2.0625, "step": 2996 }, { "epoch": 1.9847682119205299, "grad_norm": 0.7418735925128951, "learning_rate": 0.00028547865438217267, "loss": 2.3906, "step": 2997 }, { "epoch": 1.9854304635761588, "grad_norm": 0.7724521520361076, "learning_rate": 0.00028544999025768635, "loss": 2.4531, "step": 2998 }, { "epoch": 1.9860927152317882, "grad_norm": 0.7349983842965367, "learning_rate": 0.0002854212993125491, "loss": 2.1562, "step": 2999 }, { "epoch": 1.9867549668874172, "grad_norm": 0.7906857144921268, "learning_rate": 0.000285392581552442, "loss": 2.3594, "step": 3000 }, { "epoch": 1.9874172185430463, "grad_norm": 0.7967094493937565, "learning_rate": 0.00028536383698305156, "loss": 2.2656, "step": 3001 }, { "epoch": 1.9880794701986755, "grad_norm": 0.7345940240802142, "learning_rate": 0.00028533506561006947, "loss": 2.4062, "step": 3002 }, { "epoch": 1.9887417218543046, "grad_norm": 0.7925910814584763, "learning_rate": 0.00028530626743919284, "loss": 2.0156, "step": 3003 }, { "epoch": 1.9894039735099338, "grad_norm": 0.7329176566343101, "learning_rate": 0.000285277442476124, "loss": 2.3594, "step": 3004 }, { "epoch": 1.9900662251655628, "grad_norm": 0.7688851320110776, "learning_rate": 0.0002852485907265706, "loss": 2.2969, "step": 3005 }, { "epoch": 1.9907284768211921, "grad_norm": 0.691042239287597, "learning_rate": 0.0002852197121962458, "loss": 2.1406, "step": 3006 }, { "epoch": 1.991390728476821, "grad_norm": 0.704431539848973, "learning_rate": 0.00028519080689086765, "loss": 1.875, "step": 3007 }, { "epoch": 1.9920529801324505, "grad_norm": 0.8634188034160398, "learning_rate": 0.0002851618748161598, "loss": 2.4688, "step": 3008 }, { "epoch": 1.9927152317880794, "grad_norm": 0.8347909373295213, "learning_rate": 0.0002851329159778512, "loss": 2.4531, "step": 3009 }, { "epoch": 1.9933774834437086, "grad_norm": 0.8376122542428929, "learning_rate": 0.000285103930381676, "loss": 2.3438, "step": 3010 }, { "epoch": 1.9940397350993377, "grad_norm": 0.8201923354550036, "learning_rate": 0.0002850749180333736, "loss": 2.25, "step": 3011 }, { "epoch": 1.994701986754967, "grad_norm": 0.7512952569807326, "learning_rate": 0.00028504587893868897, "loss": 2.375, "step": 3012 }, { "epoch": 1.995364238410596, "grad_norm": 0.7677195904631053, "learning_rate": 0.000285016813103372, "loss": 2.2188, "step": 3013 }, { "epoch": 1.996026490066225, "grad_norm": 0.8307319096118427, "learning_rate": 0.00028498772053317805, "loss": 2.375, "step": 3014 }, { "epoch": 1.9966887417218544, "grad_norm": 0.8725571263053491, "learning_rate": 0.0002849586012338679, "loss": 2.25, "step": 3015 }, { "epoch": 1.9973509933774833, "grad_norm": 0.8151400709447173, "learning_rate": 0.0002849294552112075, "loss": 2.2969, "step": 3016 }, { "epoch": 1.9980132450331127, "grad_norm": 0.6970740944504712, "learning_rate": 0.000284900282470968, "loss": 2.1094, "step": 3017 }, { "epoch": 1.9986754966887417, "grad_norm": 0.767726541052731, "learning_rate": 0.000284871083018926, "loss": 2.3906, "step": 3018 }, { "epoch": 1.9993377483443708, "grad_norm": 0.856240836324722, "learning_rate": 0.00028484185686086333, "loss": 2.3125, "step": 3019 }, { "epoch": 2.0, "grad_norm": 0.8080218053225915, "learning_rate": 0.00028481260400256705, "loss": 2.2344, "step": 3020 }, { "epoch": 2.0, "eval_loss": 2.3295350074768066, "eval_runtime": 34.0055, "eval_samples_per_second": 9.94, "eval_steps_per_second": 9.94, "step": 3020 }, { "epoch": 2.000662251655629, "grad_norm": 0.7822085716206245, "learning_rate": 0.0002847833244498296, "loss": 2.2812, "step": 3021 }, { "epoch": 2.0013245033112583, "grad_norm": 0.7351924332954816, "learning_rate": 0.0002847540182084487, "loss": 2.2969, "step": 3022 }, { "epoch": 2.0019867549668873, "grad_norm": 0.8244890366369247, "learning_rate": 0.0002847246852842273, "loss": 2.0156, "step": 3023 }, { "epoch": 2.0026490066225167, "grad_norm": 0.7119041181724008, "learning_rate": 0.0002846953256829736, "loss": 1.4062, "step": 3024 }, { "epoch": 2.0033112582781456, "grad_norm": 0.7871132037714198, "learning_rate": 0.00028466593941050124, "loss": 1.9453, "step": 3025 }, { "epoch": 2.003973509933775, "grad_norm": 0.7605720798387549, "learning_rate": 0.00028463652647262895, "loss": 2.1406, "step": 3026 }, { "epoch": 2.004635761589404, "grad_norm": 0.8961684730068566, "learning_rate": 0.0002846070868751809, "loss": 2.2188, "step": 3027 }, { "epoch": 2.0052980132450333, "grad_norm": 0.837197825175647, "learning_rate": 0.0002845776206239864, "loss": 2.2812, "step": 3028 }, { "epoch": 2.0059602649006623, "grad_norm": 0.8984064290365772, "learning_rate": 0.0002845481277248801, "loss": 2.4219, "step": 3029 }, { "epoch": 2.006622516556291, "grad_norm": 0.8987708137622051, "learning_rate": 0.000284518608183702, "loss": 2.0781, "step": 3030 }, { "epoch": 2.0072847682119206, "grad_norm": 0.7767318586194267, "learning_rate": 0.0002844890620062973, "loss": 2.0781, "step": 3031 }, { "epoch": 2.0079470198675495, "grad_norm": 0.8110983207289155, "learning_rate": 0.0002844594891985164, "loss": 2.3125, "step": 3032 }, { "epoch": 2.008609271523179, "grad_norm": 0.7690697359258195, "learning_rate": 0.0002844298897662152, "loss": 1.9844, "step": 3033 }, { "epoch": 2.009271523178808, "grad_norm": 0.8046151668047622, "learning_rate": 0.0002844002637152545, "loss": 2.2188, "step": 3034 }, { "epoch": 2.0099337748344372, "grad_norm": 0.8182453332284277, "learning_rate": 0.00028437061105150084, "loss": 2.3125, "step": 3035 }, { "epoch": 2.010596026490066, "grad_norm": 0.8150776848494918, "learning_rate": 0.0002843409317808257, "loss": 2.3594, "step": 3036 }, { "epoch": 2.0112582781456956, "grad_norm": 0.696765242686899, "learning_rate": 0.0002843112259091058, "loss": 1.7969, "step": 3037 }, { "epoch": 2.0119205298013245, "grad_norm": 0.7662577119962337, "learning_rate": 0.0002842814934422234, "loss": 2.1562, "step": 3038 }, { "epoch": 2.0125827814569535, "grad_norm": 0.8412781589294703, "learning_rate": 0.0002842517343860658, "loss": 1.9688, "step": 3039 }, { "epoch": 2.013245033112583, "grad_norm": 0.8110549835615121, "learning_rate": 0.00028422194874652565, "loss": 2.3594, "step": 3040 }, { "epoch": 2.013907284768212, "grad_norm": 0.7956529026298629, "learning_rate": 0.00028419213652950087, "loss": 1.6484, "step": 3041 }, { "epoch": 2.014569536423841, "grad_norm": 0.8333843196351454, "learning_rate": 0.00028416229774089457, "loss": 1.8906, "step": 3042 }, { "epoch": 2.01523178807947, "grad_norm": 0.9274956110875618, "learning_rate": 0.00028413243238661516, "loss": 2.2344, "step": 3043 }, { "epoch": 2.0158940397350995, "grad_norm": 0.9788051570809375, "learning_rate": 0.0002841025404725764, "loss": 2.4531, "step": 3044 }, { "epoch": 2.0165562913907285, "grad_norm": 0.8414831485270553, "learning_rate": 0.0002840726220046972, "loss": 2.2344, "step": 3045 }, { "epoch": 2.017218543046358, "grad_norm": 0.8711942496434394, "learning_rate": 0.00028404267698890176, "loss": 2.1562, "step": 3046 }, { "epoch": 2.017880794701987, "grad_norm": 0.8454803230666261, "learning_rate": 0.00028401270543111954, "loss": 2.3906, "step": 3047 }, { "epoch": 2.0185430463576157, "grad_norm": 0.7521761194131156, "learning_rate": 0.00028398270733728516, "loss": 1.8984, "step": 3048 }, { "epoch": 2.019205298013245, "grad_norm": 0.8538737442365523, "learning_rate": 0.00028395268271333874, "loss": 2.3906, "step": 3049 }, { "epoch": 2.019867549668874, "grad_norm": 0.7523135474096135, "learning_rate": 0.0002839226315652254, "loss": 2.0938, "step": 3050 }, { "epoch": 2.0205298013245034, "grad_norm": 0.822975951350184, "learning_rate": 0.00028389255389889566, "loss": 2.125, "step": 3051 }, { "epoch": 2.0211920529801324, "grad_norm": 0.7688298353858256, "learning_rate": 0.00028386244972030517, "loss": 2.2188, "step": 3052 }, { "epoch": 2.0218543046357618, "grad_norm": 0.8568860086774109, "learning_rate": 0.000283832319035415, "loss": 1.9922, "step": 3053 }, { "epoch": 2.0225165562913907, "grad_norm": 0.8606099574985561, "learning_rate": 0.0002838021618501913, "loss": 2.0938, "step": 3054 }, { "epoch": 2.0231788079470197, "grad_norm": 0.9079625132824076, "learning_rate": 0.00028377197817060554, "loss": 2.4062, "step": 3055 }, { "epoch": 2.023841059602649, "grad_norm": 0.8956579168343832, "learning_rate": 0.0002837417680026345, "loss": 2.2344, "step": 3056 }, { "epoch": 2.024503311258278, "grad_norm": 0.796979935853134, "learning_rate": 0.00028371153135226, "loss": 1.9453, "step": 3057 }, { "epoch": 2.0251655629139074, "grad_norm": 0.7634717956919849, "learning_rate": 0.00028368126822546943, "loss": 2.1719, "step": 3058 }, { "epoch": 2.0258278145695363, "grad_norm": 1.2910630383165485, "learning_rate": 0.00028365097862825513, "loss": 2.2188, "step": 3059 }, { "epoch": 2.0264900662251657, "grad_norm": 0.9142860946154696, "learning_rate": 0.0002836206625666148, "loss": 2.4688, "step": 3060 }, { "epoch": 2.0271523178807946, "grad_norm": 0.7395723062096163, "learning_rate": 0.0002835903200465514, "loss": 1.8203, "step": 3061 }, { "epoch": 2.027814569536424, "grad_norm": 0.9038546366642554, "learning_rate": 0.000283559951074073, "loss": 2.5938, "step": 3062 }, { "epoch": 2.028476821192053, "grad_norm": 0.7863480424396265, "learning_rate": 0.00028352955565519316, "loss": 2.2188, "step": 3063 }, { "epoch": 2.029139072847682, "grad_norm": 0.8261949428134419, "learning_rate": 0.0002834991337959304, "loss": 2.2969, "step": 3064 }, { "epoch": 2.0298013245033113, "grad_norm": 0.7972801406164743, "learning_rate": 0.0002834686855023086, "loss": 1.9531, "step": 3065 }, { "epoch": 2.0304635761589402, "grad_norm": 0.7231815437325241, "learning_rate": 0.00028343821078035693, "loss": 1.75, "step": 3066 }, { "epoch": 2.0311258278145696, "grad_norm": 0.7521245507311504, "learning_rate": 0.0002834077096361097, "loss": 2.1406, "step": 3067 }, { "epoch": 2.0317880794701986, "grad_norm": 1.0264235434684348, "learning_rate": 0.00028337718207560646, "loss": 2.7969, "step": 3068 }, { "epoch": 2.032450331125828, "grad_norm": 0.716934773790205, "learning_rate": 0.00028334662810489207, "loss": 1.6797, "step": 3069 }, { "epoch": 2.033112582781457, "grad_norm": 0.8696441641148054, "learning_rate": 0.0002833160477300166, "loss": 2.375, "step": 3070 }, { "epoch": 2.0337748344370863, "grad_norm": 0.9122303223825191, "learning_rate": 0.0002832854409570352, "loss": 2.3125, "step": 3071 }, { "epoch": 2.0344370860927152, "grad_norm": 0.7640241438415241, "learning_rate": 0.00028325480779200845, "loss": 2.2344, "step": 3072 }, { "epoch": 2.035099337748344, "grad_norm": 0.9634486625711441, "learning_rate": 0.000283224148241002, "loss": 2.2344, "step": 3073 }, { "epoch": 2.0357615894039736, "grad_norm": 0.8418488970874002, "learning_rate": 0.00028319346231008686, "loss": 2.2656, "step": 3074 }, { "epoch": 2.0364238410596025, "grad_norm": 0.8826407865837091, "learning_rate": 0.0002831627500053392, "loss": 2.4062, "step": 3075 }, { "epoch": 2.037086092715232, "grad_norm": 0.7944930509678328, "learning_rate": 0.00028313201133284033, "loss": 1.7969, "step": 3076 }, { "epoch": 2.037748344370861, "grad_norm": 0.8108011502441784, "learning_rate": 0.00028310124629867694, "loss": 1.8984, "step": 3077 }, { "epoch": 2.03841059602649, "grad_norm": 0.7722033275737668, "learning_rate": 0.0002830704549089408, "loss": 1.8203, "step": 3078 }, { "epoch": 2.039072847682119, "grad_norm": 0.8060447789564797, "learning_rate": 0.000283039637169729, "loss": 2.0781, "step": 3079 }, { "epoch": 2.0397350993377485, "grad_norm": 0.8437134224555156, "learning_rate": 0.00028300879308714377, "loss": 1.9688, "step": 3080 }, { "epoch": 2.0403973509933775, "grad_norm": 0.9196681504395526, "learning_rate": 0.0002829779226672927, "loss": 2.2812, "step": 3081 }, { "epoch": 2.0410596026490064, "grad_norm": 3.101775531759244, "learning_rate": 0.0002829470259162883, "loss": 2.3125, "step": 3082 }, { "epoch": 2.041721854304636, "grad_norm": 0.8790655244893846, "learning_rate": 0.0002829161028402486, "loss": 2.4219, "step": 3083 }, { "epoch": 2.0423841059602648, "grad_norm": 0.823235163431939, "learning_rate": 0.00028288515344529675, "loss": 2.0781, "step": 3084 }, { "epoch": 2.043046357615894, "grad_norm": 0.7846403539578026, "learning_rate": 0.00028285417773756097, "loss": 2.0312, "step": 3085 }, { "epoch": 2.043708609271523, "grad_norm": 0.8710351265030196, "learning_rate": 0.0002828231757231749, "loss": 2.0, "step": 3086 }, { "epoch": 2.0443708609271525, "grad_norm": 0.9168808979990573, "learning_rate": 0.0002827921474082773, "loss": 2.0, "step": 3087 }, { "epoch": 2.0450331125827814, "grad_norm": 0.759742998506687, "learning_rate": 0.0002827610927990121, "loss": 1.6016, "step": 3088 }, { "epoch": 2.045695364238411, "grad_norm": 0.9232128403809053, "learning_rate": 0.00028273001190152845, "loss": 2.3125, "step": 3089 }, { "epoch": 2.0463576158940397, "grad_norm": 0.8102866278962713, "learning_rate": 0.0002826989047219807, "loss": 2.0, "step": 3090 }, { "epoch": 2.0470198675496687, "grad_norm": 0.8976244384168187, "learning_rate": 0.0002826677712665285, "loss": 2.2656, "step": 3091 }, { "epoch": 2.047682119205298, "grad_norm": 0.8184290883734581, "learning_rate": 0.0002826366115413366, "loss": 2.0469, "step": 3092 }, { "epoch": 2.048344370860927, "grad_norm": 0.7909456792494507, "learning_rate": 0.000282605425552575, "loss": 1.6797, "step": 3093 }, { "epoch": 2.0490066225165564, "grad_norm": 0.8799167734306641, "learning_rate": 0.0002825742133064189, "loss": 2.0625, "step": 3094 }, { "epoch": 2.0496688741721854, "grad_norm": 0.9324396640392418, "learning_rate": 0.0002825429748090487, "loss": 2.3125, "step": 3095 }, { "epoch": 2.0503311258278147, "grad_norm": 0.8530898999727972, "learning_rate": 0.0002825117100666498, "loss": 2.2188, "step": 3096 }, { "epoch": 2.0509933774834437, "grad_norm": 0.8245024540803908, "learning_rate": 0.0002824804190854132, "loss": 2.4531, "step": 3097 }, { "epoch": 2.0516556291390726, "grad_norm": 0.8573663069793667, "learning_rate": 0.0002824491018715347, "loss": 2.3438, "step": 3098 }, { "epoch": 2.052317880794702, "grad_norm": 0.9130798646665145, "learning_rate": 0.00028241775843121565, "loss": 2.0312, "step": 3099 }, { "epoch": 2.052980132450331, "grad_norm": 0.7987379246026312, "learning_rate": 0.00028238638877066223, "loss": 2.1406, "step": 3100 }, { "epoch": 2.0536423841059603, "grad_norm": 0.8467016153797152, "learning_rate": 0.0002823549928960861, "loss": 2.2344, "step": 3101 }, { "epoch": 2.0543046357615893, "grad_norm": 0.7651170264975742, "learning_rate": 0.000282323570813704, "loss": 2.1719, "step": 3102 }, { "epoch": 2.0549668874172187, "grad_norm": 0.7974496084290763, "learning_rate": 0.0002822921225297378, "loss": 2.2812, "step": 3103 }, { "epoch": 2.0556291390728476, "grad_norm": 0.8419218283380449, "learning_rate": 0.00028226064805041465, "loss": 2.2188, "step": 3104 }, { "epoch": 2.056291390728477, "grad_norm": 0.8183821891990928, "learning_rate": 0.0002822291473819669, "loss": 2.4688, "step": 3105 }, { "epoch": 2.056953642384106, "grad_norm": 0.7669416981098064, "learning_rate": 0.000282197620530632, "loss": 2.125, "step": 3106 }, { "epoch": 2.057615894039735, "grad_norm": 0.7933034425220477, "learning_rate": 0.00028216606750265265, "loss": 2.1719, "step": 3107 }, { "epoch": 2.0582781456953643, "grad_norm": 0.8410366188945075, "learning_rate": 0.0002821344883042767, "loss": 2.3125, "step": 3108 }, { "epoch": 2.058940397350993, "grad_norm": 0.7943328375176141, "learning_rate": 0.0002821028829417572, "loss": 2.3281, "step": 3109 }, { "epoch": 2.0596026490066226, "grad_norm": 0.8197908979710908, "learning_rate": 0.0002820712514213524, "loss": 2.2344, "step": 3110 }, { "epoch": 2.0602649006622515, "grad_norm": 0.7799788168987745, "learning_rate": 0.00028203959374932557, "loss": 1.9062, "step": 3111 }, { "epoch": 2.060927152317881, "grad_norm": 1.6397234326451111, "learning_rate": 0.00028200790993194546, "loss": 1.6406, "step": 3112 }, { "epoch": 2.06158940397351, "grad_norm": 0.9426110548715044, "learning_rate": 0.00028197619997548574, "loss": 2.2344, "step": 3113 }, { "epoch": 2.0622516556291393, "grad_norm": 0.8363704691023665, "learning_rate": 0.00028194446388622534, "loss": 2.0625, "step": 3114 }, { "epoch": 2.062913907284768, "grad_norm": 0.800036050381332, "learning_rate": 0.00028191270167044846, "loss": 2.1719, "step": 3115 }, { "epoch": 2.063576158940397, "grad_norm": 0.8868937666014706, "learning_rate": 0.0002818809133344443, "loss": 2.5625, "step": 3116 }, { "epoch": 2.0642384105960265, "grad_norm": 0.8638151015504766, "learning_rate": 0.00028184909888450733, "loss": 2.2656, "step": 3117 }, { "epoch": 2.0649006622516555, "grad_norm": 0.8893267453546309, "learning_rate": 0.0002818172583269372, "loss": 2.25, "step": 3118 }, { "epoch": 2.065562913907285, "grad_norm": 0.8067311127583038, "learning_rate": 0.0002817853916680386, "loss": 2.2812, "step": 3119 }, { "epoch": 2.066225165562914, "grad_norm": 0.7622862111987854, "learning_rate": 0.0002817534989141217, "loss": 2.1719, "step": 3120 }, { "epoch": 2.066887417218543, "grad_norm": 0.7926266710421536, "learning_rate": 0.00028172158007150143, "loss": 2.2656, "step": 3121 }, { "epoch": 2.067549668874172, "grad_norm": 0.755907987843484, "learning_rate": 0.0002816896351464982, "loss": 1.8828, "step": 3122 }, { "epoch": 2.0682119205298015, "grad_norm": 0.8265968534322612, "learning_rate": 0.0002816576641454374, "loss": 2.2188, "step": 3123 }, { "epoch": 2.0688741721854305, "grad_norm": 0.8199098589470273, "learning_rate": 0.0002816256670746498, "loss": 2.0781, "step": 3124 }, { "epoch": 2.0695364238410594, "grad_norm": 0.8530459411639963, "learning_rate": 0.00028159364394047095, "loss": 2.4375, "step": 3125 }, { "epoch": 2.070198675496689, "grad_norm": 0.7856947973609075, "learning_rate": 0.000281561594749242, "loss": 1.8984, "step": 3126 }, { "epoch": 2.0708609271523177, "grad_norm": 0.8166310147552497, "learning_rate": 0.00028152951950730895, "loss": 2.2031, "step": 3127 }, { "epoch": 2.071523178807947, "grad_norm": 0.8320901590465946, "learning_rate": 0.0002814974182210231, "loss": 2.0781, "step": 3128 }, { "epoch": 2.072185430463576, "grad_norm": 0.8519037425416551, "learning_rate": 0.0002814652908967409, "loss": 2.125, "step": 3129 }, { "epoch": 2.0728476821192054, "grad_norm": 0.7537013947035321, "learning_rate": 0.00028143313754082393, "loss": 1.8203, "step": 3130 }, { "epoch": 2.0735099337748344, "grad_norm": 0.8793310990728718, "learning_rate": 0.0002814009581596388, "loss": 2.1406, "step": 3131 }, { "epoch": 2.0741721854304638, "grad_norm": 0.811602230226561, "learning_rate": 0.0002813687527595576, "loss": 1.9844, "step": 3132 }, { "epoch": 2.0748344370860927, "grad_norm": 0.8777386683507972, "learning_rate": 0.0002813365213469572, "loss": 1.75, "step": 3133 }, { "epoch": 2.0754966887417217, "grad_norm": 0.875854239870614, "learning_rate": 0.00028130426392821984, "loss": 1.8359, "step": 3134 }, { "epoch": 2.076158940397351, "grad_norm": 0.8872111356958647, "learning_rate": 0.0002812719805097329, "loss": 1.9062, "step": 3135 }, { "epoch": 2.07682119205298, "grad_norm": 0.855470303928801, "learning_rate": 0.0002812396710978888, "loss": 1.9531, "step": 3136 }, { "epoch": 2.0774834437086094, "grad_norm": 1.0019710975758873, "learning_rate": 0.00028120733569908524, "loss": 2.375, "step": 3137 }, { "epoch": 2.0781456953642383, "grad_norm": 0.8133040534781214, "learning_rate": 0.0002811749743197249, "loss": 2.1094, "step": 3138 }, { "epoch": 2.0788079470198677, "grad_norm": 0.9238881120290313, "learning_rate": 0.0002811425869662159, "loss": 2.3594, "step": 3139 }, { "epoch": 2.0794701986754967, "grad_norm": 0.868338876743035, "learning_rate": 0.00028111017364497107, "loss": 2.0938, "step": 3140 }, { "epoch": 2.080132450331126, "grad_norm": 0.7834299580394366, "learning_rate": 0.0002810777343624088, "loss": 1.9531, "step": 3141 }, { "epoch": 2.080794701986755, "grad_norm": 0.8840221639877152, "learning_rate": 0.0002810452691249523, "loss": 2.25, "step": 3142 }, { "epoch": 2.081456953642384, "grad_norm": 0.825079397032191, "learning_rate": 0.0002810127779390302, "loss": 2.1719, "step": 3143 }, { "epoch": 2.0821192052980133, "grad_norm": 0.8970107011962454, "learning_rate": 0.00028098026081107603, "loss": 2.2188, "step": 3144 }, { "epoch": 2.0827814569536423, "grad_norm": 0.7661709812279206, "learning_rate": 0.00028094771774752854, "loss": 2.0625, "step": 3145 }, { "epoch": 2.0834437086092716, "grad_norm": 0.8724765531200942, "learning_rate": 0.0002809151487548317, "loss": 2.4375, "step": 3146 }, { "epoch": 2.0841059602649006, "grad_norm": 0.8027041321418783, "learning_rate": 0.00028088255383943453, "loss": 2.2656, "step": 3147 }, { "epoch": 2.08476821192053, "grad_norm": 0.7778881267098369, "learning_rate": 0.00028084993300779115, "loss": 2.2969, "step": 3148 }, { "epoch": 2.085430463576159, "grad_norm": 0.7473978371141099, "learning_rate": 0.0002808172862663609, "loss": 1.7734, "step": 3149 }, { "epoch": 2.0860927152317883, "grad_norm": 0.7427112478913946, "learning_rate": 0.00028078461362160824, "loss": 1.8516, "step": 3150 }, { "epoch": 2.0867549668874172, "grad_norm": 0.9033159801032535, "learning_rate": 0.0002807519150800026, "loss": 2.3125, "step": 3151 }, { "epoch": 2.087417218543046, "grad_norm": 0.8654264293321393, "learning_rate": 0.0002807191906480188, "loss": 2.0938, "step": 3152 }, { "epoch": 2.0880794701986756, "grad_norm": 0.7824684087055018, "learning_rate": 0.00028068644033213663, "loss": 2.2344, "step": 3153 }, { "epoch": 2.0887417218543045, "grad_norm": 0.9550076518351815, "learning_rate": 0.000280653664138841, "loss": 2.3594, "step": 3154 }, { "epoch": 2.089403973509934, "grad_norm": 0.9138409780467831, "learning_rate": 0.00028062086207462195, "loss": 2.1719, "step": 3155 }, { "epoch": 2.090066225165563, "grad_norm": 0.8675920601060703, "learning_rate": 0.0002805880341459747, "loss": 2.2969, "step": 3156 }, { "epoch": 2.0907284768211922, "grad_norm": 0.7901989534135593, "learning_rate": 0.0002805551803593995, "loss": 2.2031, "step": 3157 }, { "epoch": 2.091390728476821, "grad_norm": 0.8714151694948924, "learning_rate": 0.0002805223007214019, "loss": 2.2969, "step": 3158 }, { "epoch": 2.0920529801324506, "grad_norm": 0.7670837948045159, "learning_rate": 0.0002804893952384923, "loss": 1.8359, "step": 3159 }, { "epoch": 2.0927152317880795, "grad_norm": 0.8089192012055985, "learning_rate": 0.0002804564639171865, "loss": 2.2969, "step": 3160 }, { "epoch": 2.0933774834437084, "grad_norm": 0.6718848019256846, "learning_rate": 0.0002804235067640052, "loss": 1.6641, "step": 3161 }, { "epoch": 2.094039735099338, "grad_norm": 0.7069621690083887, "learning_rate": 0.00028039052378547424, "loss": 1.7891, "step": 3162 }, { "epoch": 2.0947019867549668, "grad_norm": 0.7467074023567285, "learning_rate": 0.00028035751498812475, "loss": 1.9062, "step": 3163 }, { "epoch": 2.095364238410596, "grad_norm": 0.7227779512173876, "learning_rate": 0.00028032448037849277, "loss": 1.6953, "step": 3164 }, { "epoch": 2.096026490066225, "grad_norm": 0.8376928072390296, "learning_rate": 0.0002802914199631195, "loss": 1.9922, "step": 3165 }, { "epoch": 2.0966887417218545, "grad_norm": 0.8034678708664186, "learning_rate": 0.0002802583337485514, "loss": 2.125, "step": 3166 }, { "epoch": 2.0973509933774834, "grad_norm": 0.8804627676317095, "learning_rate": 0.00028022522174133986, "loss": 2.4062, "step": 3167 }, { "epoch": 2.0980132450331124, "grad_norm": 0.9297674297291991, "learning_rate": 0.0002801920839480414, "loss": 2.1719, "step": 3168 }, { "epoch": 2.0986754966887418, "grad_norm": 0.8657994609469095, "learning_rate": 0.00028015892037521774, "loss": 2.3125, "step": 3169 }, { "epoch": 2.0993377483443707, "grad_norm": 0.7788842539952289, "learning_rate": 0.0002801257310294356, "loss": 2.0625, "step": 3170 }, { "epoch": 2.1, "grad_norm": 0.8869612062271457, "learning_rate": 0.0002800925159172669, "loss": 2.1719, "step": 3171 }, { "epoch": 2.100662251655629, "grad_norm": 0.8810528926701792, "learning_rate": 0.00028005927504528856, "loss": 2.5, "step": 3172 }, { "epoch": 2.1013245033112584, "grad_norm": 0.878707686483825, "learning_rate": 0.00028002600842008267, "loss": 2.2344, "step": 3173 }, { "epoch": 2.1019867549668874, "grad_norm": 0.8461113755429649, "learning_rate": 0.00027999271604823645, "loss": 2.0625, "step": 3174 }, { "epoch": 2.1026490066225167, "grad_norm": 0.7926432684279495, "learning_rate": 0.0002799593979363421, "loss": 2.375, "step": 3175 }, { "epoch": 2.1033112582781457, "grad_norm": 0.8229155785691155, "learning_rate": 0.00027992605409099704, "loss": 2.2812, "step": 3176 }, { "epoch": 2.1039735099337746, "grad_norm": 0.8720616583297459, "learning_rate": 0.00027989268451880374, "loss": 2.0781, "step": 3177 }, { "epoch": 2.104635761589404, "grad_norm": 0.9041400344498846, "learning_rate": 0.0002798592892263697, "loss": 2.5312, "step": 3178 }, { "epoch": 2.105298013245033, "grad_norm": 0.7435122171247798, "learning_rate": 0.0002798258682203076, "loss": 2.0312, "step": 3179 }, { "epoch": 2.1059602649006623, "grad_norm": 0.9496896881886252, "learning_rate": 0.0002797924215072352, "loss": 2.2969, "step": 3180 }, { "epoch": 2.1066225165562913, "grad_norm": 0.8611402563517535, "learning_rate": 0.0002797589490937754, "loss": 2.2656, "step": 3181 }, { "epoch": 2.1072847682119207, "grad_norm": 0.8187332982870288, "learning_rate": 0.0002797254509865559, "loss": 2.2344, "step": 3182 }, { "epoch": 2.1079470198675496, "grad_norm": 0.7861742796468008, "learning_rate": 0.00027969192719220993, "loss": 2.2188, "step": 3183 }, { "epoch": 2.108609271523179, "grad_norm": 0.8361914810712451, "learning_rate": 0.0002796583777173755, "loss": 1.9062, "step": 3184 }, { "epoch": 2.109271523178808, "grad_norm": 0.8873101087353407, "learning_rate": 0.0002796248025686958, "loss": 2.0312, "step": 3185 }, { "epoch": 2.109933774834437, "grad_norm": 0.7826078986265593, "learning_rate": 0.0002795912017528191, "loss": 1.9766, "step": 3186 }, { "epoch": 2.1105960264900663, "grad_norm": 0.7737710990102193, "learning_rate": 0.00027955757527639867, "loss": 1.9141, "step": 3187 }, { "epoch": 2.111258278145695, "grad_norm": 0.8069783206144504, "learning_rate": 0.0002795239231460931, "loss": 2.125, "step": 3188 }, { "epoch": 2.1119205298013246, "grad_norm": 0.8866085409493999, "learning_rate": 0.00027949024536856577, "loss": 2.4062, "step": 3189 }, { "epoch": 2.1125827814569536, "grad_norm": 0.8189520837471019, "learning_rate": 0.0002794565419504852, "loss": 2.125, "step": 3190 }, { "epoch": 2.113245033112583, "grad_norm": 0.8332274608180126, "learning_rate": 0.0002794228128985253, "loss": 2.2656, "step": 3191 }, { "epoch": 2.113907284768212, "grad_norm": 0.829259255187262, "learning_rate": 0.00027938905821936466, "loss": 2.25, "step": 3192 }, { "epoch": 2.1145695364238413, "grad_norm": 0.8144184450139884, "learning_rate": 0.000279355277919687, "loss": 1.9922, "step": 3193 }, { "epoch": 2.11523178807947, "grad_norm": 0.809308280103592, "learning_rate": 0.00027932147200618134, "loss": 1.8438, "step": 3194 }, { "epoch": 2.115894039735099, "grad_norm": 0.8177378772099362, "learning_rate": 0.00027928764048554164, "loss": 2.1562, "step": 3195 }, { "epoch": 2.1165562913907285, "grad_norm": 0.8935446760475563, "learning_rate": 0.0002792537833644668, "loss": 2.4531, "step": 3196 }, { "epoch": 2.1172185430463575, "grad_norm": 0.9399276644864418, "learning_rate": 0.00027921990064966106, "loss": 2.0625, "step": 3197 }, { "epoch": 2.117880794701987, "grad_norm": 0.7638756757537034, "learning_rate": 0.00027918599234783356, "loss": 1.6719, "step": 3198 }, { "epoch": 2.118543046357616, "grad_norm": 0.9448630519331106, "learning_rate": 0.0002791520584656985, "loss": 2.125, "step": 3199 }, { "epoch": 2.119205298013245, "grad_norm": 0.8166669794794548, "learning_rate": 0.00027911809900997515, "loss": 2.125, "step": 3200 }, { "epoch": 2.119867549668874, "grad_norm": 0.823302143531591, "learning_rate": 0.0002790841139873879, "loss": 1.8281, "step": 3201 }, { "epoch": 2.120529801324503, "grad_norm": 0.8108558188610947, "learning_rate": 0.00027905010340466616, "loss": 2.0781, "step": 3202 }, { "epoch": 2.1211920529801325, "grad_norm": 0.854018729909469, "learning_rate": 0.0002790160672685445, "loss": 2.4062, "step": 3203 }, { "epoch": 2.1218543046357614, "grad_norm": 0.8911762331053634, "learning_rate": 0.00027898200558576237, "loss": 1.8125, "step": 3204 }, { "epoch": 2.122516556291391, "grad_norm": 0.9254976710446435, "learning_rate": 0.0002789479183630644, "loss": 2.3438, "step": 3205 }, { "epoch": 2.1231788079470197, "grad_norm": 0.8610242415273918, "learning_rate": 0.00027891380560720024, "loss": 2.1875, "step": 3206 }, { "epoch": 2.123841059602649, "grad_norm": 0.8142693582448015, "learning_rate": 0.0002788796673249247, "loss": 2.1094, "step": 3207 }, { "epoch": 2.124503311258278, "grad_norm": 0.800838433199322, "learning_rate": 0.0002788455035229974, "loss": 2.3438, "step": 3208 }, { "epoch": 2.1251655629139075, "grad_norm": 0.7114633123115649, "learning_rate": 0.00027881131420818325, "loss": 1.9297, "step": 3209 }, { "epoch": 2.1258278145695364, "grad_norm": 0.7796314936291165, "learning_rate": 0.00027877709938725214, "loss": 1.8594, "step": 3210 }, { "epoch": 2.1264900662251653, "grad_norm": 0.9047998884298306, "learning_rate": 0.0002787428590669789, "loss": 2.5156, "step": 3211 }, { "epoch": 2.1271523178807947, "grad_norm": 0.8102270906604747, "learning_rate": 0.00027870859325414363, "loss": 2.1562, "step": 3212 }, { "epoch": 2.1278145695364237, "grad_norm": 0.8850990790499089, "learning_rate": 0.00027867430195553124, "loss": 2.1406, "step": 3213 }, { "epoch": 2.128476821192053, "grad_norm": 0.9041739049287638, "learning_rate": 0.000278639985177932, "loss": 2.2812, "step": 3214 }, { "epoch": 2.129139072847682, "grad_norm": 0.8249793512853625, "learning_rate": 0.00027860564292814076, "loss": 2.2188, "step": 3215 }, { "epoch": 2.1298013245033114, "grad_norm": 0.7939861418613539, "learning_rate": 0.0002785712752129578, "loss": 2.1562, "step": 3216 }, { "epoch": 2.1304635761589403, "grad_norm": 0.861220834938404, "learning_rate": 0.0002785368820391884, "loss": 2.3281, "step": 3217 }, { "epoch": 2.1311258278145697, "grad_norm": 0.7821094289211991, "learning_rate": 0.00027850246341364266, "loss": 2.0469, "step": 3218 }, { "epoch": 2.1317880794701987, "grad_norm": 0.7976263234562335, "learning_rate": 0.000278468019343136, "loss": 2.1875, "step": 3219 }, { "epoch": 2.1324503311258276, "grad_norm": 0.8077208724635396, "learning_rate": 0.00027843354983448867, "loss": 2.2344, "step": 3220 }, { "epoch": 2.133112582781457, "grad_norm": 0.7884602463280347, "learning_rate": 0.00027839905489452597, "loss": 2.0781, "step": 3221 }, { "epoch": 2.133774834437086, "grad_norm": 0.8085181043787424, "learning_rate": 0.00027836453453007837, "loss": 2.0156, "step": 3222 }, { "epoch": 2.1344370860927153, "grad_norm": 0.8606845761531365, "learning_rate": 0.0002783299887479813, "loss": 2.25, "step": 3223 }, { "epoch": 2.1350993377483443, "grad_norm": 0.9822583445688104, "learning_rate": 0.00027829541755507517, "loss": 2.3125, "step": 3224 }, { "epoch": 2.1357615894039736, "grad_norm": 0.8485399732284782, "learning_rate": 0.0002782608209582056, "loss": 2.2344, "step": 3225 }, { "epoch": 2.1364238410596026, "grad_norm": 0.9110699370383477, "learning_rate": 0.00027822619896422293, "loss": 2.5156, "step": 3226 }, { "epoch": 2.137086092715232, "grad_norm": 0.7998480976676432, "learning_rate": 0.0002781915515799828, "loss": 2.125, "step": 3227 }, { "epoch": 2.137748344370861, "grad_norm": 0.7787851916355009, "learning_rate": 0.0002781568788123458, "loss": 2.0156, "step": 3228 }, { "epoch": 2.13841059602649, "grad_norm": 0.8132257640511175, "learning_rate": 0.0002781221806681775, "loss": 2.0469, "step": 3229 }, { "epoch": 2.1390728476821192, "grad_norm": 0.8502061427062879, "learning_rate": 0.00027808745715434866, "loss": 2.1719, "step": 3230 }, { "epoch": 2.139735099337748, "grad_norm": 0.8815362692246591, "learning_rate": 0.00027805270827773473, "loss": 2.3594, "step": 3231 }, { "epoch": 2.1403973509933776, "grad_norm": 0.9117495523163015, "learning_rate": 0.00027801793404521653, "loss": 2.4219, "step": 3232 }, { "epoch": 2.1410596026490065, "grad_norm": 0.9410253026495625, "learning_rate": 0.0002779831344636797, "loss": 2.25, "step": 3233 }, { "epoch": 2.141721854304636, "grad_norm": 0.8857225887521325, "learning_rate": 0.00027794830954001496, "loss": 1.9766, "step": 3234 }, { "epoch": 2.142384105960265, "grad_norm": 0.8247598418075782, "learning_rate": 0.00027791345928111804, "loss": 2.3281, "step": 3235 }, { "epoch": 2.1430463576158942, "grad_norm": 0.798738937985844, "learning_rate": 0.00027787858369388973, "loss": 1.9453, "step": 3236 }, { "epoch": 2.143708609271523, "grad_norm": 0.8654240560178093, "learning_rate": 0.00027784368278523574, "loss": 2.2812, "step": 3237 }, { "epoch": 2.144370860927152, "grad_norm": 0.8855302169022395, "learning_rate": 0.0002778087565620669, "loss": 2.2188, "step": 3238 }, { "epoch": 2.1450331125827815, "grad_norm": 0.7676852723027048, "learning_rate": 0.00027777380503129893, "loss": 2.1562, "step": 3239 }, { "epoch": 2.1456953642384105, "grad_norm": 0.8381182494245285, "learning_rate": 0.0002777388281998528, "loss": 1.8828, "step": 3240 }, { "epoch": 2.14635761589404, "grad_norm": 0.7969592405784364, "learning_rate": 0.00027770382607465407, "loss": 1.8125, "step": 3241 }, { "epoch": 2.147019867549669, "grad_norm": 0.9181325691003049, "learning_rate": 0.00027766879866263376, "loss": 2.2344, "step": 3242 }, { "epoch": 2.147682119205298, "grad_norm": 0.8671542383801918, "learning_rate": 0.0002776337459707277, "loss": 2.1094, "step": 3243 }, { "epoch": 2.148344370860927, "grad_norm": 0.8123017923607242, "learning_rate": 0.0002775986680058766, "loss": 2.2188, "step": 3244 }, { "epoch": 2.1490066225165565, "grad_norm": 0.8090885548036824, "learning_rate": 0.0002775635647750264, "loss": 1.8906, "step": 3245 }, { "epoch": 2.1496688741721854, "grad_norm": 0.8521149077192472, "learning_rate": 0.00027752843628512785, "loss": 2.1719, "step": 3246 }, { "epoch": 2.1503311258278144, "grad_norm": 0.8645924082368771, "learning_rate": 0.0002774932825431369, "loss": 1.9375, "step": 3247 }, { "epoch": 2.1509933774834438, "grad_norm": 0.9222697095775695, "learning_rate": 0.00027745810355601425, "loss": 2.4688, "step": 3248 }, { "epoch": 2.1516556291390727, "grad_norm": 0.846568195720941, "learning_rate": 0.000277422899330726, "loss": 2.0312, "step": 3249 }, { "epoch": 2.152317880794702, "grad_norm": 0.9426840800579832, "learning_rate": 0.0002773876698742427, "loss": 2.5312, "step": 3250 }, { "epoch": 2.152980132450331, "grad_norm": 0.860878534310882, "learning_rate": 0.0002773524151935403, "loss": 2.375, "step": 3251 }, { "epoch": 2.1536423841059604, "grad_norm": 0.8100590988887295, "learning_rate": 0.0002773171352955997, "loss": 2.2031, "step": 3252 }, { "epoch": 2.1543046357615894, "grad_norm": 0.8948308204547479, "learning_rate": 0.0002772818301874067, "loss": 2.3906, "step": 3253 }, { "epoch": 2.1549668874172188, "grad_norm": 0.7919869371471651, "learning_rate": 0.0002772464998759521, "loss": 2.1875, "step": 3254 }, { "epoch": 2.1556291390728477, "grad_norm": 0.8043753337644759, "learning_rate": 0.0002772111443682316, "loss": 1.9844, "step": 3255 }, { "epoch": 2.1562913907284766, "grad_norm": 0.7974534895558583, "learning_rate": 0.0002771757636712461, "loss": 2.2031, "step": 3256 }, { "epoch": 2.156953642384106, "grad_norm": 0.7741637095045127, "learning_rate": 0.00027714035779200137, "loss": 1.7891, "step": 3257 }, { "epoch": 2.157615894039735, "grad_norm": 0.882262510540357, "learning_rate": 0.00027710492673750826, "loss": 2.2969, "step": 3258 }, { "epoch": 2.1582781456953644, "grad_norm": 0.8788913504587818, "learning_rate": 0.00027706947051478237, "loss": 2.2656, "step": 3259 }, { "epoch": 2.1589403973509933, "grad_norm": 0.8957276353185788, "learning_rate": 0.00027703398913084455, "loss": 2.3594, "step": 3260 }, { "epoch": 2.1596026490066227, "grad_norm": 0.826726458575409, "learning_rate": 0.00027699848259272043, "loss": 2.3281, "step": 3261 }, { "epoch": 2.1602649006622516, "grad_norm": 0.792872375528584, "learning_rate": 0.00027696295090744085, "loss": 2.0938, "step": 3262 }, { "epoch": 2.160927152317881, "grad_norm": 0.8331968500724979, "learning_rate": 0.00027692739408204136, "loss": 2.3594, "step": 3263 }, { "epoch": 2.16158940397351, "grad_norm": 0.7736567263164781, "learning_rate": 0.00027689181212356266, "loss": 1.8594, "step": 3264 }, { "epoch": 2.162251655629139, "grad_norm": 0.852998208040114, "learning_rate": 0.00027685620503905043, "loss": 2.4531, "step": 3265 }, { "epoch": 2.1629139072847683, "grad_norm": 0.8171302746452069, "learning_rate": 0.0002768205728355552, "loss": 2.2812, "step": 3266 }, { "epoch": 2.1635761589403972, "grad_norm": 0.7962997632248591, "learning_rate": 0.0002767849155201326, "loss": 2.1094, "step": 3267 }, { "epoch": 2.1642384105960266, "grad_norm": 0.8250086457383703, "learning_rate": 0.0002767492330998432, "loss": 2.3594, "step": 3268 }, { "epoch": 2.1649006622516556, "grad_norm": 0.7382925284379064, "learning_rate": 0.00027671352558175254, "loss": 2.0469, "step": 3269 }, { "epoch": 2.165562913907285, "grad_norm": 0.8640181501937213, "learning_rate": 0.00027667779297293105, "loss": 2.1719, "step": 3270 }, { "epoch": 2.166225165562914, "grad_norm": 0.8040154250911639, "learning_rate": 0.00027664203528045424, "loss": 2.0625, "step": 3271 }, { "epoch": 2.1668874172185433, "grad_norm": 0.8007769382247858, "learning_rate": 0.00027660625251140256, "loss": 2.2969, "step": 3272 }, { "epoch": 2.167549668874172, "grad_norm": 0.9299384799368382, "learning_rate": 0.0002765704446728614, "loss": 2.4531, "step": 3273 }, { "epoch": 2.168211920529801, "grad_norm": 0.8825890929324891, "learning_rate": 0.0002765346117719211, "loss": 2.1875, "step": 3274 }, { "epoch": 2.1688741721854305, "grad_norm": 0.852743835362979, "learning_rate": 0.00027649875381567695, "loss": 2.0156, "step": 3275 }, { "epoch": 2.1695364238410595, "grad_norm": 0.7881463871581443, "learning_rate": 0.00027646287081122933, "loss": 2.2344, "step": 3276 }, { "epoch": 2.170198675496689, "grad_norm": 0.8206073467339754, "learning_rate": 0.00027642696276568345, "loss": 2.125, "step": 3277 }, { "epoch": 2.170860927152318, "grad_norm": 0.8281263956557895, "learning_rate": 0.0002763910296861495, "loss": 2.1875, "step": 3278 }, { "epoch": 2.171523178807947, "grad_norm": 0.882125613475579, "learning_rate": 0.0002763550715797426, "loss": 2.3594, "step": 3279 }, { "epoch": 2.172185430463576, "grad_norm": 0.8278852714288011, "learning_rate": 0.00027631908845358297, "loss": 2.0469, "step": 3280 }, { "epoch": 2.172847682119205, "grad_norm": 0.785570592764887, "learning_rate": 0.0002762830803147956, "loss": 2.0312, "step": 3281 }, { "epoch": 2.1735099337748345, "grad_norm": 0.7919405724412538, "learning_rate": 0.0002762470471705105, "loss": 2.0938, "step": 3282 }, { "epoch": 2.1741721854304634, "grad_norm": 0.837810773588796, "learning_rate": 0.0002762109890278628, "loss": 2.3906, "step": 3283 }, { "epoch": 2.174834437086093, "grad_norm": 0.7872660246620158, "learning_rate": 0.00027617490589399227, "loss": 2.0938, "step": 3284 }, { "epoch": 2.1754966887417218, "grad_norm": 0.8323316699109079, "learning_rate": 0.00027613879777604373, "loss": 1.9922, "step": 3285 }, { "epoch": 2.176158940397351, "grad_norm": 0.8018992494949806, "learning_rate": 0.0002761026646811672, "loss": 2.0156, "step": 3286 }, { "epoch": 2.17682119205298, "grad_norm": 0.8302308794928105, "learning_rate": 0.0002760665066165173, "loss": 2.1875, "step": 3287 }, { "epoch": 2.1774834437086095, "grad_norm": 0.7558837631959101, "learning_rate": 0.00027603032358925383, "loss": 2.1406, "step": 3288 }, { "epoch": 2.1781456953642384, "grad_norm": 0.7632790741462315, "learning_rate": 0.0002759941156065413, "loss": 1.9688, "step": 3289 }, { "epoch": 2.1788079470198674, "grad_norm": 0.7733118085391162, "learning_rate": 0.0002759578826755495, "loss": 2.0469, "step": 3290 }, { "epoch": 2.1794701986754967, "grad_norm": 0.8137496040176969, "learning_rate": 0.00027592162480345283, "loss": 2.0781, "step": 3291 }, { "epoch": 2.1801324503311257, "grad_norm": 0.99050130317897, "learning_rate": 0.0002758853419974308, "loss": 2.4219, "step": 3292 }, { "epoch": 2.180794701986755, "grad_norm": 0.8032197797888396, "learning_rate": 0.0002758490342646678, "loss": 2.2188, "step": 3293 }, { "epoch": 2.181456953642384, "grad_norm": 0.8396491976663559, "learning_rate": 0.0002758127016123533, "loss": 2.3594, "step": 3294 }, { "epoch": 2.1821192052980134, "grad_norm": 0.8255061920189865, "learning_rate": 0.0002757763440476814, "loss": 2.4375, "step": 3295 }, { "epoch": 2.1827814569536423, "grad_norm": 0.8489285029923598, "learning_rate": 0.0002757399615778514, "loss": 1.8516, "step": 3296 }, { "epoch": 2.1834437086092717, "grad_norm": 0.8508286114644342, "learning_rate": 0.00027570355421006746, "loss": 2.1562, "step": 3297 }, { "epoch": 2.1841059602649007, "grad_norm": 0.7562689755418857, "learning_rate": 0.00027566712195153866, "loss": 1.9922, "step": 3298 }, { "epoch": 2.1847682119205296, "grad_norm": 0.8787242466692003, "learning_rate": 0.000275630664809479, "loss": 2.2969, "step": 3299 }, { "epoch": 2.185430463576159, "grad_norm": 0.9344491983192744, "learning_rate": 0.00027559418279110733, "loss": 2.1875, "step": 3300 }, { "epoch": 2.186092715231788, "grad_norm": 0.8252876463159317, "learning_rate": 0.0002755576759036476, "loss": 2.2188, "step": 3301 }, { "epoch": 2.1867549668874173, "grad_norm": 0.7956282359832736, "learning_rate": 0.00027552114415432864, "loss": 2.2031, "step": 3302 }, { "epoch": 2.1874172185430463, "grad_norm": 0.7222801729364425, "learning_rate": 0.00027548458755038404, "loss": 1.7344, "step": 3303 }, { "epoch": 2.1880794701986757, "grad_norm": 0.7940695992711349, "learning_rate": 0.00027544800609905244, "loss": 1.9766, "step": 3304 }, { "epoch": 2.1887417218543046, "grad_norm": 0.8861022600103349, "learning_rate": 0.0002754113998075775, "loss": 1.9766, "step": 3305 }, { "epoch": 2.1894039735099335, "grad_norm": 1.0031515963194808, "learning_rate": 0.0002753747686832075, "loss": 2.3281, "step": 3306 }, { "epoch": 2.190066225165563, "grad_norm": 0.7601196397115809, "learning_rate": 0.000275338112733196, "loss": 2.0625, "step": 3307 }, { "epoch": 2.190728476821192, "grad_norm": 0.8613390575439762, "learning_rate": 0.0002753014319648012, "loss": 2.2344, "step": 3308 }, { "epoch": 2.1913907284768213, "grad_norm": 0.7556465928778803, "learning_rate": 0.00027526472638528634, "loss": 1.8125, "step": 3309 }, { "epoch": 2.19205298013245, "grad_norm": 0.8319931456644523, "learning_rate": 0.0002752279960019195, "loss": 2.1094, "step": 3310 }, { "epoch": 2.1927152317880796, "grad_norm": 0.7421980487514149, "learning_rate": 0.0002751912408219738, "loss": 1.7422, "step": 3311 }, { "epoch": 2.1933774834437085, "grad_norm": 0.8275927958504509, "learning_rate": 0.0002751544608527272, "loss": 2.1875, "step": 3312 }, { "epoch": 2.194039735099338, "grad_norm": 0.8507836818395664, "learning_rate": 0.0002751176561014624, "loss": 2.2969, "step": 3313 }, { "epoch": 2.194701986754967, "grad_norm": 0.8285849358506636, "learning_rate": 0.00027508082657546734, "loss": 2.3281, "step": 3314 }, { "epoch": 2.195364238410596, "grad_norm": 0.766382384247634, "learning_rate": 0.00027504397228203455, "loss": 1.9453, "step": 3315 }, { "epoch": 2.196026490066225, "grad_norm": 0.8843905447887204, "learning_rate": 0.00027500709322846175, "loss": 2.5156, "step": 3316 }, { "epoch": 2.196688741721854, "grad_norm": 0.7845108208485224, "learning_rate": 0.00027497018942205126, "loss": 2.25, "step": 3317 }, { "epoch": 2.1973509933774835, "grad_norm": 0.8256990756936016, "learning_rate": 0.00027493326087011055, "loss": 2.2031, "step": 3318 }, { "epoch": 2.1980132450331125, "grad_norm": 0.7968136572426746, "learning_rate": 0.0002748963075799519, "loss": 2.2812, "step": 3319 }, { "epoch": 2.198675496688742, "grad_norm": 0.8327481165603114, "learning_rate": 0.0002748593295588924, "loss": 2.25, "step": 3320 }, { "epoch": 2.199337748344371, "grad_norm": 0.8198669467615958, "learning_rate": 0.00027482232681425424, "loss": 2.25, "step": 3321 }, { "epoch": 2.2, "grad_norm": 0.7417890205642291, "learning_rate": 0.0002747852993533643, "loss": 1.9688, "step": 3322 }, { "epoch": 2.200662251655629, "grad_norm": 0.7768295405464892, "learning_rate": 0.00027474824718355445, "loss": 2.0938, "step": 3323 }, { "epoch": 2.201324503311258, "grad_norm": 0.9182396334096297, "learning_rate": 0.00027471117031216153, "loss": 2.3906, "step": 3324 }, { "epoch": 2.2019867549668874, "grad_norm": 0.9132219842886142, "learning_rate": 0.00027467406874652706, "loss": 2.1406, "step": 3325 }, { "epoch": 2.2026490066225164, "grad_norm": 0.8051668784575603, "learning_rate": 0.00027463694249399765, "loss": 1.8438, "step": 3326 }, { "epoch": 2.203311258278146, "grad_norm": 0.8779608231869319, "learning_rate": 0.0002745997915619247, "loss": 2.4062, "step": 3327 }, { "epoch": 2.2039735099337747, "grad_norm": 0.7657089739207261, "learning_rate": 0.0002745626159576646, "loss": 1.6797, "step": 3328 }, { "epoch": 2.204635761589404, "grad_norm": 0.8485164960940961, "learning_rate": 0.00027452541568857847, "loss": 2.0156, "step": 3329 }, { "epoch": 2.205298013245033, "grad_norm": 0.8355818547700689, "learning_rate": 0.0002744881907620323, "loss": 2.1875, "step": 3330 }, { "epoch": 2.2059602649006624, "grad_norm": 0.8558865706646701, "learning_rate": 0.0002744509411853972, "loss": 2.2656, "step": 3331 }, { "epoch": 2.2066225165562914, "grad_norm": 0.7968607240104667, "learning_rate": 0.00027441366696604895, "loss": 2.3438, "step": 3332 }, { "epoch": 2.2072847682119203, "grad_norm": 0.8115129909848648, "learning_rate": 0.0002743763681113683, "loss": 2.1719, "step": 3333 }, { "epoch": 2.2079470198675497, "grad_norm": 0.7745359407731947, "learning_rate": 0.0002743390446287409, "loss": 2.2812, "step": 3334 }, { "epoch": 2.2086092715231787, "grad_norm": 0.8870758405984903, "learning_rate": 0.00027430169652555703, "loss": 2.6406, "step": 3335 }, { "epoch": 2.209271523178808, "grad_norm": 0.8388509120974267, "learning_rate": 0.0002742643238092123, "loss": 2.375, "step": 3336 }, { "epoch": 2.209933774834437, "grad_norm": 0.7701954031581828, "learning_rate": 0.00027422692648710673, "loss": 2.1719, "step": 3337 }, { "epoch": 2.2105960264900664, "grad_norm": 0.7700654296594206, "learning_rate": 0.0002741895045666455, "loss": 2.0781, "step": 3338 }, { "epoch": 2.2112582781456953, "grad_norm": 0.7572760178076289, "learning_rate": 0.0002741520580552386, "loss": 1.9766, "step": 3339 }, { "epoch": 2.2119205298013247, "grad_norm": 0.7811646489989366, "learning_rate": 0.0002741145869603009, "loss": 1.9766, "step": 3340 }, { "epoch": 2.2125827814569536, "grad_norm": 0.7652073370408187, "learning_rate": 0.000274077091289252, "loss": 1.6328, "step": 3341 }, { "epoch": 2.2132450331125826, "grad_norm": 0.735328194049772, "learning_rate": 0.0002740395710495165, "loss": 1.8203, "step": 3342 }, { "epoch": 2.213907284768212, "grad_norm": 0.8286038328340698, "learning_rate": 0.00027400202624852394, "loss": 2.0625, "step": 3343 }, { "epoch": 2.214569536423841, "grad_norm": 0.8768491364725021, "learning_rate": 0.0002739644568937085, "loss": 2.2031, "step": 3344 }, { "epoch": 2.2152317880794703, "grad_norm": 0.8552570456101776, "learning_rate": 0.0002739268629925094, "loss": 2.2656, "step": 3345 }, { "epoch": 2.2158940397350992, "grad_norm": 0.9068003321966285, "learning_rate": 0.00027388924455237063, "loss": 2.125, "step": 3346 }, { "epoch": 2.2165562913907286, "grad_norm": 0.8675120392327351, "learning_rate": 0.00027385160158074115, "loss": 2.1094, "step": 3347 }, { "epoch": 2.2172185430463576, "grad_norm": 0.8327555600723997, "learning_rate": 0.00027381393408507456, "loss": 2.0781, "step": 3348 }, { "epoch": 2.217880794701987, "grad_norm": 0.8418742450561617, "learning_rate": 0.0002737762420728296, "loss": 2.2188, "step": 3349 }, { "epoch": 2.218543046357616, "grad_norm": 0.8699429060159691, "learning_rate": 0.0002737385255514697, "loss": 2.3906, "step": 3350 }, { "epoch": 2.219205298013245, "grad_norm": 0.8509907393944772, "learning_rate": 0.000273700784528463, "loss": 2.3438, "step": 3351 }, { "epoch": 2.2198675496688742, "grad_norm": 0.8841108347587866, "learning_rate": 0.0002736630190112829, "loss": 2.0938, "step": 3352 }, { "epoch": 2.220529801324503, "grad_norm": 0.8334422899623337, "learning_rate": 0.0002736252290074072, "loss": 1.9453, "step": 3353 }, { "epoch": 2.2211920529801326, "grad_norm": 0.7592264525080054, "learning_rate": 0.0002735874145243189, "loss": 1.8125, "step": 3354 }, { "epoch": 2.2218543046357615, "grad_norm": 0.7825530875598116, "learning_rate": 0.0002735495755695056, "loss": 2.2344, "step": 3355 }, { "epoch": 2.222516556291391, "grad_norm": 0.8848044485857259, "learning_rate": 0.0002735117121504599, "loss": 2.3438, "step": 3356 }, { "epoch": 2.22317880794702, "grad_norm": 0.8737085402343399, "learning_rate": 0.0002734738242746792, "loss": 2.2812, "step": 3357 }, { "epoch": 2.223841059602649, "grad_norm": 0.8442083587277228, "learning_rate": 0.0002734359119496657, "loss": 2.2344, "step": 3358 }, { "epoch": 2.224503311258278, "grad_norm": 0.7677761452029833, "learning_rate": 0.0002733979751829265, "loss": 2.0781, "step": 3359 }, { "epoch": 2.225165562913907, "grad_norm": 0.8103208074750657, "learning_rate": 0.0002733600139819735, "loss": 2.3594, "step": 3360 }, { "epoch": 2.2258278145695365, "grad_norm": 0.7631405913554564, "learning_rate": 0.00027332202835432344, "loss": 2.0938, "step": 3361 }, { "epoch": 2.2264900662251654, "grad_norm": 0.8657307515518499, "learning_rate": 0.0002732840183074979, "loss": 2.2969, "step": 3362 }, { "epoch": 2.227152317880795, "grad_norm": 0.8110410252870841, "learning_rate": 0.00027324598384902333, "loss": 2.1406, "step": 3363 }, { "epoch": 2.2278145695364238, "grad_norm": 0.8640540740701811, "learning_rate": 0.000273207924986431, "loss": 2.0312, "step": 3364 }, { "epoch": 2.228476821192053, "grad_norm": 0.7989624386226248, "learning_rate": 0.00027316984172725704, "loss": 2.2031, "step": 3365 }, { "epoch": 2.229139072847682, "grad_norm": 0.7494145080518465, "learning_rate": 0.0002731317340790423, "loss": 2.1406, "step": 3366 }, { "epoch": 2.2298013245033115, "grad_norm": 0.8311134685557905, "learning_rate": 0.00027309360204933256, "loss": 2.0938, "step": 3367 }, { "epoch": 2.2304635761589404, "grad_norm": 0.7508644414489907, "learning_rate": 0.0002730554456456783, "loss": 2.1406, "step": 3368 }, { "epoch": 2.2311258278145694, "grad_norm": 0.7550064483810309, "learning_rate": 0.00027301726487563517, "loss": 1.9453, "step": 3369 }, { "epoch": 2.2317880794701987, "grad_norm": 0.7435367177173581, "learning_rate": 0.00027297905974676316, "loss": 1.9297, "step": 3370 }, { "epoch": 2.2324503311258277, "grad_norm": 0.8487484380639131, "learning_rate": 0.0002729408302666274, "loss": 2.1875, "step": 3371 }, { "epoch": 2.233112582781457, "grad_norm": 0.7569325897381546, "learning_rate": 0.0002729025764427978, "loss": 2.0156, "step": 3372 }, { "epoch": 2.233774834437086, "grad_norm": 0.8441153267497679, "learning_rate": 0.000272864298282849, "loss": 2.2188, "step": 3373 }, { "epoch": 2.2344370860927154, "grad_norm": 0.8027582806848588, "learning_rate": 0.0002728259957943606, "loss": 2.1562, "step": 3374 }, { "epoch": 2.2350993377483444, "grad_norm": 0.8066221003452809, "learning_rate": 0.00027278766898491686, "loss": 2.0781, "step": 3375 }, { "epoch": 2.2357615894039737, "grad_norm": 0.8281426998927284, "learning_rate": 0.00027274931786210694, "loss": 2.2188, "step": 3376 }, { "epoch": 2.2364238410596027, "grad_norm": 0.8218323511052216, "learning_rate": 0.0002727109424335248, "loss": 2.2031, "step": 3377 }, { "epoch": 2.2370860927152316, "grad_norm": 0.7743687151133861, "learning_rate": 0.00027267254270676923, "loss": 2.1406, "step": 3378 }, { "epoch": 2.237748344370861, "grad_norm": 0.794579298472305, "learning_rate": 0.0002726341186894438, "loss": 1.8047, "step": 3379 }, { "epoch": 2.23841059602649, "grad_norm": 0.8208912133462046, "learning_rate": 0.00027259567038915694, "loss": 2.3281, "step": 3380 }, { "epoch": 2.2390728476821193, "grad_norm": 0.8330499760238012, "learning_rate": 0.0002725571978135218, "loss": 2.0469, "step": 3381 }, { "epoch": 2.2397350993377483, "grad_norm": 0.7804178423215485, "learning_rate": 0.00027251870097015645, "loss": 2.0781, "step": 3382 }, { "epoch": 2.2403973509933777, "grad_norm": 0.7975006795801435, "learning_rate": 0.0002724801798666837, "loss": 2.2656, "step": 3383 }, { "epoch": 2.2410596026490066, "grad_norm": 0.8617674106901189, "learning_rate": 0.00027244163451073107, "loss": 2.2969, "step": 3384 }, { "epoch": 2.241721854304636, "grad_norm": 0.7248057874259806, "learning_rate": 0.0002724030649099311, "loss": 1.7344, "step": 3385 }, { "epoch": 2.242384105960265, "grad_norm": 0.7608766067926409, "learning_rate": 0.000272364471071921, "loss": 2.0312, "step": 3386 }, { "epoch": 2.243046357615894, "grad_norm": 0.8349490815247091, "learning_rate": 0.00027232585300434274, "loss": 2.1406, "step": 3387 }, { "epoch": 2.2437086092715233, "grad_norm": 0.8948656507471446, "learning_rate": 0.0002722872107148431, "loss": 2.2969, "step": 3388 }, { "epoch": 2.244370860927152, "grad_norm": 0.8515136297390447, "learning_rate": 0.0002722485442110739, "loss": 2.1094, "step": 3389 }, { "epoch": 2.2450331125827816, "grad_norm": 0.7520520842508782, "learning_rate": 0.0002722098535006913, "loss": 1.9453, "step": 3390 }, { "epoch": 2.2456953642384105, "grad_norm": 0.7633417223611352, "learning_rate": 0.00027217113859135663, "loss": 2.0312, "step": 3391 }, { "epoch": 2.24635761589404, "grad_norm": 0.8743428750761952, "learning_rate": 0.0002721323994907359, "loss": 2.5312, "step": 3392 }, { "epoch": 2.247019867549669, "grad_norm": 0.8852405185858262, "learning_rate": 0.00027209363620649986, "loss": 2.2969, "step": 3393 }, { "epoch": 2.247682119205298, "grad_norm": 0.8798641678710732, "learning_rate": 0.0002720548487463241, "loss": 2.4688, "step": 3394 }, { "epoch": 2.248344370860927, "grad_norm": 0.7365239863128915, "learning_rate": 0.000272016037117889, "loss": 1.75, "step": 3395 }, { "epoch": 2.249006622516556, "grad_norm": 0.835785839902504, "learning_rate": 0.00027197720132887964, "loss": 1.8516, "step": 3396 }, { "epoch": 2.2496688741721855, "grad_norm": 0.7508971485986422, "learning_rate": 0.000271938341386986, "loss": 1.9297, "step": 3397 }, { "epoch": 2.2503311258278145, "grad_norm": 0.777671344037389, "learning_rate": 0.0002718994572999028, "loss": 2.2344, "step": 3398 }, { "epoch": 2.250993377483444, "grad_norm": 0.7987089376070218, "learning_rate": 0.0002718605490753295, "loss": 2.1094, "step": 3399 }, { "epoch": 2.251655629139073, "grad_norm": 0.7933753326309906, "learning_rate": 0.00027182161672097046, "loss": 2.1406, "step": 3400 }, { "epoch": 2.2523178807947017, "grad_norm": 0.7271138779708894, "learning_rate": 0.0002717826602445346, "loss": 2.0156, "step": 3401 }, { "epoch": 2.252980132450331, "grad_norm": 0.8040104440661278, "learning_rate": 0.0002717436796537358, "loss": 2.1875, "step": 3402 }, { "epoch": 2.25364238410596, "grad_norm": 0.9207808506519045, "learning_rate": 0.0002717046749562928, "loss": 2.4844, "step": 3403 }, { "epoch": 2.2543046357615895, "grad_norm": 0.7615877458930103, "learning_rate": 0.00027166564615992877, "loss": 2.0312, "step": 3404 }, { "epoch": 2.2549668874172184, "grad_norm": 0.8909187387123997, "learning_rate": 0.00027162659327237195, "loss": 2.1562, "step": 3405 }, { "epoch": 2.255629139072848, "grad_norm": 0.8784966515385724, "learning_rate": 0.0002715875163013553, "loss": 2.4844, "step": 3406 }, { "epoch": 2.2562913907284767, "grad_norm": 0.798228395516519, "learning_rate": 0.0002715484152546164, "loss": 2.1719, "step": 3407 }, { "epoch": 2.256953642384106, "grad_norm": 0.8096575769378106, "learning_rate": 0.0002715092901398978, "loss": 2.1562, "step": 3408 }, { "epoch": 2.257615894039735, "grad_norm": 0.8263312669295252, "learning_rate": 0.00027147014096494675, "loss": 2.25, "step": 3409 }, { "epoch": 2.258278145695364, "grad_norm": 0.8032506928755974, "learning_rate": 0.0002714309677375151, "loss": 1.7578, "step": 3410 }, { "epoch": 2.2589403973509934, "grad_norm": 0.8118776506411731, "learning_rate": 0.00027139177046535976, "loss": 2.0781, "step": 3411 }, { "epoch": 2.2596026490066223, "grad_norm": 0.7489189597481541, "learning_rate": 0.0002713525491562421, "loss": 2.25, "step": 3412 }, { "epoch": 2.2602649006622517, "grad_norm": 0.8356459198460788, "learning_rate": 0.00027131330381792847, "loss": 2.125, "step": 3413 }, { "epoch": 2.2609271523178807, "grad_norm": 0.8126627671433717, "learning_rate": 0.00027127403445818983, "loss": 2.2656, "step": 3414 }, { "epoch": 2.26158940397351, "grad_norm": 0.8761888272040123, "learning_rate": 0.00027123474108480206, "loss": 2.1562, "step": 3415 }, { "epoch": 2.262251655629139, "grad_norm": 0.9409405461740266, "learning_rate": 0.0002711954237055456, "loss": 2.6406, "step": 3416 }, { "epoch": 2.2629139072847684, "grad_norm": 0.8472126965510226, "learning_rate": 0.0002711560823282058, "loss": 2.2031, "step": 3417 }, { "epoch": 2.2635761589403973, "grad_norm": 0.8420486264164412, "learning_rate": 0.00027111671696057274, "loss": 2.3125, "step": 3418 }, { "epoch": 2.2642384105960263, "grad_norm": 0.7985861220922338, "learning_rate": 0.0002710773276104411, "loss": 2.2344, "step": 3419 }, { "epoch": 2.2649006622516556, "grad_norm": 0.810427869583552, "learning_rate": 0.0002710379142856105, "loss": 2.4531, "step": 3420 }, { "epoch": 2.2655629139072846, "grad_norm": 0.8618888251122615, "learning_rate": 0.0002709984769938852, "loss": 2.4688, "step": 3421 }, { "epoch": 2.266225165562914, "grad_norm": 0.8198747872523499, "learning_rate": 0.0002709590157430743, "loss": 2.0781, "step": 3422 }, { "epoch": 2.266887417218543, "grad_norm": 0.778355310806136, "learning_rate": 0.0002709195305409915, "loss": 2.1719, "step": 3423 }, { "epoch": 2.2675496688741723, "grad_norm": 0.6993554432064756, "learning_rate": 0.0002708800213954554, "loss": 1.7344, "step": 3424 }, { "epoch": 2.2682119205298013, "grad_norm": 0.8238243243100243, "learning_rate": 0.0002708404883142891, "loss": 2.1875, "step": 3425 }, { "epoch": 2.2688741721854306, "grad_norm": 0.8666749818245352, "learning_rate": 0.00027080093130532086, "loss": 2.1406, "step": 3426 }, { "epoch": 2.2695364238410596, "grad_norm": 0.8421336644096673, "learning_rate": 0.00027076135037638324, "loss": 2.4062, "step": 3427 }, { "epoch": 2.2701986754966885, "grad_norm": 0.8559991768060545, "learning_rate": 0.00027072174553531374, "loss": 2.3438, "step": 3428 }, { "epoch": 2.270860927152318, "grad_norm": 0.8303548829817005, "learning_rate": 0.0002706821167899546, "loss": 2.25, "step": 3429 }, { "epoch": 2.271523178807947, "grad_norm": 0.8382802285375041, "learning_rate": 0.0002706424641481528, "loss": 2.4062, "step": 3430 }, { "epoch": 2.2721854304635762, "grad_norm": 0.8803112071613581, "learning_rate": 0.0002706027876177599, "loss": 2.0156, "step": 3431 }, { "epoch": 2.272847682119205, "grad_norm": 0.7720090301112768, "learning_rate": 0.0002705630872066324, "loss": 2.0469, "step": 3432 }, { "epoch": 2.2735099337748346, "grad_norm": 0.7653652268116603, "learning_rate": 0.0002705233629226315, "loss": 2.1406, "step": 3433 }, { "epoch": 2.2741721854304635, "grad_norm": 0.8053940854900532, "learning_rate": 0.0002704836147736229, "loss": 2.3594, "step": 3434 }, { "epoch": 2.274834437086093, "grad_norm": 0.7385875211216661, "learning_rate": 0.00027044384276747737, "loss": 2.0469, "step": 3435 }, { "epoch": 2.275496688741722, "grad_norm": 0.8804774699265401, "learning_rate": 0.00027040404691207006, "loss": 2.2656, "step": 3436 }, { "epoch": 2.276158940397351, "grad_norm": 0.8636738229663521, "learning_rate": 0.0002703642272152811, "loss": 2.1406, "step": 3437 }, { "epoch": 2.27682119205298, "grad_norm": 0.8235057521386625, "learning_rate": 0.0002703243836849952, "loss": 2.25, "step": 3438 }, { "epoch": 2.277483443708609, "grad_norm": 0.8204884388212093, "learning_rate": 0.00027028451632910185, "loss": 2.4062, "step": 3439 }, { "epoch": 2.2781456953642385, "grad_norm": 0.7989171557033451, "learning_rate": 0.00027024462515549524, "loss": 2.2031, "step": 3440 }, { "epoch": 2.2788079470198674, "grad_norm": 0.7493318433626527, "learning_rate": 0.0002702047101720744, "loss": 1.7266, "step": 3441 }, { "epoch": 2.279470198675497, "grad_norm": 0.7815136002279841, "learning_rate": 0.0002701647713867427, "loss": 1.9922, "step": 3442 }, { "epoch": 2.2801324503311258, "grad_norm": 0.8637766627426595, "learning_rate": 0.0002701248088074087, "loss": 2.4844, "step": 3443 }, { "epoch": 2.280794701986755, "grad_norm": 0.8447658776345155, "learning_rate": 0.00027008482244198536, "loss": 2.3438, "step": 3444 }, { "epoch": 2.281456953642384, "grad_norm": 0.7316717558792969, "learning_rate": 0.0002700448122983904, "loss": 1.8906, "step": 3445 }, { "epoch": 2.282119205298013, "grad_norm": 0.7467793634492488, "learning_rate": 0.0002700047783845464, "loss": 2.1719, "step": 3446 }, { "epoch": 2.2827814569536424, "grad_norm": 0.8118183386161748, "learning_rate": 0.00026996472070838043, "loss": 2.2656, "step": 3447 }, { "epoch": 2.2834437086092714, "grad_norm": 0.8660195484110779, "learning_rate": 0.0002699246392778244, "loss": 2.3438, "step": 3448 }, { "epoch": 2.2841059602649008, "grad_norm": 0.82448801922249, "learning_rate": 0.000269884534100815, "loss": 1.9297, "step": 3449 }, { "epoch": 2.2847682119205297, "grad_norm": 0.8362884112837178, "learning_rate": 0.0002698444051852933, "loss": 2.3438, "step": 3450 }, { "epoch": 2.285430463576159, "grad_norm": 0.8277670515587894, "learning_rate": 0.00026980425253920543, "loss": 2.3906, "step": 3451 }, { "epoch": 2.286092715231788, "grad_norm": 0.8090071707804996, "learning_rate": 0.00026976407617050206, "loss": 2.1562, "step": 3452 }, { "epoch": 2.2867549668874174, "grad_norm": 0.7835278686907772, "learning_rate": 0.0002697238760871386, "loss": 2.2812, "step": 3453 }, { "epoch": 2.2874172185430464, "grad_norm": 0.8842711401990846, "learning_rate": 0.00026968365229707497, "loss": 2.4531, "step": 3454 }, { "epoch": 2.2880794701986753, "grad_norm": 0.9212893370067547, "learning_rate": 0.00026964340480827617, "loss": 2.125, "step": 3455 }, { "epoch": 2.2887417218543047, "grad_norm": 0.8342558015598699, "learning_rate": 0.0002696031336287115, "loss": 2.4531, "step": 3456 }, { "epoch": 2.2894039735099336, "grad_norm": 0.7207004619667601, "learning_rate": 0.0002695628387663552, "loss": 2.0938, "step": 3457 }, { "epoch": 2.290066225165563, "grad_norm": 0.7898243661231695, "learning_rate": 0.000269522520229186, "loss": 1.9297, "step": 3458 }, { "epoch": 2.290728476821192, "grad_norm": 0.882192972919901, "learning_rate": 0.0002694821780251876, "loss": 1.9531, "step": 3459 }, { "epoch": 2.2913907284768213, "grad_norm": 0.7844110008067673, "learning_rate": 0.0002694418121623481, "loss": 2.1562, "step": 3460 }, { "epoch": 2.2920529801324503, "grad_norm": 0.8541480116558099, "learning_rate": 0.00026940142264866043, "loss": 2.1875, "step": 3461 }, { "epoch": 2.2927152317880797, "grad_norm": 0.8364263343369465, "learning_rate": 0.00026936100949212223, "loss": 2.1719, "step": 3462 }, { "epoch": 2.2933774834437086, "grad_norm": 0.8427578151844962, "learning_rate": 0.00026932057270073566, "loss": 2.0469, "step": 3463 }, { "epoch": 2.2940397350993376, "grad_norm": 0.8043233239213405, "learning_rate": 0.0002692801122825078, "loss": 2.1719, "step": 3464 }, { "epoch": 2.294701986754967, "grad_norm": 0.8113074335858285, "learning_rate": 0.00026923962824545013, "loss": 2.2656, "step": 3465 }, { "epoch": 2.295364238410596, "grad_norm": 0.8139702581361432, "learning_rate": 0.00026919912059757904, "loss": 2.4375, "step": 3466 }, { "epoch": 2.2960264900662253, "grad_norm": 0.7799906404235268, "learning_rate": 0.00026915858934691554, "loss": 2.2031, "step": 3467 }, { "epoch": 2.296688741721854, "grad_norm": 0.8405551450197039, "learning_rate": 0.0002691180345014852, "loss": 2.3281, "step": 3468 }, { "epoch": 2.2973509933774836, "grad_norm": 0.9412805990773327, "learning_rate": 0.00026907745606931843, "loss": 2.2812, "step": 3469 }, { "epoch": 2.2980132450331126, "grad_norm": 0.8074816861066985, "learning_rate": 0.0002690368540584501, "loss": 1.7891, "step": 3470 }, { "epoch": 2.298675496688742, "grad_norm": 0.7915758279067221, "learning_rate": 0.00026899622847692, "loss": 2.2188, "step": 3471 }, { "epoch": 2.299337748344371, "grad_norm": 0.7805398828352382, "learning_rate": 0.00026895557933277243, "loss": 2.2656, "step": 3472 }, { "epoch": 2.3, "grad_norm": 0.8029758477770688, "learning_rate": 0.0002689149066340564, "loss": 2.1406, "step": 3473 }, { "epoch": 2.300662251655629, "grad_norm": 0.7411551043751734, "learning_rate": 0.0002688742103888255, "loss": 1.9688, "step": 3474 }, { "epoch": 2.301324503311258, "grad_norm": 0.8215263787747553, "learning_rate": 0.0002688334906051381, "loss": 2.2812, "step": 3475 }, { "epoch": 2.3019867549668875, "grad_norm": 0.8063639458068304, "learning_rate": 0.00026879274729105717, "loss": 2.3906, "step": 3476 }, { "epoch": 2.3026490066225165, "grad_norm": 0.7333760557926674, "learning_rate": 0.00026875198045465037, "loss": 1.7109, "step": 3477 }, { "epoch": 2.303311258278146, "grad_norm": 0.810451754095096, "learning_rate": 0.00026871119010399004, "loss": 2.1719, "step": 3478 }, { "epoch": 2.303973509933775, "grad_norm": 0.8072731258411935, "learning_rate": 0.000268670376247153, "loss": 2.3125, "step": 3479 }, { "epoch": 2.304635761589404, "grad_norm": 0.7285133021589374, "learning_rate": 0.000268629538892221, "loss": 1.6797, "step": 3480 }, { "epoch": 2.305298013245033, "grad_norm": 0.788643311835183, "learning_rate": 0.00026858867804728027, "loss": 2.1406, "step": 3481 }, { "epoch": 2.305960264900662, "grad_norm": 0.7993309623498945, "learning_rate": 0.0002685477937204217, "loss": 2.1094, "step": 3482 }, { "epoch": 2.3066225165562915, "grad_norm": 0.7779715105032944, "learning_rate": 0.00026850688591974084, "loss": 2.0469, "step": 3483 }, { "epoch": 2.3072847682119204, "grad_norm": 0.8859747129026393, "learning_rate": 0.00026846595465333795, "loss": 2.4531, "step": 3484 }, { "epoch": 2.30794701986755, "grad_norm": 0.9794557863363869, "learning_rate": 0.00026842499992931786, "loss": 2.2969, "step": 3485 }, { "epoch": 2.3086092715231787, "grad_norm": 0.711960240393943, "learning_rate": 0.0002683840217557901, "loss": 1.9609, "step": 3486 }, { "epoch": 2.309271523178808, "grad_norm": 0.8364098197557683, "learning_rate": 0.0002683430201408688, "loss": 2.0625, "step": 3487 }, { "epoch": 2.309933774834437, "grad_norm": 0.7489954030200862, "learning_rate": 0.00026830199509267274, "loss": 2.0469, "step": 3488 }, { "epoch": 2.3105960264900665, "grad_norm": 0.8383299262573207, "learning_rate": 0.0002682609466193254, "loss": 2.2188, "step": 3489 }, { "epoch": 2.3112582781456954, "grad_norm": 0.8322237063990285, "learning_rate": 0.0002682198747289548, "loss": 2.1562, "step": 3490 }, { "epoch": 2.3119205298013243, "grad_norm": 0.808984873461122, "learning_rate": 0.00026817877942969363, "loss": 2.2969, "step": 3491 }, { "epoch": 2.3125827814569537, "grad_norm": 0.8564167958122387, "learning_rate": 0.00026813766072967925, "loss": 2.2344, "step": 3492 }, { "epoch": 2.3132450331125827, "grad_norm": 0.8271129423968329, "learning_rate": 0.0002680965186370537, "loss": 1.6953, "step": 3493 }, { "epoch": 2.313907284768212, "grad_norm": 0.7825199759283893, "learning_rate": 0.00026805535315996347, "loss": 2.1094, "step": 3494 }, { "epoch": 2.314569536423841, "grad_norm": 0.7558963310563874, "learning_rate": 0.0002680141643065599, "loss": 1.7812, "step": 3495 }, { "epoch": 2.3152317880794704, "grad_norm": 0.8409712057374107, "learning_rate": 0.0002679729520849988, "loss": 2.0938, "step": 3496 }, { "epoch": 2.3158940397350993, "grad_norm": 1.0284948757191428, "learning_rate": 0.0002679317165034406, "loss": 2.3438, "step": 3497 }, { "epoch": 2.3165562913907287, "grad_norm": 0.823717652873751, "learning_rate": 0.00026789045757005055, "loss": 2.2812, "step": 3498 }, { "epoch": 2.3172185430463577, "grad_norm": 0.7742367679207242, "learning_rate": 0.00026784917529299834, "loss": 2.0469, "step": 3499 }, { "epoch": 2.3178807947019866, "grad_norm": 0.8300108887562018, "learning_rate": 0.0002678078696804583, "loss": 2.1406, "step": 3500 }, { "epoch": 2.318543046357616, "grad_norm": 0.8019883200688397, "learning_rate": 0.00026776654074060947, "loss": 2.0, "step": 3501 }, { "epoch": 2.319205298013245, "grad_norm": 0.7767040976168101, "learning_rate": 0.0002677251884816354, "loss": 2.1406, "step": 3502 }, { "epoch": 2.3198675496688743, "grad_norm": 0.7871385438991763, "learning_rate": 0.0002676838129117244, "loss": 2.125, "step": 3503 }, { "epoch": 2.3205298013245033, "grad_norm": 0.8101185525919736, "learning_rate": 0.0002676424140390691, "loss": 2.2031, "step": 3504 }, { "epoch": 2.321192052980132, "grad_norm": 0.8173896386150242, "learning_rate": 0.00026760099187186725, "loss": 2.2812, "step": 3505 }, { "epoch": 2.3218543046357616, "grad_norm": 0.7869629040036665, "learning_rate": 0.00026755954641832064, "loss": 1.8281, "step": 3506 }, { "epoch": 2.322516556291391, "grad_norm": 0.7880492843609324, "learning_rate": 0.00026751807768663616, "loss": 2.0469, "step": 3507 }, { "epoch": 2.32317880794702, "grad_norm": 0.8280483733652815, "learning_rate": 0.0002674765856850249, "loss": 2.2812, "step": 3508 }, { "epoch": 2.323841059602649, "grad_norm": 0.7558704103339472, "learning_rate": 0.0002674350704217029, "loss": 1.9922, "step": 3509 }, { "epoch": 2.3245033112582782, "grad_norm": 0.7772141431684703, "learning_rate": 0.00026739353190489057, "loss": 2.1562, "step": 3510 }, { "epoch": 2.325165562913907, "grad_norm": 0.754192091853306, "learning_rate": 0.0002673519701428131, "loss": 1.9609, "step": 3511 }, { "epoch": 2.3258278145695366, "grad_norm": 0.7710161717638109, "learning_rate": 0.0002673103851437001, "loss": 2.1562, "step": 3512 }, { "epoch": 2.3264900662251655, "grad_norm": 0.8134440167567158, "learning_rate": 0.000267268776915786, "loss": 1.9609, "step": 3513 }, { "epoch": 2.3271523178807945, "grad_norm": 0.9498468649480726, "learning_rate": 0.00026722714546730957, "loss": 2.3438, "step": 3514 }, { "epoch": 2.327814569536424, "grad_norm": 0.753497630990035, "learning_rate": 0.0002671854908065144, "loss": 1.8828, "step": 3515 }, { "epoch": 2.328476821192053, "grad_norm": 0.7475686125932557, "learning_rate": 0.00026714381294164853, "loss": 1.7578, "step": 3516 }, { "epoch": 2.329139072847682, "grad_norm": 0.7368471150432514, "learning_rate": 0.0002671021118809647, "loss": 1.7969, "step": 3517 }, { "epoch": 2.329801324503311, "grad_norm": 0.7627186588359872, "learning_rate": 0.0002670603876327202, "loss": 2.1094, "step": 3518 }, { "epoch": 2.3304635761589405, "grad_norm": 0.8241381966534143, "learning_rate": 0.00026701864020517685, "loss": 2.2656, "step": 3519 }, { "epoch": 2.3311258278145695, "grad_norm": 0.8102145676316767, "learning_rate": 0.00026697686960660123, "loss": 1.9922, "step": 3520 }, { "epoch": 2.331788079470199, "grad_norm": 0.8620441010616755, "learning_rate": 0.00026693507584526425, "loss": 2.3438, "step": 3521 }, { "epoch": 2.332450331125828, "grad_norm": 0.8842616765315734, "learning_rate": 0.0002668932589294417, "loss": 2.2969, "step": 3522 }, { "epoch": 2.3331125827814567, "grad_norm": 0.7528527379787083, "learning_rate": 0.0002668514188674137, "loss": 1.8359, "step": 3523 }, { "epoch": 2.333774834437086, "grad_norm": 0.8130074819141339, "learning_rate": 0.0002668095556674652, "loss": 2.0312, "step": 3524 }, { "epoch": 2.334437086092715, "grad_norm": 0.8015450699548855, "learning_rate": 0.0002667676693378854, "loss": 2.1875, "step": 3525 }, { "epoch": 2.3350993377483444, "grad_norm": 0.8550538713528306, "learning_rate": 0.00026672575988696844, "loss": 2.25, "step": 3526 }, { "epoch": 2.3357615894039734, "grad_norm": 0.7655186600584316, "learning_rate": 0.00026668382732301276, "loss": 2.0, "step": 3527 }, { "epoch": 2.3364238410596028, "grad_norm": 0.8703591867480458, "learning_rate": 0.0002666418716543215, "loss": 2.1719, "step": 3528 }, { "epoch": 2.3370860927152317, "grad_norm": 0.8716923787579354, "learning_rate": 0.0002665998928892025, "loss": 2.2031, "step": 3529 }, { "epoch": 2.337748344370861, "grad_norm": 0.7623059992171638, "learning_rate": 0.00026655789103596783, "loss": 1.8203, "step": 3530 }, { "epoch": 2.33841059602649, "grad_norm": 0.83047209439055, "learning_rate": 0.00026651586610293453, "loss": 2.25, "step": 3531 }, { "epoch": 2.339072847682119, "grad_norm": 0.7910681377650448, "learning_rate": 0.0002664738180984239, "loss": 2.0312, "step": 3532 }, { "epoch": 2.3397350993377484, "grad_norm": 0.8100789357020621, "learning_rate": 0.0002664317470307619, "loss": 2.0938, "step": 3533 }, { "epoch": 2.3403973509933773, "grad_norm": 0.9017538204288273, "learning_rate": 0.0002663896529082792, "loss": 2.1406, "step": 3534 }, { "epoch": 2.3410596026490067, "grad_norm": 0.7412165669742986, "learning_rate": 0.00026634753573931083, "loss": 2.0469, "step": 3535 }, { "epoch": 2.3417218543046356, "grad_norm": 0.8849553799040792, "learning_rate": 0.00026630539553219656, "loss": 2.2188, "step": 3536 }, { "epoch": 2.342384105960265, "grad_norm": 0.7832643093217307, "learning_rate": 0.00026626323229528054, "loss": 1.9609, "step": 3537 }, { "epoch": 2.343046357615894, "grad_norm": 0.8241418829860712, "learning_rate": 0.0002662210460369116, "loss": 2.2969, "step": 3538 }, { "epoch": 2.3437086092715234, "grad_norm": 0.8246287905762533, "learning_rate": 0.0002661788367654431, "loss": 2.1094, "step": 3539 }, { "epoch": 2.3443708609271523, "grad_norm": 0.8955625126478135, "learning_rate": 0.000266136604489233, "loss": 2.1719, "step": 3540 }, { "epoch": 2.3450331125827812, "grad_norm": 0.9896257990148383, "learning_rate": 0.00026609434921664375, "loss": 2.2812, "step": 3541 }, { "epoch": 2.3456953642384106, "grad_norm": 0.8209655093030814, "learning_rate": 0.0002660520709560424, "loss": 2.1875, "step": 3542 }, { "epoch": 2.3463576158940396, "grad_norm": 0.8153954341921122, "learning_rate": 0.0002660097697158005, "loss": 2.1719, "step": 3543 }, { "epoch": 2.347019867549669, "grad_norm": 0.7872034619718571, "learning_rate": 0.00026596744550429414, "loss": 2.1406, "step": 3544 }, { "epoch": 2.347682119205298, "grad_norm": 0.8290463355424399, "learning_rate": 0.00026592509832990403, "loss": 2.0781, "step": 3545 }, { "epoch": 2.3483443708609273, "grad_norm": 0.722934519215767, "learning_rate": 0.0002658827282010155, "loss": 2.0625, "step": 3546 }, { "epoch": 2.3490066225165562, "grad_norm": 0.8174556561968537, "learning_rate": 0.00026584033512601823, "loss": 2.0469, "step": 3547 }, { "epoch": 2.3496688741721856, "grad_norm": 0.7975791625499015, "learning_rate": 0.0002657979191133065, "loss": 1.8984, "step": 3548 }, { "epoch": 2.3503311258278146, "grad_norm": 0.9192925546141507, "learning_rate": 0.0002657554801712792, "loss": 2.3906, "step": 3549 }, { "epoch": 2.3509933774834435, "grad_norm": 0.7872758239517652, "learning_rate": 0.0002657130183083397, "loss": 2.2188, "step": 3550 }, { "epoch": 2.351655629139073, "grad_norm": 0.7702925193856948, "learning_rate": 0.000265670533532896, "loss": 2.2344, "step": 3551 }, { "epoch": 2.352317880794702, "grad_norm": 0.7936957568718466, "learning_rate": 0.00026562802585336053, "loss": 2.25, "step": 3552 }, { "epoch": 2.352980132450331, "grad_norm": 0.783762734306259, "learning_rate": 0.00026558549527815035, "loss": 2.1094, "step": 3553 }, { "epoch": 2.35364238410596, "grad_norm": 0.8405530902811719, "learning_rate": 0.000265542941815687, "loss": 1.7344, "step": 3554 }, { "epoch": 2.3543046357615895, "grad_norm": 0.7364878035352765, "learning_rate": 0.0002655003654743964, "loss": 2.0312, "step": 3555 }, { "epoch": 2.3549668874172185, "grad_norm": 0.8523626169481099, "learning_rate": 0.0002654577662627093, "loss": 2.2188, "step": 3556 }, { "epoch": 2.355629139072848, "grad_norm": 0.770420021299208, "learning_rate": 0.00026541514418906083, "loss": 2.0781, "step": 3557 }, { "epoch": 2.356291390728477, "grad_norm": 0.8365722566100738, "learning_rate": 0.0002653724992618906, "loss": 2.0781, "step": 3558 }, { "epoch": 2.3569536423841058, "grad_norm": 0.8492206160524566, "learning_rate": 0.00026532983148964284, "loss": 2.2031, "step": 3559 }, { "epoch": 2.357615894039735, "grad_norm": 0.8912623066850461, "learning_rate": 0.0002652871408807662, "loss": 2.0625, "step": 3560 }, { "epoch": 2.358278145695364, "grad_norm": 0.7814844799594335, "learning_rate": 0.000265244427443714, "loss": 1.9453, "step": 3561 }, { "epoch": 2.3589403973509935, "grad_norm": 0.8588486143474591, "learning_rate": 0.00026520169118694385, "loss": 2.2344, "step": 3562 }, { "epoch": 2.3596026490066224, "grad_norm": 0.7846538466182011, "learning_rate": 0.0002651589321189181, "loss": 1.9219, "step": 3563 }, { "epoch": 2.360264900662252, "grad_norm": 0.816868818092568, "learning_rate": 0.00026511615024810365, "loss": 1.9219, "step": 3564 }, { "epoch": 2.3609271523178808, "grad_norm": 0.9338972370468528, "learning_rate": 0.0002650733455829716, "loss": 2.2656, "step": 3565 }, { "epoch": 2.36158940397351, "grad_norm": 0.8599554968607911, "learning_rate": 0.00026503051813199784, "loss": 2.2812, "step": 3566 }, { "epoch": 2.362251655629139, "grad_norm": 0.8571277365505403, "learning_rate": 0.0002649876679036627, "loss": 2.25, "step": 3567 }, { "epoch": 2.362913907284768, "grad_norm": 0.787730185536069, "learning_rate": 0.0002649447949064511, "loss": 2.1719, "step": 3568 }, { "epoch": 2.3635761589403974, "grad_norm": 0.7541041324615795, "learning_rate": 0.0002649018991488523, "loss": 1.7422, "step": 3569 }, { "epoch": 2.3642384105960264, "grad_norm": 0.9222148801036135, "learning_rate": 0.0002648589806393601, "loss": 2.3125, "step": 3570 }, { "epoch": 2.3649006622516557, "grad_norm": 0.7649390027216305, "learning_rate": 0.0002648160393864729, "loss": 1.9922, "step": 3571 }, { "epoch": 2.3655629139072847, "grad_norm": 0.9568446131772311, "learning_rate": 0.00026477307539869363, "loss": 2.2031, "step": 3572 }, { "epoch": 2.366225165562914, "grad_norm": 0.7702773392732393, "learning_rate": 0.00026473008868452963, "loss": 2.125, "step": 3573 }, { "epoch": 2.366887417218543, "grad_norm": 0.7873764885522335, "learning_rate": 0.0002646870792524927, "loss": 2.0156, "step": 3574 }, { "epoch": 2.3675496688741724, "grad_norm": 0.9573384268443924, "learning_rate": 0.00026464404711109917, "loss": 2.4688, "step": 3575 }, { "epoch": 2.3682119205298013, "grad_norm": 0.7859632885560607, "learning_rate": 0.00026460099226887, "loss": 2.125, "step": 3576 }, { "epoch": 2.3688741721854303, "grad_norm": 0.7656407422301027, "learning_rate": 0.00026455791473433055, "loss": 2.1094, "step": 3577 }, { "epoch": 2.3695364238410597, "grad_norm": 0.8581622538286189, "learning_rate": 0.0002645148145160106, "loss": 2.3125, "step": 3578 }, { "epoch": 2.3701986754966886, "grad_norm": 0.8412551384807456, "learning_rate": 0.0002644716916224445, "loss": 2.1719, "step": 3579 }, { "epoch": 2.370860927152318, "grad_norm": 0.7527793173231068, "learning_rate": 0.0002644285460621711, "loss": 2.1562, "step": 3580 }, { "epoch": 2.371523178807947, "grad_norm": 0.7875784960554587, "learning_rate": 0.0002643853778437337, "loss": 2.2344, "step": 3581 }, { "epoch": 2.3721854304635763, "grad_norm": 0.7972729524228418, "learning_rate": 0.00026434218697568004, "loss": 2.3281, "step": 3582 }, { "epoch": 2.3728476821192053, "grad_norm": 0.7538283326101876, "learning_rate": 0.0002642989734665625, "loss": 2.1875, "step": 3583 }, { "epoch": 2.3735099337748347, "grad_norm": 0.7308391322920831, "learning_rate": 0.00026425573732493784, "loss": 2.0781, "step": 3584 }, { "epoch": 2.3741721854304636, "grad_norm": 0.7821157463649815, "learning_rate": 0.0002642124785593673, "loss": 2.0156, "step": 3585 }, { "epoch": 2.3748344370860925, "grad_norm": 0.7619751460233185, "learning_rate": 0.0002641691971784166, "loss": 1.9297, "step": 3586 }, { "epoch": 2.375496688741722, "grad_norm": 0.7937685605739325, "learning_rate": 0.0002641258931906559, "loss": 2.2344, "step": 3587 }, { "epoch": 2.376158940397351, "grad_norm": 0.8312520885940347, "learning_rate": 0.00026408256660466, "loss": 2.2969, "step": 3588 }, { "epoch": 2.3768211920529803, "grad_norm": 0.7833330596737437, "learning_rate": 0.00026403921742900794, "loss": 2.0, "step": 3589 }, { "epoch": 2.377483443708609, "grad_norm": 0.7929016774121538, "learning_rate": 0.00026399584567228344, "loss": 2.3281, "step": 3590 }, { "epoch": 2.3781456953642386, "grad_norm": 0.7753201205484034, "learning_rate": 0.0002639524513430746, "loss": 2.0312, "step": 3591 }, { "epoch": 2.3788079470198675, "grad_norm": 0.8070256858649795, "learning_rate": 0.00026390903444997396, "loss": 2.0938, "step": 3592 }, { "epoch": 2.379470198675497, "grad_norm": 0.8784035595165001, "learning_rate": 0.00026386559500157856, "loss": 2.2344, "step": 3593 }, { "epoch": 2.380132450331126, "grad_norm": 0.7934989968736885, "learning_rate": 0.0002638221330064899, "loss": 2.25, "step": 3594 }, { "epoch": 2.380794701986755, "grad_norm": 0.7999236763820265, "learning_rate": 0.00026377864847331405, "loss": 2.1719, "step": 3595 }, { "epoch": 2.381456953642384, "grad_norm": 0.810801944709112, "learning_rate": 0.0002637351414106613, "loss": 2.1406, "step": 3596 }, { "epoch": 2.382119205298013, "grad_norm": 0.8681317904333736, "learning_rate": 0.00026369161182714665, "loss": 2.2031, "step": 3597 }, { "epoch": 2.3827814569536425, "grad_norm": 0.8895130495198013, "learning_rate": 0.00026364805973138944, "loss": 2.2656, "step": 3598 }, { "epoch": 2.3834437086092715, "grad_norm": 0.7717012487635636, "learning_rate": 0.00026360448513201347, "loss": 2.1094, "step": 3599 }, { "epoch": 2.384105960264901, "grad_norm": 0.8782813227523034, "learning_rate": 0.000263560888037647, "loss": 2.5312, "step": 3600 }, { "epoch": 2.38476821192053, "grad_norm": 0.8088371036538807, "learning_rate": 0.0002635172684569227, "loss": 2.0781, "step": 3601 }, { "epoch": 2.385430463576159, "grad_norm": 0.7506189918319235, "learning_rate": 0.0002634736263984779, "loss": 2.0625, "step": 3602 }, { "epoch": 2.386092715231788, "grad_norm": 0.7448361186872774, "learning_rate": 0.0002634299618709541, "loss": 2.0156, "step": 3603 }, { "epoch": 2.386754966887417, "grad_norm": 0.7429560549759459, "learning_rate": 0.0002633862748829975, "loss": 1.9766, "step": 3604 }, { "epoch": 2.3874172185430464, "grad_norm": 0.7293237850457983, "learning_rate": 0.0002633425654432585, "loss": 2.0938, "step": 3605 }, { "epoch": 2.3880794701986754, "grad_norm": 0.8067632906532726, "learning_rate": 0.000263298833560392, "loss": 2.2969, "step": 3606 }, { "epoch": 2.388741721854305, "grad_norm": 0.8669901850886488, "learning_rate": 0.00026325507924305764, "loss": 2.1875, "step": 3607 }, { "epoch": 2.3894039735099337, "grad_norm": 0.7814395338743131, "learning_rate": 0.00026321130249991917, "loss": 2.2656, "step": 3608 }, { "epoch": 2.390066225165563, "grad_norm": 0.8058662060806352, "learning_rate": 0.0002631675033396448, "loss": 2.25, "step": 3609 }, { "epoch": 2.390728476821192, "grad_norm": 0.8135754639778354, "learning_rate": 0.0002631236817709074, "loss": 2.0938, "step": 3610 }, { "epoch": 2.3913907284768214, "grad_norm": 0.8231872854923911, "learning_rate": 0.00026307983780238405, "loss": 2.375, "step": 3611 }, { "epoch": 2.3920529801324504, "grad_norm": 0.7798938095782424, "learning_rate": 0.00026303597144275643, "loss": 2.0469, "step": 3612 }, { "epoch": 2.3927152317880793, "grad_norm": 0.769214196087789, "learning_rate": 0.00026299208270071057, "loss": 2.1406, "step": 3613 }, { "epoch": 2.3933774834437087, "grad_norm": 0.8931987315375328, "learning_rate": 0.0002629481715849369, "loss": 2.2031, "step": 3614 }, { "epoch": 2.3940397350993377, "grad_norm": 0.8324613832885025, "learning_rate": 0.00026290423810413033, "loss": 2.375, "step": 3615 }, { "epoch": 2.394701986754967, "grad_norm": 0.7790733031306819, "learning_rate": 0.00026286028226699023, "loss": 2.0156, "step": 3616 }, { "epoch": 2.395364238410596, "grad_norm": 0.8176880623287325, "learning_rate": 0.0002628163040822203, "loss": 2.375, "step": 3617 }, { "epoch": 2.396026490066225, "grad_norm": 0.842797782019801, "learning_rate": 0.00026277230355852883, "loss": 2.3125, "step": 3618 }, { "epoch": 2.3966887417218543, "grad_norm": 0.8096954669219841, "learning_rate": 0.0002627282807046283, "loss": 2.2969, "step": 3619 }, { "epoch": 2.3973509933774833, "grad_norm": 0.7591993295150838, "learning_rate": 0.0002626842355292359, "loss": 2.0781, "step": 3620 }, { "epoch": 2.3980132450331126, "grad_norm": 0.7455173718863316, "learning_rate": 0.0002626401680410729, "loss": 2.1875, "step": 3621 }, { "epoch": 2.3986754966887416, "grad_norm": 0.7332975584379399, "learning_rate": 0.00026259607824886526, "loss": 1.8047, "step": 3622 }, { "epoch": 2.399337748344371, "grad_norm": 0.8238404928995606, "learning_rate": 0.0002625519661613433, "loss": 2.2031, "step": 3623 }, { "epoch": 2.4, "grad_norm": 0.7633460862918544, "learning_rate": 0.00026250783178724166, "loss": 2.2812, "step": 3624 }, { "epoch": 2.4006622516556293, "grad_norm": 0.7227952906113905, "learning_rate": 0.0002624636751352994, "loss": 2.0781, "step": 3625 }, { "epoch": 2.4013245033112582, "grad_norm": 0.8055733301251006, "learning_rate": 0.0002624194962142602, "loss": 2.125, "step": 3626 }, { "epoch": 2.401986754966887, "grad_norm": 0.7939315294182231, "learning_rate": 0.0002623752950328718, "loss": 2.3906, "step": 3627 }, { "epoch": 2.4026490066225166, "grad_norm": 0.757823821228571, "learning_rate": 0.0002623310715998867, "loss": 2.125, "step": 3628 }, { "epoch": 2.4033112582781455, "grad_norm": 0.8216954008180362, "learning_rate": 0.0002622868259240616, "loss": 2.0781, "step": 3629 }, { "epoch": 2.403973509933775, "grad_norm": 0.7742602116984382, "learning_rate": 0.00026224255801415765, "loss": 2.125, "step": 3630 }, { "epoch": 2.404635761589404, "grad_norm": 0.8687000798179293, "learning_rate": 0.0002621982678789404, "loss": 2.5938, "step": 3631 }, { "epoch": 2.4052980132450332, "grad_norm": 0.7554301587009465, "learning_rate": 0.00026215395552717973, "loss": 2.0781, "step": 3632 }, { "epoch": 2.405960264900662, "grad_norm": 0.7480187277505671, "learning_rate": 0.00026210962096765014, "loss": 2.25, "step": 3633 }, { "epoch": 2.4066225165562916, "grad_norm": 0.8014807323742578, "learning_rate": 0.00026206526420913023, "loss": 2.2969, "step": 3634 }, { "epoch": 2.4072847682119205, "grad_norm": 0.7786140637073341, "learning_rate": 0.0002620208852604033, "loss": 2.2344, "step": 3635 }, { "epoch": 2.4079470198675494, "grad_norm": 0.8985461561247333, "learning_rate": 0.00026197648413025674, "loss": 2.1094, "step": 3636 }, { "epoch": 2.408609271523179, "grad_norm": 0.7788293376187034, "learning_rate": 0.0002619320608274826, "loss": 2.0, "step": 3637 }, { "epoch": 2.4092715231788078, "grad_norm": 0.8471027246759101, "learning_rate": 0.0002618876153608771, "loss": 2.3594, "step": 3638 }, { "epoch": 2.409933774834437, "grad_norm": 0.7894665103156032, "learning_rate": 0.00026184314773924103, "loss": 2.0938, "step": 3639 }, { "epoch": 2.410596026490066, "grad_norm": 0.712404186045945, "learning_rate": 0.0002617986579713795, "loss": 1.7031, "step": 3640 }, { "epoch": 2.4112582781456955, "grad_norm": 0.7956530994207354, "learning_rate": 0.000261754146066102, "loss": 2.2656, "step": 3641 }, { "epoch": 2.4119205298013244, "grad_norm": 0.808948601121387, "learning_rate": 0.0002617096120322223, "loss": 2.1562, "step": 3642 }, { "epoch": 2.412582781456954, "grad_norm": 0.8156357415889068, "learning_rate": 0.0002616650558785587, "loss": 1.9219, "step": 3643 }, { "epoch": 2.4132450331125828, "grad_norm": 0.9026219565846519, "learning_rate": 0.00026162047761393387, "loss": 2.3906, "step": 3644 }, { "epoch": 2.4139072847682117, "grad_norm": 0.828435762212483, "learning_rate": 0.00026157587724717475, "loss": 2.0938, "step": 3645 }, { "epoch": 2.414569536423841, "grad_norm": 0.8252140024733946, "learning_rate": 0.00026153125478711275, "loss": 2.1875, "step": 3646 }, { "epoch": 2.41523178807947, "grad_norm": 0.8520896357360669, "learning_rate": 0.00026148661024258374, "loss": 2.4688, "step": 3647 }, { "epoch": 2.4158940397350994, "grad_norm": 0.8412381300223911, "learning_rate": 0.00026144194362242765, "loss": 2.2656, "step": 3648 }, { "epoch": 2.4165562913907284, "grad_norm": 0.7537007870515904, "learning_rate": 0.0002613972549354891, "loss": 2.0312, "step": 3649 }, { "epoch": 2.4172185430463577, "grad_norm": 0.727992445937327, "learning_rate": 0.00026135254419061695, "loss": 1.7969, "step": 3650 }, { "epoch": 2.4178807947019867, "grad_norm": 0.8413015170724273, "learning_rate": 0.00026130781139666444, "loss": 2.3906, "step": 3651 }, { "epoch": 2.418543046357616, "grad_norm": 0.7684723693713971, "learning_rate": 0.00026126305656248914, "loss": 2.0312, "step": 3652 }, { "epoch": 2.419205298013245, "grad_norm": 0.7508854474142914, "learning_rate": 0.0002612182796969531, "loss": 1.8125, "step": 3653 }, { "epoch": 2.419867549668874, "grad_norm": 0.732397409630897, "learning_rate": 0.00026117348080892264, "loss": 1.75, "step": 3654 }, { "epoch": 2.4205298013245033, "grad_norm": 0.8742543732150815, "learning_rate": 0.0002611286599072683, "loss": 2.4375, "step": 3655 }, { "epoch": 2.4211920529801323, "grad_norm": 0.7538414469191606, "learning_rate": 0.00026108381700086535, "loss": 2.1094, "step": 3656 }, { "epoch": 2.4218543046357617, "grad_norm": 0.7484323067327109, "learning_rate": 0.00026103895209859306, "loss": 2.0, "step": 3657 }, { "epoch": 2.4225165562913906, "grad_norm": 0.8465057680432724, "learning_rate": 0.00026099406520933523, "loss": 2.1875, "step": 3658 }, { "epoch": 2.42317880794702, "grad_norm": 1.0162209057975777, "learning_rate": 0.00026094915634198, "loss": 2.3594, "step": 3659 }, { "epoch": 2.423841059602649, "grad_norm": 0.7283380008014367, "learning_rate": 0.0002609042255054198, "loss": 1.9766, "step": 3660 }, { "epoch": 2.4245033112582783, "grad_norm": 0.7888572002465285, "learning_rate": 0.0002608592727085515, "loss": 2.1719, "step": 3661 }, { "epoch": 2.4251655629139073, "grad_norm": 0.8199963998615016, "learning_rate": 0.00026081429796027624, "loss": 2.0156, "step": 3662 }, { "epoch": 2.4258278145695362, "grad_norm": 0.8334344732561343, "learning_rate": 0.0002607693012694995, "loss": 2.1875, "step": 3663 }, { "epoch": 2.4264900662251656, "grad_norm": 0.8386123031054049, "learning_rate": 0.00026072428264513125, "loss": 2.1875, "step": 3664 }, { "epoch": 2.4271523178807946, "grad_norm": 0.8204409577548738, "learning_rate": 0.00026067924209608557, "loss": 2.2656, "step": 3665 }, { "epoch": 2.427814569536424, "grad_norm": 0.8790249933305871, "learning_rate": 0.0002606341796312811, "loss": 2.25, "step": 3666 }, { "epoch": 2.428476821192053, "grad_norm": 0.8651708312663978, "learning_rate": 0.0002605890952596406, "loss": 2.0625, "step": 3667 }, { "epoch": 2.4291390728476823, "grad_norm": 0.8130654986601692, "learning_rate": 0.00026054398899009144, "loss": 2.25, "step": 3668 }, { "epoch": 2.429801324503311, "grad_norm": 0.8081244852505798, "learning_rate": 0.00026049886083156517, "loss": 2.2812, "step": 3669 }, { "epoch": 2.4304635761589406, "grad_norm": 0.7721411828936044, "learning_rate": 0.00026045371079299755, "loss": 2.0469, "step": 3670 }, { "epoch": 2.4311258278145695, "grad_norm": 0.8499352959739105, "learning_rate": 0.00026040853888332886, "loss": 2.2969, "step": 3671 }, { "epoch": 2.4317880794701985, "grad_norm": 0.8580259171222804, "learning_rate": 0.0002603633451115037, "loss": 1.9453, "step": 3672 }, { "epoch": 2.432450331125828, "grad_norm": 0.7928081475423464, "learning_rate": 0.00026031812948647097, "loss": 2.1875, "step": 3673 }, { "epoch": 2.433112582781457, "grad_norm": 0.7808383073504149, "learning_rate": 0.00026027289201718384, "loss": 1.8906, "step": 3674 }, { "epoch": 2.433774834437086, "grad_norm": 0.8641230958121657, "learning_rate": 0.00026022763271259984, "loss": 2.375, "step": 3675 }, { "epoch": 2.434437086092715, "grad_norm": 0.842412673700197, "learning_rate": 0.00026018235158168084, "loss": 2.25, "step": 3676 }, { "epoch": 2.4350993377483445, "grad_norm": 0.9618234748444455, "learning_rate": 0.000260137048633393, "loss": 2.1719, "step": 3677 }, { "epoch": 2.4357615894039735, "grad_norm": 0.973700784335045, "learning_rate": 0.0002600917238767069, "loss": 2.2969, "step": 3678 }, { "epoch": 2.436423841059603, "grad_norm": 0.8418957199314805, "learning_rate": 0.00026004637732059734, "loss": 2.25, "step": 3679 }, { "epoch": 2.437086092715232, "grad_norm": 0.856697794327708, "learning_rate": 0.00026000100897404334, "loss": 2.2344, "step": 3680 }, { "epoch": 2.4377483443708607, "grad_norm": 0.9385182614973339, "learning_rate": 0.0002599556188460285, "loss": 2.2969, "step": 3681 }, { "epoch": 2.43841059602649, "grad_norm": 0.7859602931843207, "learning_rate": 0.0002599102069455405, "loss": 2.2188, "step": 3682 }, { "epoch": 2.439072847682119, "grad_norm": 0.9065440712055216, "learning_rate": 0.0002598647732815715, "loss": 2.4062, "step": 3683 }, { "epoch": 2.4397350993377485, "grad_norm": 0.8255287607682568, "learning_rate": 0.00025981931786311777, "loss": 2.25, "step": 3684 }, { "epoch": 2.4403973509933774, "grad_norm": 0.8603593157376187, "learning_rate": 0.00025977384069918007, "loss": 2.3281, "step": 3685 }, { "epoch": 2.441059602649007, "grad_norm": 0.7663879075345988, "learning_rate": 0.0002597283417987634, "loss": 2.0, "step": 3686 }, { "epoch": 2.4417218543046357, "grad_norm": 0.7949237063570226, "learning_rate": 0.0002596828211708771, "loss": 2.2812, "step": 3687 }, { "epoch": 2.442384105960265, "grad_norm": 0.7834429210955349, "learning_rate": 0.0002596372788245347, "loss": 2.4219, "step": 3688 }, { "epoch": 2.443046357615894, "grad_norm": 0.7931726668190414, "learning_rate": 0.00025959171476875416, "loss": 2.0156, "step": 3689 }, { "epoch": 2.443708609271523, "grad_norm": 0.8245492384761287, "learning_rate": 0.0002595461290125576, "loss": 2.5, "step": 3690 }, { "epoch": 2.4443708609271524, "grad_norm": 0.8745721561770207, "learning_rate": 0.00025950052156497173, "loss": 2.3438, "step": 3691 }, { "epoch": 2.4450331125827813, "grad_norm": 0.7579249099032168, "learning_rate": 0.0002594548924350271, "loss": 2.0156, "step": 3692 }, { "epoch": 2.4456953642384107, "grad_norm": 0.7940598235940702, "learning_rate": 0.0002594092416317589, "loss": 2.2812, "step": 3693 }, { "epoch": 2.4463576158940397, "grad_norm": 0.7181037377864162, "learning_rate": 0.00025936356916420657, "loss": 1.8906, "step": 3694 }, { "epoch": 2.447019867549669, "grad_norm": 0.7290275657274292, "learning_rate": 0.00025931787504141374, "loss": 2.2031, "step": 3695 }, { "epoch": 2.447682119205298, "grad_norm": 0.7641701887600025, "learning_rate": 0.0002592721592724283, "loss": 2.0469, "step": 3696 }, { "epoch": 2.4483443708609274, "grad_norm": 0.7735180077417082, "learning_rate": 0.0002592264218663026, "loss": 2.1875, "step": 3697 }, { "epoch": 2.4490066225165563, "grad_norm": 0.8266234811514622, "learning_rate": 0.00025918066283209316, "loss": 2.3906, "step": 3698 }, { "epoch": 2.4496688741721853, "grad_norm": 0.754399097723304, "learning_rate": 0.0002591348821788607, "loss": 2.0781, "step": 3699 }, { "epoch": 2.4503311258278146, "grad_norm": 0.7801089380252292, "learning_rate": 0.00025908907991567043, "loss": 2.125, "step": 3700 }, { "epoch": 2.4509933774834436, "grad_norm": 0.7558862254531314, "learning_rate": 0.0002590432560515917, "loss": 1.875, "step": 3701 }, { "epoch": 2.451655629139073, "grad_norm": 0.8260401207822209, "learning_rate": 0.000258997410595698, "loss": 2.0312, "step": 3702 }, { "epoch": 2.452317880794702, "grad_norm": 0.7345401689908106, "learning_rate": 0.0002589515435570675, "loss": 2.0781, "step": 3703 }, { "epoch": 2.4529801324503313, "grad_norm": 0.7947041494812258, "learning_rate": 0.0002589056549447823, "loss": 2.0938, "step": 3704 }, { "epoch": 2.4536423841059603, "grad_norm": 0.7550221168067907, "learning_rate": 0.0002588597447679288, "loss": 2.2031, "step": 3705 }, { "epoch": 2.4543046357615896, "grad_norm": 0.72700331959345, "learning_rate": 0.0002588138130355978, "loss": 1.8594, "step": 3706 }, { "epoch": 2.4549668874172186, "grad_norm": 0.7812857362563028, "learning_rate": 0.00025876785975688436, "loss": 2.125, "step": 3707 }, { "epoch": 2.4556291390728475, "grad_norm": 0.7338502259373492, "learning_rate": 0.00025872188494088766, "loss": 1.9766, "step": 3708 }, { "epoch": 2.456291390728477, "grad_norm": 0.8109446502930461, "learning_rate": 0.00025867588859671133, "loss": 1.7656, "step": 3709 }, { "epoch": 2.456953642384106, "grad_norm": 0.7663438984587653, "learning_rate": 0.00025862987073346305, "loss": 2.1406, "step": 3710 }, { "epoch": 2.4576158940397352, "grad_norm": 0.7769394682684838, "learning_rate": 0.00025858383136025505, "loss": 2.0156, "step": 3711 }, { "epoch": 2.458278145695364, "grad_norm": 0.8079150413107431, "learning_rate": 0.0002585377704862035, "loss": 2.0781, "step": 3712 }, { "epoch": 2.4589403973509936, "grad_norm": 0.767665684192676, "learning_rate": 0.0002584916881204291, "loss": 2.1406, "step": 3713 }, { "epoch": 2.4596026490066225, "grad_norm": 0.8659724611527772, "learning_rate": 0.00025844558427205657, "loss": 2.2656, "step": 3714 }, { "epoch": 2.460264900662252, "grad_norm": 0.8002472614255726, "learning_rate": 0.0002583994589502151, "loss": 1.8047, "step": 3715 }, { "epoch": 2.460927152317881, "grad_norm": 0.7759990484041954, "learning_rate": 0.00025835331216403807, "loss": 1.6719, "step": 3716 }, { "epoch": 2.46158940397351, "grad_norm": 0.8365995049284329, "learning_rate": 0.00025830714392266295, "loss": 2.2188, "step": 3717 }, { "epoch": 2.462251655629139, "grad_norm": 0.8401222617562081, "learning_rate": 0.0002582609542352316, "loss": 2.1094, "step": 3718 }, { "epoch": 2.462913907284768, "grad_norm": 0.8044452882262215, "learning_rate": 0.00025821474311089014, "loss": 2.25, "step": 3719 }, { "epoch": 2.4635761589403975, "grad_norm": 0.8219732114269241, "learning_rate": 0.00025816851055878896, "loss": 2.5469, "step": 3720 }, { "epoch": 2.4642384105960264, "grad_norm": 0.8174796258453926, "learning_rate": 0.00025812225658808255, "loss": 2.2188, "step": 3721 }, { "epoch": 2.4649006622516554, "grad_norm": 0.8108640060730444, "learning_rate": 0.00025807598120792976, "loss": 2.1094, "step": 3722 }, { "epoch": 2.4655629139072848, "grad_norm": 0.8683068022438082, "learning_rate": 0.00025802968442749364, "loss": 2.0938, "step": 3723 }, { "epoch": 2.466225165562914, "grad_norm": 0.750924543068475, "learning_rate": 0.0002579833662559414, "loss": 2.1094, "step": 3724 }, { "epoch": 2.466887417218543, "grad_norm": 0.7386933190562606, "learning_rate": 0.00025793702670244475, "loss": 2.1094, "step": 3725 }, { "epoch": 2.467549668874172, "grad_norm": 0.8536993131565537, "learning_rate": 0.00025789066577617934, "loss": 2.2188, "step": 3726 }, { "epoch": 2.4682119205298014, "grad_norm": 0.8332192958507407, "learning_rate": 0.00025784428348632515, "loss": 2.25, "step": 3727 }, { "epoch": 2.4688741721854304, "grad_norm": 0.7364051704302544, "learning_rate": 0.0002577978798420665, "loss": 1.8281, "step": 3728 }, { "epoch": 2.4695364238410598, "grad_norm": 0.7396566173883143, "learning_rate": 0.0002577514548525917, "loss": 2.2969, "step": 3729 }, { "epoch": 2.4701986754966887, "grad_norm": 0.740898850741441, "learning_rate": 0.00025770500852709353, "loss": 2.0156, "step": 3730 }, { "epoch": 2.4708609271523176, "grad_norm": 0.7729389738421756, "learning_rate": 0.00025765854087476893, "loss": 1.9375, "step": 3731 }, { "epoch": 2.471523178807947, "grad_norm": 0.8624638876107098, "learning_rate": 0.00025761205190481893, "loss": 2.2812, "step": 3732 }, { "epoch": 2.472185430463576, "grad_norm": 0.8023476474046847, "learning_rate": 0.0002575655416264489, "loss": 2.3125, "step": 3733 }, { "epoch": 2.4728476821192054, "grad_norm": 0.7255421905873592, "learning_rate": 0.0002575190100488684, "loss": 1.8281, "step": 3734 }, { "epoch": 2.4735099337748343, "grad_norm": 0.7931748315734833, "learning_rate": 0.0002574724571812913, "loss": 2.1562, "step": 3735 }, { "epoch": 2.4741721854304637, "grad_norm": 0.7662039930519703, "learning_rate": 0.00025742588303293555, "loss": 2.4062, "step": 3736 }, { "epoch": 2.4748344370860926, "grad_norm": 0.7825147613296451, "learning_rate": 0.00025737928761302337, "loss": 2.3438, "step": 3737 }, { "epoch": 2.475496688741722, "grad_norm": 0.7816595323637575, "learning_rate": 0.0002573326709307812, "loss": 2.2969, "step": 3738 }, { "epoch": 2.476158940397351, "grad_norm": 0.782474503950054, "learning_rate": 0.00025728603299543957, "loss": 2.2031, "step": 3739 }, { "epoch": 2.47682119205298, "grad_norm": 0.8265525780186007, "learning_rate": 0.00025723937381623343, "loss": 2.2344, "step": 3740 }, { "epoch": 2.4774834437086093, "grad_norm": 0.8073755991536216, "learning_rate": 0.00025719269340240185, "loss": 2.3438, "step": 3741 }, { "epoch": 2.4781456953642382, "grad_norm": 0.8453282986605583, "learning_rate": 0.0002571459917631881, "loss": 2.3125, "step": 3742 }, { "epoch": 2.4788079470198676, "grad_norm": 0.7701714877545602, "learning_rate": 0.0002570992689078395, "loss": 2.0156, "step": 3743 }, { "epoch": 2.4794701986754966, "grad_norm": 0.7535412329756577, "learning_rate": 0.00025705252484560786, "loss": 2.2188, "step": 3744 }, { "epoch": 2.480132450331126, "grad_norm": 0.7977866501667391, "learning_rate": 0.00025700575958574894, "loss": 2.1094, "step": 3745 }, { "epoch": 2.480794701986755, "grad_norm": 0.8567449504658754, "learning_rate": 0.0002569589731375229, "loss": 2.3594, "step": 3746 }, { "epoch": 2.4814569536423843, "grad_norm": 0.7443684422763787, "learning_rate": 0.0002569121655101939, "loss": 1.9609, "step": 3747 }, { "epoch": 2.482119205298013, "grad_norm": 0.754164299096999, "learning_rate": 0.0002568653367130304, "loss": 2.0625, "step": 3748 }, { "epoch": 2.482781456953642, "grad_norm": 0.7450370401636187, "learning_rate": 0.0002568184867553051, "loss": 1.7734, "step": 3749 }, { "epoch": 2.4834437086092715, "grad_norm": 0.7343611777486058, "learning_rate": 0.0002567716156462948, "loss": 1.7734, "step": 3750 }, { "epoch": 2.4841059602649005, "grad_norm": 0.8837410321717628, "learning_rate": 0.0002567247233952805, "loss": 2.0312, "step": 3751 }, { "epoch": 2.48476821192053, "grad_norm": 0.7554996689083994, "learning_rate": 0.00025667781001154733, "loss": 1.8984, "step": 3752 }, { "epoch": 2.485430463576159, "grad_norm": 0.8562522973241116, "learning_rate": 0.0002566308755043848, "loss": 2.2031, "step": 3753 }, { "epoch": 2.486092715231788, "grad_norm": 0.8354555574608559, "learning_rate": 0.00025658391988308643, "loss": 2.2188, "step": 3754 }, { "epoch": 2.486754966887417, "grad_norm": 0.8557149768317479, "learning_rate": 0.00025653694315695, "loss": 2.1094, "step": 3755 }, { "epoch": 2.4874172185430465, "grad_norm": 0.8166114702044995, "learning_rate": 0.00025648994533527744, "loss": 2.125, "step": 3756 }, { "epoch": 2.4880794701986755, "grad_norm": 0.7882667142255402, "learning_rate": 0.0002564429264273748, "loss": 2.2344, "step": 3757 }, { "epoch": 2.4887417218543044, "grad_norm": 0.7843107808145644, "learning_rate": 0.0002563958864425524, "loss": 2.1562, "step": 3758 }, { "epoch": 2.489403973509934, "grad_norm": 0.8913234831605202, "learning_rate": 0.00025634882539012473, "loss": 2.4375, "step": 3759 }, { "epoch": 2.4900662251655628, "grad_norm": 0.7760531183654524, "learning_rate": 0.00025630174327941036, "loss": 2.0312, "step": 3760 }, { "epoch": 2.490728476821192, "grad_norm": 0.8237800597839532, "learning_rate": 0.0002562546401197321, "loss": 2.2344, "step": 3761 }, { "epoch": 2.491390728476821, "grad_norm": 0.7538543525796487, "learning_rate": 0.00025620751592041695, "loss": 1.9688, "step": 3762 }, { "epoch": 2.4920529801324505, "grad_norm": 0.7671719916062989, "learning_rate": 0.000256160370690796, "loss": 2.125, "step": 3763 }, { "epoch": 2.4927152317880794, "grad_norm": 0.7264601185989904, "learning_rate": 0.00025611320444020454, "loss": 2.125, "step": 3764 }, { "epoch": 2.493377483443709, "grad_norm": 0.7540591425870986, "learning_rate": 0.00025606601717798207, "loss": 2.0469, "step": 3765 }, { "epoch": 2.4940397350993377, "grad_norm": 0.8000031191656632, "learning_rate": 0.00025601880891347223, "loss": 2.3281, "step": 3766 }, { "epoch": 2.4947019867549667, "grad_norm": 0.9173837159614713, "learning_rate": 0.00025597157965602275, "loss": 2.5156, "step": 3767 }, { "epoch": 2.495364238410596, "grad_norm": 0.8574803970528944, "learning_rate": 0.00025592432941498556, "loss": 2.4219, "step": 3768 }, { "epoch": 2.496026490066225, "grad_norm": 0.7116906170460164, "learning_rate": 0.00025587705819971675, "loss": 1.7266, "step": 3769 }, { "epoch": 2.4966887417218544, "grad_norm": 0.8645167719637118, "learning_rate": 0.0002558297660195766, "loss": 2.3594, "step": 3770 }, { "epoch": 2.4973509933774833, "grad_norm": 0.8022357297590467, "learning_rate": 0.00025578245288392944, "loss": 2.3281, "step": 3771 }, { "epoch": 2.4980132450331127, "grad_norm": 0.6870707544917136, "learning_rate": 0.0002557351188021439, "loss": 1.75, "step": 3772 }, { "epoch": 2.4986754966887417, "grad_norm": 0.7505084647297366, "learning_rate": 0.0002556877637835926, "loss": 2.0625, "step": 3773 }, { "epoch": 2.499337748344371, "grad_norm": 0.7825213787198477, "learning_rate": 0.0002556403878376524, "loss": 2.3594, "step": 3774 }, { "epoch": 2.5, "grad_norm": 0.8331305704984378, "learning_rate": 0.00025559299097370426, "loss": 2.2344, "step": 3775 }, { "epoch": 2.500662251655629, "grad_norm": 0.8127604660915183, "learning_rate": 0.00025554557320113335, "loss": 2.2188, "step": 3776 }, { "epoch": 2.5013245033112583, "grad_norm": 0.8757668774831419, "learning_rate": 0.00025549813452932887, "loss": 2.3281, "step": 3777 }, { "epoch": 2.5019867549668873, "grad_norm": 0.796343673496541, "learning_rate": 0.0002554506749676843, "loss": 2.0312, "step": 3778 }, { "epoch": 2.5026490066225167, "grad_norm": 0.697684377385825, "learning_rate": 0.00025540319452559715, "loss": 1.7422, "step": 3779 }, { "epoch": 2.5033112582781456, "grad_norm": 0.7508516375462173, "learning_rate": 0.000255355693212469, "loss": 2.0625, "step": 3780 }, { "epoch": 2.503973509933775, "grad_norm": 0.8995981351168479, "learning_rate": 0.0002553081710377058, "loss": 2.2656, "step": 3781 }, { "epoch": 2.504635761589404, "grad_norm": 0.9442778658895785, "learning_rate": 0.0002552606280107174, "loss": 2.4531, "step": 3782 }, { "epoch": 2.5052980132450333, "grad_norm": 0.848668594315267, "learning_rate": 0.0002552130641409179, "loss": 2.1875, "step": 3783 }, { "epoch": 2.5059602649006623, "grad_norm": 0.8128908052676902, "learning_rate": 0.00025516547943772543, "loss": 2.2188, "step": 3784 }, { "epoch": 2.506622516556291, "grad_norm": 0.817947238228866, "learning_rate": 0.00025511787391056237, "loss": 2.0781, "step": 3785 }, { "epoch": 2.5072847682119206, "grad_norm": 0.7906552605647277, "learning_rate": 0.0002550702475688551, "loss": 2.2969, "step": 3786 }, { "epoch": 2.5079470198675495, "grad_norm": 0.8487939423019191, "learning_rate": 0.0002550226004220343, "loss": 2.5, "step": 3787 }, { "epoch": 2.508609271523179, "grad_norm": 0.783383233932687, "learning_rate": 0.0002549749324795345, "loss": 2.0781, "step": 3788 }, { "epoch": 2.509271523178808, "grad_norm": 0.8424805306609977, "learning_rate": 0.00025492724375079457, "loss": 2.3125, "step": 3789 }, { "epoch": 2.5099337748344372, "grad_norm": 0.8537090061892594, "learning_rate": 0.00025487953424525747, "loss": 2.375, "step": 3790 }, { "epoch": 2.510596026490066, "grad_norm": 0.8162934038231799, "learning_rate": 0.0002548318039723702, "loss": 2.0625, "step": 3791 }, { "epoch": 2.5112582781456956, "grad_norm": 0.7981103742956785, "learning_rate": 0.0002547840529415838, "loss": 2.3281, "step": 3792 }, { "epoch": 2.5119205298013245, "grad_norm": 0.8942816627767144, "learning_rate": 0.0002547362811623537, "loss": 2.0625, "step": 3793 }, { "epoch": 2.5125827814569535, "grad_norm": 0.8452463509791648, "learning_rate": 0.00025468848864413913, "loss": 2.1875, "step": 3794 }, { "epoch": 2.513245033112583, "grad_norm": 0.8139976073956928, "learning_rate": 0.00025464067539640356, "loss": 1.8828, "step": 3795 }, { "epoch": 2.513907284768212, "grad_norm": 0.849179213799658, "learning_rate": 0.00025459284142861465, "loss": 2.0781, "step": 3796 }, { "epoch": 2.514569536423841, "grad_norm": 0.80543474258563, "learning_rate": 0.00025454498675024395, "loss": 2.3438, "step": 3797 }, { "epoch": 2.51523178807947, "grad_norm": 0.8449208722342026, "learning_rate": 0.00025449711137076733, "loss": 2.4531, "step": 3798 }, { "epoch": 2.515894039735099, "grad_norm": 0.7963480556022876, "learning_rate": 0.00025444921529966467, "loss": 1.7812, "step": 3799 }, { "epoch": 2.5165562913907285, "grad_norm": 0.7979864641611971, "learning_rate": 0.0002544012985464199, "loss": 1.7344, "step": 3800 }, { "epoch": 2.517218543046358, "grad_norm": 0.8261186208922794, "learning_rate": 0.0002543533611205211, "loss": 2.2188, "step": 3801 }, { "epoch": 2.517880794701987, "grad_norm": 0.7656984243107973, "learning_rate": 0.00025430540303146044, "loss": 2.0, "step": 3802 }, { "epoch": 2.5185430463576157, "grad_norm": 0.8187946922760879, "learning_rate": 0.0002542574242887341, "loss": 2.1406, "step": 3803 }, { "epoch": 2.519205298013245, "grad_norm": 0.7909393604313028, "learning_rate": 0.00025420942490184253, "loss": 2.2656, "step": 3804 }, { "epoch": 2.519867549668874, "grad_norm": 0.7923129991722448, "learning_rate": 0.0002541614048802901, "loss": 2.2188, "step": 3805 }, { "epoch": 2.5205298013245034, "grad_norm": 0.8009879228520259, "learning_rate": 0.00025411336423358533, "loss": 2.0938, "step": 3806 }, { "epoch": 2.5211920529801324, "grad_norm": 0.8765228361984988, "learning_rate": 0.0002540653029712408, "loss": 2.1562, "step": 3807 }, { "epoch": 2.5218543046357613, "grad_norm": 0.8177431591227649, "learning_rate": 0.00025401722110277323, "loss": 2.2031, "step": 3808 }, { "epoch": 2.5225165562913907, "grad_norm": 0.7852915290432646, "learning_rate": 0.0002539691186377034, "loss": 2.3125, "step": 3809 }, { "epoch": 2.52317880794702, "grad_norm": 0.7835910034076287, "learning_rate": 0.00025392099558555603, "loss": 1.9922, "step": 3810 }, { "epoch": 2.523841059602649, "grad_norm": 0.820804249496446, "learning_rate": 0.0002538728519558601, "loss": 2.1875, "step": 3811 }, { "epoch": 2.524503311258278, "grad_norm": 0.7604540774669745, "learning_rate": 0.0002538246877581487, "loss": 2.0625, "step": 3812 }, { "epoch": 2.5251655629139074, "grad_norm": 0.8402028053161055, "learning_rate": 0.0002537765030019588, "loss": 2.3281, "step": 3813 }, { "epoch": 2.5258278145695363, "grad_norm": 0.7518663424086643, "learning_rate": 0.0002537282976968316, "loss": 2.0938, "step": 3814 }, { "epoch": 2.5264900662251657, "grad_norm": 0.758418025852073, "learning_rate": 0.0002536800718523122, "loss": 2.125, "step": 3815 }, { "epoch": 2.5271523178807946, "grad_norm": 0.8030682646351185, "learning_rate": 0.00025363182547794987, "loss": 2.3281, "step": 3816 }, { "epoch": 2.5278145695364236, "grad_norm": 0.7977846492405214, "learning_rate": 0.00025358355858329804, "loss": 2.2969, "step": 3817 }, { "epoch": 2.528476821192053, "grad_norm": 0.697532931937724, "learning_rate": 0.00025353527117791405, "loss": 1.9531, "step": 3818 }, { "epoch": 2.5291390728476824, "grad_norm": 0.7431021991406611, "learning_rate": 0.0002534869632713594, "loss": 2.2031, "step": 3819 }, { "epoch": 2.5298013245033113, "grad_norm": 0.7206783428453616, "learning_rate": 0.00025343863487319957, "loss": 2.1719, "step": 3820 }, { "epoch": 2.5304635761589402, "grad_norm": 0.7655455629253477, "learning_rate": 0.0002533902859930041, "loss": 1.8594, "step": 3821 }, { "epoch": 2.5311258278145696, "grad_norm": 0.8014182720001765, "learning_rate": 0.00025334191664034677, "loss": 2.0625, "step": 3822 }, { "epoch": 2.5317880794701986, "grad_norm": 0.8222967477942553, "learning_rate": 0.0002532935268248051, "loss": 2.1875, "step": 3823 }, { "epoch": 2.532450331125828, "grad_norm": 0.8357026003372796, "learning_rate": 0.00025324511655596093, "loss": 2.125, "step": 3824 }, { "epoch": 2.533112582781457, "grad_norm": 0.7276083069883442, "learning_rate": 0.0002531966858434, "loss": 1.7656, "step": 3825 }, { "epoch": 2.533774834437086, "grad_norm": 0.7588331997235731, "learning_rate": 0.0002531482346967121, "loss": 1.8906, "step": 3826 }, { "epoch": 2.5344370860927152, "grad_norm": 0.7996064951243721, "learning_rate": 0.00025309976312549125, "loss": 2.125, "step": 3827 }, { "epoch": 2.5350993377483446, "grad_norm": 0.8378376099200675, "learning_rate": 0.00025305127113933534, "loss": 2.2969, "step": 3828 }, { "epoch": 2.5357615894039736, "grad_norm": 0.7013426592142689, "learning_rate": 0.0002530027587478462, "loss": 1.6953, "step": 3829 }, { "epoch": 2.5364238410596025, "grad_norm": 0.7935753869658606, "learning_rate": 0.00025295422596063, "loss": 2.2969, "step": 3830 }, { "epoch": 2.537086092715232, "grad_norm": 0.7367119358807327, "learning_rate": 0.00025290567278729673, "loss": 1.9375, "step": 3831 }, { "epoch": 2.537748344370861, "grad_norm": 0.82439960677723, "learning_rate": 0.0002528570992374604, "loss": 2.1875, "step": 3832 }, { "epoch": 2.53841059602649, "grad_norm": 0.8318401715741789, "learning_rate": 0.0002528085053207393, "loss": 2.4844, "step": 3833 }, { "epoch": 2.539072847682119, "grad_norm": 0.8423100442204545, "learning_rate": 0.00025275989104675545, "loss": 2.3438, "step": 3834 }, { "epoch": 2.539735099337748, "grad_norm": 0.7580464044835085, "learning_rate": 0.00025271125642513506, "loss": 2.0625, "step": 3835 }, { "epoch": 2.5403973509933775, "grad_norm": 0.8202448178531816, "learning_rate": 0.0002526626014655083, "loss": 2.3125, "step": 3836 }, { "epoch": 2.541059602649007, "grad_norm": 0.8581297181064826, "learning_rate": 0.0002526139261775096, "loss": 2.2656, "step": 3837 }, { "epoch": 2.541721854304636, "grad_norm": 0.7770333772543426, "learning_rate": 0.000252565230570777, "loss": 2.0938, "step": 3838 }, { "epoch": 2.5423841059602648, "grad_norm": 0.7600408267322143, "learning_rate": 0.0002525165146549528, "loss": 2.1719, "step": 3839 }, { "epoch": 2.543046357615894, "grad_norm": 0.8145785006120807, "learning_rate": 0.0002524677784396835, "loss": 2.3281, "step": 3840 }, { "epoch": 2.543708609271523, "grad_norm": 0.7932561086411951, "learning_rate": 0.00025241902193461925, "loss": 1.8594, "step": 3841 }, { "epoch": 2.5443708609271525, "grad_norm": 0.816912772459861, "learning_rate": 0.0002523702451494145, "loss": 2.0312, "step": 3842 }, { "epoch": 2.5450331125827814, "grad_norm": 0.7893623463166362, "learning_rate": 0.00025232144809372755, "loss": 1.8203, "step": 3843 }, { "epoch": 2.5456953642384104, "grad_norm": 0.8119976781239691, "learning_rate": 0.0002522726307772208, "loss": 2.0938, "step": 3844 }, { "epoch": 2.5463576158940397, "grad_norm": 0.737490234660676, "learning_rate": 0.0002522237932095607, "loss": 1.9531, "step": 3845 }, { "epoch": 2.547019867549669, "grad_norm": 0.8219803837715125, "learning_rate": 0.0002521749354004175, "loss": 2.2188, "step": 3846 }, { "epoch": 2.547682119205298, "grad_norm": 0.746898840525575, "learning_rate": 0.00025212605735946573, "loss": 2.0781, "step": 3847 }, { "epoch": 2.548344370860927, "grad_norm": 0.84047601011077, "learning_rate": 0.0002520771590963838, "loss": 2.3281, "step": 3848 }, { "epoch": 2.5490066225165564, "grad_norm": 0.7470349945710681, "learning_rate": 0.0002520282406208541, "loss": 1.9141, "step": 3849 }, { "epoch": 2.5496688741721854, "grad_norm": 0.886438730134503, "learning_rate": 0.00025197930194256296, "loss": 2.2344, "step": 3850 }, { "epoch": 2.5503311258278147, "grad_norm": 0.8008237416313609, "learning_rate": 0.00025193034307120093, "loss": 2.0156, "step": 3851 }, { "epoch": 2.5509933774834437, "grad_norm": 0.7750035070561504, "learning_rate": 0.00025188136401646234, "loss": 2.3281, "step": 3852 }, { "epoch": 2.5516556291390726, "grad_norm": 0.8144553860891628, "learning_rate": 0.00025183236478804565, "loss": 2.375, "step": 3853 }, { "epoch": 2.552317880794702, "grad_norm": 0.7571409837859496, "learning_rate": 0.0002517833453956533, "loss": 1.9219, "step": 3854 }, { "epoch": 2.5529801324503314, "grad_norm": 0.8286387279034257, "learning_rate": 0.0002517343058489917, "loss": 2.125, "step": 3855 }, { "epoch": 2.5536423841059603, "grad_norm": 0.7662489255544056, "learning_rate": 0.0002516852461577711, "loss": 2.125, "step": 3856 }, { "epoch": 2.5543046357615893, "grad_norm": 0.776737139507395, "learning_rate": 0.000251636166331706, "loss": 2.1094, "step": 3857 }, { "epoch": 2.5549668874172187, "grad_norm": 0.7450599419848773, "learning_rate": 0.0002515870663805148, "loss": 2.0469, "step": 3858 }, { "epoch": 2.5556291390728476, "grad_norm": 0.8460261715935387, "learning_rate": 0.0002515379463139197, "loss": 2.3281, "step": 3859 }, { "epoch": 2.556291390728477, "grad_norm": 0.7901148467077456, "learning_rate": 0.0002514888061416472, "loss": 2.0781, "step": 3860 }, { "epoch": 2.556953642384106, "grad_norm": 0.7571472625483164, "learning_rate": 0.0002514396458734276, "loss": 1.9453, "step": 3861 }, { "epoch": 2.557615894039735, "grad_norm": 0.7446337855542936, "learning_rate": 0.00025139046551899503, "loss": 1.7344, "step": 3862 }, { "epoch": 2.5582781456953643, "grad_norm": 0.8532667038400915, "learning_rate": 0.000251341265088088, "loss": 2.375, "step": 3863 }, { "epoch": 2.558940397350993, "grad_norm": 0.7420028567272337, "learning_rate": 0.00025129204459044854, "loss": 2.0781, "step": 3864 }, { "epoch": 2.5596026490066226, "grad_norm": 0.7596643386979913, "learning_rate": 0.0002512428040358231, "loss": 1.9062, "step": 3865 }, { "epoch": 2.5602649006622515, "grad_norm": 0.7574154303372109, "learning_rate": 0.00025119354343396165, "loss": 1.9766, "step": 3866 }, { "epoch": 2.560927152317881, "grad_norm": 0.8264921010946084, "learning_rate": 0.00025114426279461844, "loss": 2.2812, "step": 3867 }, { "epoch": 2.56158940397351, "grad_norm": 0.7804975081695901, "learning_rate": 0.00025109496212755167, "loss": 2.1875, "step": 3868 }, { "epoch": 2.5622516556291393, "grad_norm": 0.6984941253379536, "learning_rate": 0.00025104564144252337, "loss": 1.7578, "step": 3869 }, { "epoch": 2.562913907284768, "grad_norm": 0.795031467703789, "learning_rate": 0.0002509963007492996, "loss": 2.125, "step": 3870 }, { "epoch": 2.563576158940397, "grad_norm": 0.8906735255965582, "learning_rate": 0.0002509469400576504, "loss": 2.4375, "step": 3871 }, { "epoch": 2.5642384105960265, "grad_norm": 0.7837441486020402, "learning_rate": 0.0002508975593773498, "loss": 2.2344, "step": 3872 }, { "epoch": 2.5649006622516555, "grad_norm": 0.7754712567649199, "learning_rate": 0.00025084815871817567, "loss": 2.0312, "step": 3873 }, { "epoch": 2.565562913907285, "grad_norm": 0.8368026191313812, "learning_rate": 0.00025079873808990993, "loss": 2.4062, "step": 3874 }, { "epoch": 2.566225165562914, "grad_norm": 0.8056979895304843, "learning_rate": 0.00025074929750233834, "loss": 2.0469, "step": 3875 }, { "epoch": 2.566887417218543, "grad_norm": 0.7737076296081229, "learning_rate": 0.00025069983696525086, "loss": 2.0938, "step": 3876 }, { "epoch": 2.567549668874172, "grad_norm": 0.8821511327502463, "learning_rate": 0.0002506503564884412, "loss": 2.2188, "step": 3877 }, { "epoch": 2.5682119205298015, "grad_norm": 0.7850543575422075, "learning_rate": 0.000250600856081707, "loss": 2.2969, "step": 3878 }, { "epoch": 2.5688741721854305, "grad_norm": 0.7974938824474251, "learning_rate": 0.00025055133575485, "loss": 2.3438, "step": 3879 }, { "epoch": 2.5695364238410594, "grad_norm": 0.8057804249562326, "learning_rate": 0.00025050179551767564, "loss": 2.0781, "step": 3880 }, { "epoch": 2.570198675496689, "grad_norm": 0.8380163782506149, "learning_rate": 0.0002504522353799936, "loss": 2.3125, "step": 3881 }, { "epoch": 2.5708609271523177, "grad_norm": 0.793542440330441, "learning_rate": 0.0002504026553516173, "loss": 2.4219, "step": 3882 }, { "epoch": 2.571523178807947, "grad_norm": 0.7196688817113375, "learning_rate": 0.0002503530554423642, "loss": 2.125, "step": 3883 }, { "epoch": 2.572185430463576, "grad_norm": 0.790425489076223, "learning_rate": 0.0002503034356620556, "loss": 2.125, "step": 3884 }, { "epoch": 2.5728476821192054, "grad_norm": 0.8057085002503803, "learning_rate": 0.00025025379602051673, "loss": 2.1719, "step": 3885 }, { "epoch": 2.5735099337748344, "grad_norm": 0.8376618130457825, "learning_rate": 0.00025020413652757697, "loss": 2.2812, "step": 3886 }, { "epoch": 2.5741721854304638, "grad_norm": 0.8016221358236878, "learning_rate": 0.0002501544571930693, "loss": 2.0938, "step": 3887 }, { "epoch": 2.5748344370860927, "grad_norm": 0.7847848188366371, "learning_rate": 0.0002501047580268309, "loss": 1.8672, "step": 3888 }, { "epoch": 2.5754966887417217, "grad_norm": 0.8862670117657797, "learning_rate": 0.0002500550390387028, "loss": 2.1875, "step": 3889 }, { "epoch": 2.576158940397351, "grad_norm": 0.8455305159506907, "learning_rate": 0.0002500053002385298, "loss": 2.3438, "step": 3890 }, { "epoch": 2.57682119205298, "grad_norm": 0.7520496874918873, "learning_rate": 0.00024995554163616084, "loss": 2.0156, "step": 3891 }, { "epoch": 2.5774834437086094, "grad_norm": 0.7916388899951395, "learning_rate": 0.0002499057632414487, "loss": 2.1406, "step": 3892 }, { "epoch": 2.5781456953642383, "grad_norm": 0.7721191100194016, "learning_rate": 0.00024985596506425003, "loss": 2.1094, "step": 3893 }, { "epoch": 2.5788079470198677, "grad_norm": 0.8198154597827289, "learning_rate": 0.0002498061471144255, "loss": 2.4688, "step": 3894 }, { "epoch": 2.5794701986754967, "grad_norm": 0.8064199334058884, "learning_rate": 0.00024975630940183966, "loss": 2.2188, "step": 3895 }, { "epoch": 2.580132450331126, "grad_norm": 0.7305130426522344, "learning_rate": 0.0002497064519363608, "loss": 2.0781, "step": 3896 }, { "epoch": 2.580794701986755, "grad_norm": 0.7498111705044653, "learning_rate": 0.0002496565747278614, "loss": 1.8984, "step": 3897 }, { "epoch": 2.581456953642384, "grad_norm": 0.7590683379340506, "learning_rate": 0.00024960667778621765, "loss": 2.0312, "step": 3898 }, { "epoch": 2.5821192052980133, "grad_norm": 0.783864640611157, "learning_rate": 0.00024955676112130976, "loss": 2.0156, "step": 3899 }, { "epoch": 2.5827814569536423, "grad_norm": 0.7770826105235369, "learning_rate": 0.0002495068247430218, "loss": 2.0312, "step": 3900 }, { "epoch": 2.5834437086092716, "grad_norm": 0.800684155541999, "learning_rate": 0.0002494568686612417, "loss": 2.3281, "step": 3901 }, { "epoch": 2.5841059602649006, "grad_norm": 0.7634111481055363, "learning_rate": 0.00024940689288586136, "loss": 1.9531, "step": 3902 }, { "epoch": 2.58476821192053, "grad_norm": 0.7832052277772961, "learning_rate": 0.0002493568974267766, "loss": 2.1406, "step": 3903 }, { "epoch": 2.585430463576159, "grad_norm": 0.8647448520588311, "learning_rate": 0.000249306882293887, "loss": 2.4062, "step": 3904 }, { "epoch": 2.5860927152317883, "grad_norm": 0.7974701684893966, "learning_rate": 0.0002492568474970963, "loss": 2.1875, "step": 3905 }, { "epoch": 2.5867549668874172, "grad_norm": 0.8444881083020391, "learning_rate": 0.00024920679304631175, "loss": 2.3125, "step": 3906 }, { "epoch": 2.587417218543046, "grad_norm": 0.8457713890441648, "learning_rate": 0.00024915671895144487, "loss": 2.0469, "step": 3907 }, { "epoch": 2.5880794701986756, "grad_norm": 0.7906509776985224, "learning_rate": 0.0002491066252224108, "loss": 2.1719, "step": 3908 }, { "epoch": 2.5887417218543045, "grad_norm": 0.7299289061804439, "learning_rate": 0.0002490565118691288, "loss": 2.0469, "step": 3909 }, { "epoch": 2.589403973509934, "grad_norm": 0.7905565234884893, "learning_rate": 0.0002490063789015217, "loss": 2.125, "step": 3910 }, { "epoch": 2.590066225165563, "grad_norm": 0.7683221796802914, "learning_rate": 0.0002489562263295166, "loss": 2.0469, "step": 3911 }, { "epoch": 2.590728476821192, "grad_norm": 0.8803555791993387, "learning_rate": 0.0002489060541630441, "loss": 2.2031, "step": 3912 }, { "epoch": 2.591390728476821, "grad_norm": 0.7626386118997812, "learning_rate": 0.00024885586241203905, "loss": 1.75, "step": 3913 }, { "epoch": 2.5920529801324506, "grad_norm": 0.8097270998924269, "learning_rate": 0.0002488056510864398, "loss": 2.1875, "step": 3914 }, { "epoch": 2.5927152317880795, "grad_norm": 0.7915051937119802, "learning_rate": 0.0002487554201961889, "loss": 2.3438, "step": 3915 }, { "epoch": 2.5933774834437084, "grad_norm": 0.7859923727052803, "learning_rate": 0.0002487051697512326, "loss": 2.0781, "step": 3916 }, { "epoch": 2.594039735099338, "grad_norm": 0.8268539256371864, "learning_rate": 0.0002486548997615211, "loss": 2.0, "step": 3917 }, { "epoch": 2.5947019867549668, "grad_norm": 0.7464508124294178, "learning_rate": 0.0002486046102370084, "loss": 2.2344, "step": 3918 }, { "epoch": 2.595364238410596, "grad_norm": 0.8414754123614265, "learning_rate": 0.0002485543011876524, "loss": 2.1875, "step": 3919 }, { "epoch": 2.596026490066225, "grad_norm": 0.8364486476440537, "learning_rate": 0.0002485039726234149, "loss": 2.375, "step": 3920 }, { "epoch": 2.596688741721854, "grad_norm": 0.8499332486914752, "learning_rate": 0.00024845362455426143, "loss": 2.1562, "step": 3921 }, { "epoch": 2.5973509933774834, "grad_norm": 0.790382090333589, "learning_rate": 0.00024840325699016164, "loss": 2.2188, "step": 3922 }, { "epoch": 2.598013245033113, "grad_norm": 0.8460013384668975, "learning_rate": 0.0002483528699410888, "loss": 2.5, "step": 3923 }, { "epoch": 2.5986754966887418, "grad_norm": 0.8201312812497731, "learning_rate": 0.00024830246341702014, "loss": 2.4219, "step": 3924 }, { "epoch": 2.5993377483443707, "grad_norm": 0.8111659467613763, "learning_rate": 0.0002482520374279367, "loss": 2.2656, "step": 3925 }, { "epoch": 2.6, "grad_norm": 0.740928949865739, "learning_rate": 0.00024820159198382343, "loss": 2.0625, "step": 3926 }, { "epoch": 2.600662251655629, "grad_norm": 0.8269110977117258, "learning_rate": 0.0002481511270946691, "loss": 2.2812, "step": 3927 }, { "epoch": 2.6013245033112584, "grad_norm": 0.753430514367967, "learning_rate": 0.0002481006427704663, "loss": 2.125, "step": 3928 }, { "epoch": 2.6019867549668874, "grad_norm": 0.7126898240367171, "learning_rate": 0.00024805013902121163, "loss": 1.7266, "step": 3929 }, { "epoch": 2.6026490066225163, "grad_norm": 0.8762948430242081, "learning_rate": 0.0002479996158569053, "loss": 2.2031, "step": 3930 }, { "epoch": 2.6033112582781457, "grad_norm": 0.7843506402537388, "learning_rate": 0.00024794907328755145, "loss": 2.0469, "step": 3931 }, { "epoch": 2.603973509933775, "grad_norm": 0.7381331086742765, "learning_rate": 0.00024789851132315816, "loss": 2.1094, "step": 3932 }, { "epoch": 2.604635761589404, "grad_norm": 0.7670185246648662, "learning_rate": 0.0002478479299737373, "loss": 2.125, "step": 3933 }, { "epoch": 2.605298013245033, "grad_norm": 0.8211015603506455, "learning_rate": 0.0002477973292493045, "loss": 2.2188, "step": 3934 }, { "epoch": 2.6059602649006623, "grad_norm": 0.8443723710307479, "learning_rate": 0.0002477467091598793, "loss": 2.3281, "step": 3935 }, { "epoch": 2.6066225165562913, "grad_norm": 0.7899296007156675, "learning_rate": 0.00024769606971548506, "loss": 2.3125, "step": 3936 }, { "epoch": 2.6072847682119207, "grad_norm": 0.9183019042447146, "learning_rate": 0.0002476454109261489, "loss": 2.2188, "step": 3937 }, { "epoch": 2.6079470198675496, "grad_norm": 0.7841027032669037, "learning_rate": 0.00024759473280190197, "loss": 2.1719, "step": 3938 }, { "epoch": 2.6086092715231786, "grad_norm": 0.7833066897542446, "learning_rate": 0.000247544035352779, "loss": 2.0625, "step": 3939 }, { "epoch": 2.609271523178808, "grad_norm": 0.7837985822577075, "learning_rate": 0.0002474933185888188, "loss": 1.7188, "step": 3940 }, { "epoch": 2.6099337748344373, "grad_norm": 0.7456934853250844, "learning_rate": 0.00024744258252006377, "loss": 1.9844, "step": 3941 }, { "epoch": 2.6105960264900663, "grad_norm": 0.7907739402003513, "learning_rate": 0.0002473918271565603, "loss": 2.0938, "step": 3942 }, { "epoch": 2.611258278145695, "grad_norm": 0.754651893790459, "learning_rate": 0.0002473410525083584, "loss": 2.1562, "step": 3943 }, { "epoch": 2.6119205298013246, "grad_norm": 0.7631920670205864, "learning_rate": 0.0002472902585855122, "loss": 2.1719, "step": 3944 }, { "epoch": 2.6125827814569536, "grad_norm": 0.8016581288052913, "learning_rate": 0.0002472394453980793, "loss": 1.9922, "step": 3945 }, { "epoch": 2.613245033112583, "grad_norm": 0.7526853919703954, "learning_rate": 0.00024718861295612153, "loss": 2.1875, "step": 3946 }, { "epoch": 2.613907284768212, "grad_norm": 0.7771445276399125, "learning_rate": 0.00024713776126970415, "loss": 2.2344, "step": 3947 }, { "epoch": 2.614569536423841, "grad_norm": 0.7729366534333044, "learning_rate": 0.0002470868903488963, "loss": 2.0312, "step": 3948 }, { "epoch": 2.61523178807947, "grad_norm": 0.888943939173716, "learning_rate": 0.00024703600020377115, "loss": 2.1875, "step": 3949 }, { "epoch": 2.6158940397350996, "grad_norm": 0.7831434195196432, "learning_rate": 0.0002469850908444055, "loss": 2.3281, "step": 3950 }, { "epoch": 2.6165562913907285, "grad_norm": 0.7718073110457113, "learning_rate": 0.0002469341622808799, "loss": 1.8359, "step": 3951 }, { "epoch": 2.6172185430463575, "grad_norm": 0.8134867977303436, "learning_rate": 0.0002468832145232789, "loss": 2.0469, "step": 3952 }, { "epoch": 2.617880794701987, "grad_norm": 0.8340184999293999, "learning_rate": 0.00024683224758169066, "loss": 1.8594, "step": 3953 }, { "epoch": 2.618543046357616, "grad_norm": 0.8092752229261603, "learning_rate": 0.00024678126146620725, "loss": 2.4062, "step": 3954 }, { "epoch": 2.619205298013245, "grad_norm": 0.7871138882713193, "learning_rate": 0.0002467302561869246, "loss": 2.0781, "step": 3955 }, { "epoch": 2.619867549668874, "grad_norm": 0.8734304829454115, "learning_rate": 0.00024667923175394213, "loss": 2.4531, "step": 3956 }, { "epoch": 2.620529801324503, "grad_norm": 0.7617866558802842, "learning_rate": 0.0002466281881773635, "loss": 2.0781, "step": 3957 }, { "epoch": 2.6211920529801325, "grad_norm": 0.8235196479221422, "learning_rate": 0.0002465771254672957, "loss": 2.125, "step": 3958 }, { "epoch": 2.621854304635762, "grad_norm": 0.8494079559319179, "learning_rate": 0.0002465260436338499, "loss": 2.1094, "step": 3959 }, { "epoch": 2.622516556291391, "grad_norm": 0.7950793798216379, "learning_rate": 0.0002464749426871408, "loss": 2.0156, "step": 3960 }, { "epoch": 2.6231788079470197, "grad_norm": 0.8087438620222626, "learning_rate": 0.000246423822637287, "loss": 2.2969, "step": 3961 }, { "epoch": 2.623841059602649, "grad_norm": 0.7970314748320818, "learning_rate": 0.00024637268349441083, "loss": 2.25, "step": 3962 }, { "epoch": 2.624503311258278, "grad_norm": 0.7772945032674179, "learning_rate": 0.0002463215252686385, "loss": 2.0781, "step": 3963 }, { "epoch": 2.6251655629139075, "grad_norm": 0.7574067714493823, "learning_rate": 0.0002462703479700998, "loss": 2.1719, "step": 3964 }, { "epoch": 2.6258278145695364, "grad_norm": 0.8778551151112557, "learning_rate": 0.0002462191516089286, "loss": 2.2812, "step": 3965 }, { "epoch": 2.6264900662251653, "grad_norm": 0.7474688429951181, "learning_rate": 0.0002461679361952622, "loss": 1.7812, "step": 3966 }, { "epoch": 2.6271523178807947, "grad_norm": 0.9117482978222983, "learning_rate": 0.00024611670173924196, "loss": 2.4375, "step": 3967 }, { "epoch": 2.627814569536424, "grad_norm": 0.8099232061960885, "learning_rate": 0.0002460654482510128, "loss": 2.2344, "step": 3968 }, { "epoch": 2.628476821192053, "grad_norm": 0.7650339671617369, "learning_rate": 0.00024601417574072353, "loss": 2.0312, "step": 3969 }, { "epoch": 2.629139072847682, "grad_norm": 0.7350188181502101, "learning_rate": 0.0002459628842185267, "loss": 2.1406, "step": 3970 }, { "epoch": 2.6298013245033114, "grad_norm": 0.7883223074748857, "learning_rate": 0.0002459115736945786, "loss": 2.0469, "step": 3971 }, { "epoch": 2.6304635761589403, "grad_norm": 0.9102055676408702, "learning_rate": 0.00024586024417903933, "loss": 2.3906, "step": 3972 }, { "epoch": 2.6311258278145697, "grad_norm": 0.8043078862466365, "learning_rate": 0.0002458088956820727, "loss": 2.2188, "step": 3973 }, { "epoch": 2.6317880794701987, "grad_norm": 0.8589414189521787, "learning_rate": 0.00024575752821384635, "loss": 2.1562, "step": 3974 }, { "epoch": 2.6324503311258276, "grad_norm": 0.7455054163511606, "learning_rate": 0.00024570614178453154, "loss": 1.9141, "step": 3975 }, { "epoch": 2.633112582781457, "grad_norm": 0.8856875005289148, "learning_rate": 0.00024565473640430347, "loss": 2.4688, "step": 3976 }, { "epoch": 2.633774834437086, "grad_norm": 0.7536702614283786, "learning_rate": 0.0002456033120833409, "loss": 2.0156, "step": 3977 }, { "epoch": 2.6344370860927153, "grad_norm": 0.7445244313387426, "learning_rate": 0.00024555186883182646, "loss": 2.0938, "step": 3978 }, { "epoch": 2.6350993377483443, "grad_norm": 0.8744795133062511, "learning_rate": 0.00024550040665994655, "loss": 2.2031, "step": 3979 }, { "epoch": 2.6357615894039736, "grad_norm": 0.7934506137321837, "learning_rate": 0.0002454489255778913, "loss": 2.3125, "step": 3980 }, { "epoch": 2.6364238410596026, "grad_norm": 0.8092477343568119, "learning_rate": 0.00024539742559585447, "loss": 2.0781, "step": 3981 }, { "epoch": 2.637086092715232, "grad_norm": 0.79372374669347, "learning_rate": 0.00024534590672403364, "loss": 2.0312, "step": 3982 }, { "epoch": 2.637748344370861, "grad_norm": 0.8424438035006498, "learning_rate": 0.00024529436897263024, "loss": 2.3438, "step": 3983 }, { "epoch": 2.63841059602649, "grad_norm": 0.7398599092464678, "learning_rate": 0.0002452428123518492, "loss": 1.9531, "step": 3984 }, { "epoch": 2.6390728476821192, "grad_norm": 0.8203314886054506, "learning_rate": 0.00024519123687189945, "loss": 2.0, "step": 3985 }, { "epoch": 2.639735099337748, "grad_norm": 2.3659414962017173, "learning_rate": 0.0002451396425429934, "loss": 2.1094, "step": 3986 }, { "epoch": 2.6403973509933776, "grad_norm": 0.7733714356852375, "learning_rate": 0.00024508802937534745, "loss": 2.0938, "step": 3987 }, { "epoch": 2.6410596026490065, "grad_norm": 0.8700694156492562, "learning_rate": 0.0002450363973791815, "loss": 2.1875, "step": 3988 }, { "epoch": 2.641721854304636, "grad_norm": 0.8500432520360289, "learning_rate": 0.0002449847465647193, "loss": 2.1719, "step": 3989 }, { "epoch": 2.642384105960265, "grad_norm": 0.9578931012029728, "learning_rate": 0.0002449330769421884, "loss": 2.3125, "step": 3990 }, { "epoch": 2.6430463576158942, "grad_norm": 0.8358342069810146, "learning_rate": 0.00024488138852181977, "loss": 2.3438, "step": 3991 }, { "epoch": 2.643708609271523, "grad_norm": 0.8090430422725822, "learning_rate": 0.00024482968131384845, "loss": 2.2344, "step": 3992 }, { "epoch": 2.644370860927152, "grad_norm": 0.8180062559859658, "learning_rate": 0.00024477795532851313, "loss": 2.2656, "step": 3993 }, { "epoch": 2.6450331125827815, "grad_norm": 0.8476654346437353, "learning_rate": 0.000244726210576056, "loss": 2.2188, "step": 3994 }, { "epoch": 2.6456953642384105, "grad_norm": 0.7187959212732824, "learning_rate": 0.0002446744470667232, "loss": 1.6953, "step": 3995 }, { "epoch": 2.64635761589404, "grad_norm": 0.8217597341706413, "learning_rate": 0.0002446226648107645, "loss": 1.7734, "step": 3996 }, { "epoch": 2.647019867549669, "grad_norm": 0.8090516019370594, "learning_rate": 0.0002445708638184333, "loss": 2.1406, "step": 3997 }, { "epoch": 2.647682119205298, "grad_norm": 0.774686018576076, "learning_rate": 0.00024451904409998685, "loss": 1.9219, "step": 3998 }, { "epoch": 2.648344370860927, "grad_norm": 0.7417660107897827, "learning_rate": 0.00024446720566568604, "loss": 2.0312, "step": 3999 }, { "epoch": 2.6490066225165565, "grad_norm": 0.8030055785026562, "learning_rate": 0.0002444153485257955, "loss": 2.3125, "step": 4000 }, { "epoch": 2.6496688741721854, "grad_norm": 0.7537488544454412, "learning_rate": 0.0002443634726905835, "loss": 2.0625, "step": 4001 }, { "epoch": 2.6503311258278144, "grad_norm": 0.7683872660779115, "learning_rate": 0.0002443115781703221, "loss": 2.0938, "step": 4002 }, { "epoch": 2.6509933774834438, "grad_norm": 0.8186140957195706, "learning_rate": 0.00024425966497528695, "loss": 2.3125, "step": 4003 }, { "epoch": 2.6516556291390727, "grad_norm": 0.8579295748754989, "learning_rate": 0.0002442077331157575, "loss": 2.3281, "step": 4004 }, { "epoch": 2.652317880794702, "grad_norm": 0.8304913858395438, "learning_rate": 0.0002441557826020168, "loss": 2.2344, "step": 4005 }, { "epoch": 2.652980132450331, "grad_norm": 0.7733390407939369, "learning_rate": 0.00024410381344435172, "loss": 1.8516, "step": 4006 }, { "epoch": 2.6536423841059604, "grad_norm": 0.748787941146656, "learning_rate": 0.00024405182565305267, "loss": 1.7969, "step": 4007 }, { "epoch": 2.6543046357615894, "grad_norm": 0.7143804740604952, "learning_rate": 0.00024399981923841393, "loss": 1.9453, "step": 4008 }, { "epoch": 2.6549668874172188, "grad_norm": 0.8282545433536421, "learning_rate": 0.00024394779421073326, "loss": 2.0781, "step": 4009 }, { "epoch": 2.6556291390728477, "grad_norm": 0.7340440023406857, "learning_rate": 0.00024389575058031225, "loss": 2.0781, "step": 4010 }, { "epoch": 2.6562913907284766, "grad_norm": 0.7749512111393109, "learning_rate": 0.00024384368835745615, "loss": 2.2656, "step": 4011 }, { "epoch": 2.656953642384106, "grad_norm": 0.8580400754296054, "learning_rate": 0.00024379160755247392, "loss": 2.5312, "step": 4012 }, { "epoch": 2.657615894039735, "grad_norm": 0.8333838902818076, "learning_rate": 0.0002437395081756781, "loss": 2.4062, "step": 4013 }, { "epoch": 2.6582781456953644, "grad_norm": 0.8579084646959299, "learning_rate": 0.00024368739023738494, "loss": 2.3438, "step": 4014 }, { "epoch": 2.6589403973509933, "grad_norm": 0.7854927141630277, "learning_rate": 0.00024363525374791444, "loss": 2.2656, "step": 4015 }, { "epoch": 2.6596026490066222, "grad_norm": 0.7361428925110682, "learning_rate": 0.0002435830987175902, "loss": 2.1094, "step": 4016 }, { "epoch": 2.6602649006622516, "grad_norm": 0.7222168902269966, "learning_rate": 0.00024353092515673951, "loss": 2.1094, "step": 4017 }, { "epoch": 2.660927152317881, "grad_norm": 0.7049282195647464, "learning_rate": 0.00024347873307569334, "loss": 2.2812, "step": 4018 }, { "epoch": 2.66158940397351, "grad_norm": 0.780329558805766, "learning_rate": 0.00024342652248478635, "loss": 2.1094, "step": 4019 }, { "epoch": 2.662251655629139, "grad_norm": 0.7108235463720955, "learning_rate": 0.0002433742933943568, "loss": 1.7812, "step": 4020 }, { "epoch": 2.6629139072847683, "grad_norm": 0.7431386827151667, "learning_rate": 0.00024332204581474664, "loss": 2.2969, "step": 4021 }, { "epoch": 2.6635761589403972, "grad_norm": 0.827379805174418, "learning_rate": 0.00024326977975630152, "loss": 2.1875, "step": 4022 }, { "epoch": 2.6642384105960266, "grad_norm": 0.7440660791752031, "learning_rate": 0.0002432174952293707, "loss": 1.9609, "step": 4023 }, { "epoch": 2.6649006622516556, "grad_norm": 0.756484251795818, "learning_rate": 0.0002431651922443071, "loss": 2.2031, "step": 4024 }, { "epoch": 2.6655629139072845, "grad_norm": 0.8497077727828823, "learning_rate": 0.0002431128708114674, "loss": 2.3125, "step": 4025 }, { "epoch": 2.666225165562914, "grad_norm": 0.8406128928214927, "learning_rate": 0.00024306053094121173, "loss": 2.3594, "step": 4026 }, { "epoch": 2.6668874172185433, "grad_norm": 0.7341871201159577, "learning_rate": 0.00024300817264390402, "loss": 2.0938, "step": 4027 }, { "epoch": 2.667549668874172, "grad_norm": 0.8009228829165581, "learning_rate": 0.00024295579592991182, "loss": 2.1562, "step": 4028 }, { "epoch": 2.668211920529801, "grad_norm": 0.7767802387545462, "learning_rate": 0.00024290340080960633, "loss": 2.4219, "step": 4029 }, { "epoch": 2.6688741721854305, "grad_norm": 0.7821838881125265, "learning_rate": 0.00024285098729336237, "loss": 2.2031, "step": 4030 }, { "epoch": 2.6695364238410595, "grad_norm": 0.7676080396302879, "learning_rate": 0.00024279855539155842, "loss": 2.1875, "step": 4031 }, { "epoch": 2.670198675496689, "grad_norm": 0.7184472072671353, "learning_rate": 0.00024274610511457664, "loss": 1.7891, "step": 4032 }, { "epoch": 2.670860927152318, "grad_norm": 0.7743859240278532, "learning_rate": 0.0002426936364728027, "loss": 2.2031, "step": 4033 }, { "epoch": 2.6715231788079468, "grad_norm": 0.7921646708238572, "learning_rate": 0.00024264114947662607, "loss": 2.0781, "step": 4034 }, { "epoch": 2.672185430463576, "grad_norm": 0.7675419751042976, "learning_rate": 0.00024258864413643976, "loss": 1.9297, "step": 4035 }, { "epoch": 2.6728476821192055, "grad_norm": 0.7522121126870421, "learning_rate": 0.00024253612046264042, "loss": 1.8438, "step": 4036 }, { "epoch": 2.6735099337748345, "grad_norm": 0.7766544368418868, "learning_rate": 0.00024248357846562828, "loss": 1.8047, "step": 4037 }, { "epoch": 2.6741721854304634, "grad_norm": 0.8508738094951402, "learning_rate": 0.00024243101815580735, "loss": 2.375, "step": 4038 }, { "epoch": 2.674834437086093, "grad_norm": 0.8154811171532356, "learning_rate": 0.00024237843954358514, "loss": 2.125, "step": 4039 }, { "epoch": 2.6754966887417218, "grad_norm": 0.8250995363598963, "learning_rate": 0.00024232584263937282, "loss": 2.0938, "step": 4040 }, { "epoch": 2.676158940397351, "grad_norm": 0.7586389656041453, "learning_rate": 0.00024227322745358517, "loss": 2.0781, "step": 4041 }, { "epoch": 2.67682119205298, "grad_norm": 0.7548252587657602, "learning_rate": 0.00024222059399664063, "loss": 2.0312, "step": 4042 }, { "epoch": 2.677483443708609, "grad_norm": 0.8453310966408427, "learning_rate": 0.00024216794227896113, "loss": 2.0781, "step": 4043 }, { "epoch": 2.6781456953642384, "grad_norm": 0.7950819885686008, "learning_rate": 0.00024211527231097243, "loss": 2.1719, "step": 4044 }, { "epoch": 2.678807947019868, "grad_norm": 0.9130463106753197, "learning_rate": 0.0002420625841031038, "loss": 2.3438, "step": 4045 }, { "epoch": 2.6794701986754967, "grad_norm": 0.8087856731625019, "learning_rate": 0.00024200987766578798, "loss": 2.2031, "step": 4046 }, { "epoch": 2.6801324503311257, "grad_norm": 0.7543452574716366, "learning_rate": 0.00024195715300946152, "loss": 2.0312, "step": 4047 }, { "epoch": 2.680794701986755, "grad_norm": 0.7674752929308315, "learning_rate": 0.00024190441014456459, "loss": 2.0469, "step": 4048 }, { "epoch": 2.681456953642384, "grad_norm": 0.8765161929005814, "learning_rate": 0.0002418516490815407, "loss": 2.3438, "step": 4049 }, { "epoch": 2.6821192052980134, "grad_norm": 0.7862718953691918, "learning_rate": 0.00024179886983083734, "loss": 2.3125, "step": 4050 }, { "epoch": 2.6827814569536423, "grad_norm": 0.8337623188986159, "learning_rate": 0.00024174607240290524, "loss": 2.2812, "step": 4051 }, { "epoch": 2.6834437086092713, "grad_norm": 0.855831125107986, "learning_rate": 0.000241693256808199, "loss": 2.25, "step": 4052 }, { "epoch": 2.6841059602649007, "grad_norm": 0.8105043828491679, "learning_rate": 0.0002416404230571767, "loss": 2.2031, "step": 4053 }, { "epoch": 2.68476821192053, "grad_norm": 0.747480014295251, "learning_rate": 0.0002415875711603, "loss": 1.9766, "step": 4054 }, { "epoch": 2.685430463576159, "grad_norm": 0.7977605909946265, "learning_rate": 0.0002415347011280342, "loss": 2.3281, "step": 4055 }, { "epoch": 2.686092715231788, "grad_norm": 0.8573859371801149, "learning_rate": 0.00024148181297084815, "loss": 2.4688, "step": 4056 }, { "epoch": 2.6867549668874173, "grad_norm": 0.7949748565568895, "learning_rate": 0.00024142890669921432, "loss": 2.0312, "step": 4057 }, { "epoch": 2.6874172185430463, "grad_norm": 0.7581592734645108, "learning_rate": 0.0002413759823236088, "loss": 2.0938, "step": 4058 }, { "epoch": 2.6880794701986757, "grad_norm": 0.7348714802604301, "learning_rate": 0.00024132303985451115, "loss": 1.9453, "step": 4059 }, { "epoch": 2.6887417218543046, "grad_norm": 0.8397994809086442, "learning_rate": 0.00024127007930240462, "loss": 1.8906, "step": 4060 }, { "epoch": 2.6894039735099335, "grad_norm": 0.7951997520906008, "learning_rate": 0.00024121710067777604, "loss": 2.2031, "step": 4061 }, { "epoch": 2.690066225165563, "grad_norm": 0.7796389670076017, "learning_rate": 0.00024116410399111575, "loss": 2.0156, "step": 4062 }, { "epoch": 2.6907284768211923, "grad_norm": 0.7922145430230525, "learning_rate": 0.00024111108925291774, "loss": 2.2656, "step": 4063 }, { "epoch": 2.6913907284768213, "grad_norm": 0.8331854757390216, "learning_rate": 0.00024105805647367946, "loss": 2.3906, "step": 4064 }, { "epoch": 2.69205298013245, "grad_norm": 0.7918863150750985, "learning_rate": 0.00024100500566390207, "loss": 2.2031, "step": 4065 }, { "epoch": 2.6927152317880796, "grad_norm": 0.752840711001402, "learning_rate": 0.0002409519368340902, "loss": 2.2969, "step": 4066 }, { "epoch": 2.6933774834437085, "grad_norm": 0.7106618693281918, "learning_rate": 0.0002408988499947521, "loss": 1.9141, "step": 4067 }, { "epoch": 2.694039735099338, "grad_norm": 0.7737054514426718, "learning_rate": 0.0002408457451563996, "loss": 2.0625, "step": 4068 }, { "epoch": 2.694701986754967, "grad_norm": 0.6913925395782106, "learning_rate": 0.00024079262232954804, "loss": 1.8516, "step": 4069 }, { "epoch": 2.695364238410596, "grad_norm": 0.7366946121495257, "learning_rate": 0.00024073948152471633, "loss": 2.1875, "step": 4070 }, { "epoch": 2.696026490066225, "grad_norm": 0.7768428160721613, "learning_rate": 0.00024068632275242707, "loss": 2.3594, "step": 4071 }, { "epoch": 2.6966887417218546, "grad_norm": 0.7396938121941722, "learning_rate": 0.00024063314602320614, "loss": 1.9922, "step": 4072 }, { "epoch": 2.6973509933774835, "grad_norm": 0.7397235321847527, "learning_rate": 0.00024057995134758325, "loss": 2.0781, "step": 4073 }, { "epoch": 2.6980132450331125, "grad_norm": 0.7385817568598578, "learning_rate": 0.00024052673873609155, "loss": 2.2031, "step": 4074 }, { "epoch": 2.698675496688742, "grad_norm": 0.7766812521645137, "learning_rate": 0.00024047350819926766, "loss": 2.0625, "step": 4075 }, { "epoch": 2.699337748344371, "grad_norm": 0.7229966825822027, "learning_rate": 0.00024042025974765194, "loss": 1.8125, "step": 4076 }, { "epoch": 2.7, "grad_norm": 0.7041930470206346, "learning_rate": 0.00024036699339178815, "loss": 1.8594, "step": 4077 }, { "epoch": 2.700662251655629, "grad_norm": 0.747757219408075, "learning_rate": 0.00024031370914222365, "loss": 2.2031, "step": 4078 }, { "epoch": 2.701324503311258, "grad_norm": 0.7295858143041319, "learning_rate": 0.00024026040700950933, "loss": 1.9062, "step": 4079 }, { "epoch": 2.7019867549668874, "grad_norm": 0.8448136950281003, "learning_rate": 0.00024020708700419962, "loss": 2.0938, "step": 4080 }, { "epoch": 2.7026490066225164, "grad_norm": 0.8024408689093352, "learning_rate": 0.0002401537491368525, "loss": 2.0469, "step": 4081 }, { "epoch": 2.703311258278146, "grad_norm": 0.777671545450392, "learning_rate": 0.00024010039341802945, "loss": 2.1562, "step": 4082 }, { "epoch": 2.7039735099337747, "grad_norm": 0.7864116301787656, "learning_rate": 0.00024004701985829555, "loss": 2.375, "step": 4083 }, { "epoch": 2.704635761589404, "grad_norm": 0.820749465471322, "learning_rate": 0.00023999362846821934, "loss": 2.2031, "step": 4084 }, { "epoch": 2.705298013245033, "grad_norm": 0.6788171367973583, "learning_rate": 0.000239940219258373, "loss": 1.9141, "step": 4085 }, { "epoch": 2.7059602649006624, "grad_norm": 0.7762608211396803, "learning_rate": 0.00023988679223933205, "loss": 1.9141, "step": 4086 }, { "epoch": 2.7066225165562914, "grad_norm": 0.7785415522662342, "learning_rate": 0.00023983334742167577, "loss": 2.2188, "step": 4087 }, { "epoch": 2.7072847682119203, "grad_norm": 0.7513878047620702, "learning_rate": 0.00023977988481598675, "loss": 2.0781, "step": 4088 }, { "epoch": 2.7079470198675497, "grad_norm": 0.8292747168228135, "learning_rate": 0.0002397264044328513, "loss": 2.2188, "step": 4089 }, { "epoch": 2.7086092715231787, "grad_norm": 0.7144389068603944, "learning_rate": 0.00023967290628285908, "loss": 2.1094, "step": 4090 }, { "epoch": 2.709271523178808, "grad_norm": 0.7015301683643281, "learning_rate": 0.00023961939037660333, "loss": 1.7578, "step": 4091 }, { "epoch": 2.709933774834437, "grad_norm": 0.7604320477329195, "learning_rate": 0.0002395658567246808, "loss": 2.2812, "step": 4092 }, { "epoch": 2.7105960264900664, "grad_norm": 0.6960850765348368, "learning_rate": 0.00023951230533769186, "loss": 1.9844, "step": 4093 }, { "epoch": 2.7112582781456953, "grad_norm": 0.7404614022994347, "learning_rate": 0.00023945873622624022, "loss": 2.1094, "step": 4094 }, { "epoch": 2.7119205298013247, "grad_norm": 0.7436867083058829, "learning_rate": 0.00023940514940093318, "loss": 1.9141, "step": 4095 }, { "epoch": 2.7125827814569536, "grad_norm": 0.7693258187055638, "learning_rate": 0.0002393515448723816, "loss": 2.1875, "step": 4096 }, { "epoch": 2.7132450331125826, "grad_norm": 0.7990908124917558, "learning_rate": 0.00023929792265119971, "loss": 2.0781, "step": 4097 }, { "epoch": 2.713907284768212, "grad_norm": 0.8083064697611291, "learning_rate": 0.00023924428274800538, "loss": 2.0938, "step": 4098 }, { "epoch": 2.714569536423841, "grad_norm": 0.749556259702483, "learning_rate": 0.00023919062517341991, "loss": 2.2031, "step": 4099 }, { "epoch": 2.7152317880794703, "grad_norm": 0.8235749489946774, "learning_rate": 0.00023913694993806813, "loss": 2.2031, "step": 4100 }, { "epoch": 2.7158940397350992, "grad_norm": 0.8172201680805598, "learning_rate": 0.00023908325705257835, "loss": 2.1875, "step": 4101 }, { "epoch": 2.7165562913907286, "grad_norm": 0.7881146108637637, "learning_rate": 0.00023902954652758234, "loss": 2.1406, "step": 4102 }, { "epoch": 2.7172185430463576, "grad_norm": 0.7826072574917247, "learning_rate": 0.00023897581837371547, "loss": 2.1875, "step": 4103 }, { "epoch": 2.717880794701987, "grad_norm": 0.768843424187108, "learning_rate": 0.00023892207260161645, "loss": 2.0938, "step": 4104 }, { "epoch": 2.718543046357616, "grad_norm": 0.7755798806671983, "learning_rate": 0.00023886830922192757, "loss": 2.0469, "step": 4105 }, { "epoch": 2.719205298013245, "grad_norm": 0.7965834623867012, "learning_rate": 0.00023881452824529468, "loss": 2.1406, "step": 4106 }, { "epoch": 2.7198675496688742, "grad_norm": 0.7628454897438512, "learning_rate": 0.00023876072968236694, "loss": 2.0, "step": 4107 }, { "epoch": 2.720529801324503, "grad_norm": 0.8022246004686868, "learning_rate": 0.00023870691354379713, "loss": 2.375, "step": 4108 }, { "epoch": 2.7211920529801326, "grad_norm": 0.8422897363595795, "learning_rate": 0.00023865307984024145, "loss": 2.1875, "step": 4109 }, { "epoch": 2.7218543046357615, "grad_norm": 0.768094710039755, "learning_rate": 0.0002385992285823596, "loss": 2.0469, "step": 4110 }, { "epoch": 2.722516556291391, "grad_norm": 0.7653191461578709, "learning_rate": 0.00023854535978081465, "loss": 2.0, "step": 4111 }, { "epoch": 2.72317880794702, "grad_norm": 0.8050306173241116, "learning_rate": 0.00023849147344627341, "loss": 2.3594, "step": 4112 }, { "epoch": 2.723841059602649, "grad_norm": 0.7611514146372146, "learning_rate": 0.00023843756958940584, "loss": 1.9609, "step": 4113 }, { "epoch": 2.724503311258278, "grad_norm": 0.7287454424721603, "learning_rate": 0.00023838364822088566, "loss": 2.0, "step": 4114 }, { "epoch": 2.725165562913907, "grad_norm": 0.8622139876781421, "learning_rate": 0.00023832970935138984, "loss": 2.2344, "step": 4115 }, { "epoch": 2.7258278145695365, "grad_norm": 0.7317501265036545, "learning_rate": 0.00023827575299159885, "loss": 1.8281, "step": 4116 }, { "epoch": 2.7264900662251654, "grad_norm": 0.7808965905253575, "learning_rate": 0.00023822177915219675, "loss": 2.2344, "step": 4117 }, { "epoch": 2.727152317880795, "grad_norm": 0.8252352185844266, "learning_rate": 0.00023816778784387094, "loss": 2.3125, "step": 4118 }, { "epoch": 2.7278145695364238, "grad_norm": 0.8291781239868556, "learning_rate": 0.00023811377907731235, "loss": 2.2812, "step": 4119 }, { "epoch": 2.7284768211920527, "grad_norm": 0.8874748026170409, "learning_rate": 0.0002380597528632153, "loss": 2.4062, "step": 4120 }, { "epoch": 2.729139072847682, "grad_norm": 0.6895970878053972, "learning_rate": 0.0002380057092122776, "loss": 1.6328, "step": 4121 }, { "epoch": 2.7298013245033115, "grad_norm": 0.7610437522268005, "learning_rate": 0.00023795164813520054, "loss": 2.0938, "step": 4122 }, { "epoch": 2.7304635761589404, "grad_norm": 0.8267448102852251, "learning_rate": 0.00023789756964268875, "loss": 2.3125, "step": 4123 }, { "epoch": 2.7311258278145694, "grad_norm": 0.8067400875975469, "learning_rate": 0.00023784347374545056, "loss": 2.2031, "step": 4124 }, { "epoch": 2.7317880794701987, "grad_norm": 0.7904023176339622, "learning_rate": 0.00023778936045419738, "loss": 2.125, "step": 4125 }, { "epoch": 2.7324503311258277, "grad_norm": 0.800327730970933, "learning_rate": 0.00023773522977964446, "loss": 2.2969, "step": 4126 }, { "epoch": 2.733112582781457, "grad_norm": 0.7622801319974138, "learning_rate": 0.0002376810817325101, "loss": 2.0938, "step": 4127 }, { "epoch": 2.733774834437086, "grad_norm": 0.7460122558919458, "learning_rate": 0.00023762691632351636, "loss": 1.7969, "step": 4128 }, { "epoch": 2.734437086092715, "grad_norm": 0.7928885464972585, "learning_rate": 0.0002375727335633886, "loss": 2.0156, "step": 4129 }, { "epoch": 2.7350993377483444, "grad_norm": 0.8190237958611799, "learning_rate": 0.00023751853346285558, "loss": 2.1094, "step": 4130 }, { "epoch": 2.7357615894039737, "grad_norm": 0.7659991012713004, "learning_rate": 0.00023746431603264954, "loss": 2.0625, "step": 4131 }, { "epoch": 2.7364238410596027, "grad_norm": 0.8855012206324189, "learning_rate": 0.00023741008128350626, "loss": 2.1719, "step": 4132 }, { "epoch": 2.7370860927152316, "grad_norm": 0.8167691295429508, "learning_rate": 0.00023735582922616472, "loss": 2.1094, "step": 4133 }, { "epoch": 2.737748344370861, "grad_norm": 0.8057211479065577, "learning_rate": 0.00023730155987136743, "loss": 2.1406, "step": 4134 }, { "epoch": 2.73841059602649, "grad_norm": 0.7788521888088263, "learning_rate": 0.00023724727322986048, "loss": 2.0312, "step": 4135 }, { "epoch": 2.7390728476821193, "grad_norm": 0.8019006083722068, "learning_rate": 0.00023719296931239316, "loss": 2.25, "step": 4136 }, { "epoch": 2.7397350993377483, "grad_norm": 0.7612376619734238, "learning_rate": 0.00023713864812971826, "loss": 2.1094, "step": 4137 }, { "epoch": 2.7403973509933772, "grad_norm": 0.7900531342585587, "learning_rate": 0.00023708430969259206, "loss": 1.9766, "step": 4138 }, { "epoch": 2.7410596026490066, "grad_norm": 0.8022937409151758, "learning_rate": 0.00023702995401177413, "loss": 2.2812, "step": 4139 }, { "epoch": 2.741721854304636, "grad_norm": 0.7681654125560227, "learning_rate": 0.00023697558109802753, "loss": 1.9766, "step": 4140 }, { "epoch": 2.742384105960265, "grad_norm": 0.8203828712682121, "learning_rate": 0.00023692119096211872, "loss": 2.4062, "step": 4141 }, { "epoch": 2.743046357615894, "grad_norm": 0.7952976963611031, "learning_rate": 0.0002368667836148176, "loss": 1.9844, "step": 4142 }, { "epoch": 2.7437086092715233, "grad_norm": 0.7979788039837987, "learning_rate": 0.00023681235906689743, "loss": 2.0625, "step": 4143 }, { "epoch": 2.744370860927152, "grad_norm": 0.7770300980122986, "learning_rate": 0.00023675791732913492, "loss": 2.125, "step": 4144 }, { "epoch": 2.7450331125827816, "grad_norm": 0.825271057452493, "learning_rate": 0.0002367034584123101, "loss": 2.1875, "step": 4145 }, { "epoch": 2.7456953642384105, "grad_norm": 0.8546449864986183, "learning_rate": 0.00023664898232720645, "loss": 2.2344, "step": 4146 }, { "epoch": 2.7463576158940395, "grad_norm": 0.9085639667267498, "learning_rate": 0.00023659448908461096, "loss": 2.2969, "step": 4147 }, { "epoch": 2.747019867549669, "grad_norm": 0.8099066864178545, "learning_rate": 0.00023653997869531383, "loss": 2.1406, "step": 4148 }, { "epoch": 2.7476821192052983, "grad_norm": 0.7199504634925541, "learning_rate": 0.00023648545117010882, "loss": 2.0938, "step": 4149 }, { "epoch": 2.748344370860927, "grad_norm": 0.8373375348999462, "learning_rate": 0.0002364309065197929, "loss": 2.125, "step": 4150 }, { "epoch": 2.749006622516556, "grad_norm": 0.8153055606046666, "learning_rate": 0.0002363763447551666, "loss": 2.0156, "step": 4151 }, { "epoch": 2.7496688741721855, "grad_norm": 0.6951663824441948, "learning_rate": 0.0002363217658870338, "loss": 1.5703, "step": 4152 }, { "epoch": 2.7503311258278145, "grad_norm": 0.7829422799569432, "learning_rate": 0.00023626716992620166, "loss": 2.0469, "step": 4153 }, { "epoch": 2.750993377483444, "grad_norm": 0.8518814257582603, "learning_rate": 0.0002362125568834809, "loss": 2.1094, "step": 4154 }, { "epoch": 2.751655629139073, "grad_norm": 0.8401338197097309, "learning_rate": 0.00023615792676968546, "loss": 2.1094, "step": 4155 }, { "epoch": 2.7523178807947017, "grad_norm": 0.731568523519124, "learning_rate": 0.00023610327959563276, "loss": 1.7188, "step": 4156 }, { "epoch": 2.752980132450331, "grad_norm": 0.7103965737552937, "learning_rate": 0.00023604861537214353, "loss": 1.7734, "step": 4157 }, { "epoch": 2.7536423841059605, "grad_norm": 0.7834559987829549, "learning_rate": 0.00023599393411004196, "loss": 2.125, "step": 4158 }, { "epoch": 2.7543046357615895, "grad_norm": 0.7904140165785916, "learning_rate": 0.00023593923582015554, "loss": 2.125, "step": 4159 }, { "epoch": 2.7549668874172184, "grad_norm": 0.8115993772466429, "learning_rate": 0.00023588452051331516, "loss": 2.2812, "step": 4160 }, { "epoch": 2.755629139072848, "grad_norm": 0.8571107663303452, "learning_rate": 0.0002358297882003551, "loss": 2.3125, "step": 4161 }, { "epoch": 2.7562913907284767, "grad_norm": 0.7036030250550355, "learning_rate": 0.00023577503889211294, "loss": 1.6172, "step": 4162 }, { "epoch": 2.756953642384106, "grad_norm": 0.7058883361115429, "learning_rate": 0.00023572027259942976, "loss": 1.6328, "step": 4163 }, { "epoch": 2.757615894039735, "grad_norm": 0.7823379611647142, "learning_rate": 0.00023566548933314985, "loss": 2.0156, "step": 4164 }, { "epoch": 2.758278145695364, "grad_norm": 0.805409006269033, "learning_rate": 0.00023561068910412088, "loss": 2.0781, "step": 4165 }, { "epoch": 2.7589403973509934, "grad_norm": 0.8134100214667909, "learning_rate": 0.00023555587192319397, "loss": 2.3125, "step": 4166 }, { "epoch": 2.7596026490066228, "grad_norm": 0.8046440179290408, "learning_rate": 0.00023550103780122357, "loss": 2.0312, "step": 4167 }, { "epoch": 2.7602649006622517, "grad_norm": 0.7702042433966213, "learning_rate": 0.00023544618674906749, "loss": 2.1875, "step": 4168 }, { "epoch": 2.7609271523178807, "grad_norm": 0.7530132771734991, "learning_rate": 0.00023539131877758676, "loss": 2.0312, "step": 4169 }, { "epoch": 2.76158940397351, "grad_norm": 0.7976650708330737, "learning_rate": 0.00023533643389764595, "loss": 2.2812, "step": 4170 }, { "epoch": 2.762251655629139, "grad_norm": 0.84395738741019, "learning_rate": 0.0002352815321201129, "loss": 2.0625, "step": 4171 }, { "epoch": 2.7629139072847684, "grad_norm": 0.8406471270033072, "learning_rate": 0.00023522661345585876, "loss": 2.2812, "step": 4172 }, { "epoch": 2.7635761589403973, "grad_norm": 0.7794261055320769, "learning_rate": 0.00023517167791575806, "loss": 2.25, "step": 4173 }, { "epoch": 2.7642384105960263, "grad_norm": 0.8372643972903571, "learning_rate": 0.0002351167255106887, "loss": 2.1406, "step": 4174 }, { "epoch": 2.7649006622516556, "grad_norm": 0.8290953114904558, "learning_rate": 0.0002350617562515318, "loss": 2.1719, "step": 4175 }, { "epoch": 2.765562913907285, "grad_norm": 0.6910806801093337, "learning_rate": 0.000235006770149172, "loss": 1.6328, "step": 4176 }, { "epoch": 2.766225165562914, "grad_norm": 0.8245155037960146, "learning_rate": 0.00023495176721449715, "loss": 2.2969, "step": 4177 }, { "epoch": 2.766887417218543, "grad_norm": 0.7120429958507726, "learning_rate": 0.00023489674745839843, "loss": 1.7734, "step": 4178 }, { "epoch": 2.7675496688741723, "grad_norm": 0.7986479086853632, "learning_rate": 0.00023484171089177043, "loss": 2.25, "step": 4179 }, { "epoch": 2.7682119205298013, "grad_norm": 0.8412779935967443, "learning_rate": 0.000234786657525511, "loss": 2.3906, "step": 4180 }, { "epoch": 2.7688741721854306, "grad_norm": 0.820590249860559, "learning_rate": 0.00023473158737052127, "loss": 2.3438, "step": 4181 }, { "epoch": 2.7695364238410596, "grad_norm": 0.7755238200801939, "learning_rate": 0.00023467650043770586, "loss": 2.2969, "step": 4182 }, { "epoch": 2.7701986754966885, "grad_norm": 0.7949863308464199, "learning_rate": 0.0002346213967379726, "loss": 2.3125, "step": 4183 }, { "epoch": 2.770860927152318, "grad_norm": 0.7354806776657277, "learning_rate": 0.0002345662762822326, "loss": 2.2969, "step": 4184 }, { "epoch": 2.7715231788079473, "grad_norm": 0.7888059961844744, "learning_rate": 0.0002345111390814004, "loss": 2.3438, "step": 4185 }, { "epoch": 2.7721854304635762, "grad_norm": 0.7944500906467029, "learning_rate": 0.00023445598514639377, "loss": 2.0938, "step": 4186 }, { "epoch": 2.772847682119205, "grad_norm": 0.8131129034010449, "learning_rate": 0.00023440081448813382, "loss": 2.2969, "step": 4187 }, { "epoch": 2.7735099337748346, "grad_norm": 0.7092598273813782, "learning_rate": 0.00023434562711754494, "loss": 1.9688, "step": 4188 }, { "epoch": 2.7741721854304635, "grad_norm": 0.7843541153039585, "learning_rate": 0.00023429042304555495, "loss": 2.0938, "step": 4189 }, { "epoch": 2.774834437086093, "grad_norm": 0.7826794456698766, "learning_rate": 0.00023423520228309483, "loss": 1.7422, "step": 4190 }, { "epoch": 2.775496688741722, "grad_norm": 0.7961359691973965, "learning_rate": 0.0002341799648410989, "loss": 2.2031, "step": 4191 }, { "epoch": 2.776158940397351, "grad_norm": 0.748431233819658, "learning_rate": 0.00023412471073050485, "loss": 2.3125, "step": 4192 }, { "epoch": 2.77682119205298, "grad_norm": 0.7205158254930283, "learning_rate": 0.00023406943996225367, "loss": 1.9375, "step": 4193 }, { "epoch": 2.777483443708609, "grad_norm": 0.7511280263729264, "learning_rate": 0.00023401415254728952, "loss": 2.2188, "step": 4194 }, { "epoch": 2.7781456953642385, "grad_norm": 0.7612973725551422, "learning_rate": 0.00023395884849655992, "loss": 2.0625, "step": 4195 }, { "epoch": 2.7788079470198674, "grad_norm": 0.7559634877051492, "learning_rate": 0.00023390352782101586, "loss": 2.2031, "step": 4196 }, { "epoch": 2.779470198675497, "grad_norm": 0.7534210084281672, "learning_rate": 0.0002338481905316113, "loss": 2.1562, "step": 4197 }, { "epoch": 2.7801324503311258, "grad_norm": 0.7440041169058349, "learning_rate": 0.00023379283663930374, "loss": 1.9453, "step": 4198 }, { "epoch": 2.780794701986755, "grad_norm": 0.7814918334930664, "learning_rate": 0.00023373746615505388, "loss": 2.375, "step": 4199 }, { "epoch": 2.781456953642384, "grad_norm": 0.7219610379705318, "learning_rate": 0.00023368207908982573, "loss": 2.0781, "step": 4200 }, { "epoch": 2.782119205298013, "grad_norm": 0.751270972980044, "learning_rate": 0.00023362667545458646, "loss": 1.8594, "step": 4201 }, { "epoch": 2.7827814569536424, "grad_norm": 0.9004341400571496, "learning_rate": 0.0002335712552603068, "loss": 2.2812, "step": 4202 }, { "epoch": 2.7834437086092714, "grad_norm": 0.8119038622886741, "learning_rate": 0.00023351581851796042, "loss": 2.2812, "step": 4203 }, { "epoch": 2.7841059602649008, "grad_norm": 0.8298367742046188, "learning_rate": 0.00023346036523852446, "loss": 2.2344, "step": 4204 }, { "epoch": 2.7847682119205297, "grad_norm": 0.7239688367435931, "learning_rate": 0.00023340489543297944, "loss": 2.0156, "step": 4205 }, { "epoch": 2.785430463576159, "grad_norm": 0.790849640489346, "learning_rate": 0.00023334940911230885, "loss": 2.1094, "step": 4206 }, { "epoch": 2.786092715231788, "grad_norm": 0.7634660143756492, "learning_rate": 0.00023329390628749972, "loss": 1.8672, "step": 4207 }, { "epoch": 2.7867549668874174, "grad_norm": 0.7258488312771718, "learning_rate": 0.0002332383869695422, "loss": 1.8438, "step": 4208 }, { "epoch": 2.7874172185430464, "grad_norm": 0.7432328433894279, "learning_rate": 0.00023318285116942977, "loss": 1.7188, "step": 4209 }, { "epoch": 2.7880794701986753, "grad_norm": 0.6934020614599951, "learning_rate": 0.00023312729889815912, "loss": 1.8984, "step": 4210 }, { "epoch": 2.7887417218543047, "grad_norm": 0.8702525910495136, "learning_rate": 0.00023307173016673028, "loss": 2.1562, "step": 4211 }, { "epoch": 2.7894039735099336, "grad_norm": 0.8630892164784327, "learning_rate": 0.0002330161449861465, "loss": 2.0625, "step": 4212 }, { "epoch": 2.790066225165563, "grad_norm": 0.8658686835071944, "learning_rate": 0.00023296054336741423, "loss": 2.3906, "step": 4213 }, { "epoch": 2.790728476821192, "grad_norm": 0.8193324912709612, "learning_rate": 0.00023290492532154327, "loss": 2.4688, "step": 4214 }, { "epoch": 2.7913907284768213, "grad_norm": 0.8372812840759414, "learning_rate": 0.00023284929085954663, "loss": 2.3281, "step": 4215 }, { "epoch": 2.7920529801324503, "grad_norm": 0.8040277739781813, "learning_rate": 0.00023279363999244057, "loss": 2.375, "step": 4216 }, { "epoch": 2.7927152317880797, "grad_norm": 0.8025280096389449, "learning_rate": 0.0002327379727312446, "loss": 2.0469, "step": 4217 }, { "epoch": 2.7933774834437086, "grad_norm": 0.7568735397504575, "learning_rate": 0.00023268228908698144, "loss": 2.25, "step": 4218 }, { "epoch": 2.7940397350993376, "grad_norm": 0.8232854899606435, "learning_rate": 0.00023262658907067716, "loss": 2.2344, "step": 4219 }, { "epoch": 2.794701986754967, "grad_norm": 0.7832514849770001, "learning_rate": 0.0002325708726933609, "loss": 2.4219, "step": 4220 }, { "epoch": 2.795364238410596, "grad_norm": 0.7821770340460427, "learning_rate": 0.00023251513996606525, "loss": 1.9844, "step": 4221 }, { "epoch": 2.7960264900662253, "grad_norm": 0.7492770406860467, "learning_rate": 0.00023245939089982593, "loss": 2.0469, "step": 4222 }, { "epoch": 2.796688741721854, "grad_norm": 0.7023535552829512, "learning_rate": 0.00023240362550568176, "loss": 1.8516, "step": 4223 }, { "epoch": 2.7973509933774836, "grad_norm": 0.7435860054478869, "learning_rate": 0.00023234784379467502, "loss": 2.0469, "step": 4224 }, { "epoch": 2.7980132450331126, "grad_norm": 0.7292393886325331, "learning_rate": 0.00023229204577785117, "loss": 1.7812, "step": 4225 }, { "epoch": 2.798675496688742, "grad_norm": 0.7764089202843507, "learning_rate": 0.0002322362314662588, "loss": 2.2188, "step": 4226 }, { "epoch": 2.799337748344371, "grad_norm": 0.804420319958684, "learning_rate": 0.00023218040087094973, "loss": 2.1406, "step": 4227 }, { "epoch": 2.8, "grad_norm": 0.8154269358915601, "learning_rate": 0.00023212455400297918, "loss": 2.3281, "step": 4228 }, { "epoch": 2.800662251655629, "grad_norm": 0.7926866844395691, "learning_rate": 0.00023206869087340537, "loss": 2.0312, "step": 4229 }, { "epoch": 2.801324503311258, "grad_norm": 0.7730473252715683, "learning_rate": 0.0002320128114932899, "loss": 1.9219, "step": 4230 }, { "epoch": 2.8019867549668875, "grad_norm": 0.8574153571066321, "learning_rate": 0.00023195691587369753, "loss": 2.3281, "step": 4231 }, { "epoch": 2.8026490066225165, "grad_norm": 0.7965322680777257, "learning_rate": 0.00023190100402569621, "loss": 2.125, "step": 4232 }, { "epoch": 2.8033112582781454, "grad_norm": 0.7403512501193728, "learning_rate": 0.00023184507596035706, "loss": 2.125, "step": 4233 }, { "epoch": 2.803973509933775, "grad_norm": 0.7262448849710594, "learning_rate": 0.0002317891316887546, "loss": 2.0469, "step": 4234 }, { "epoch": 2.804635761589404, "grad_norm": 0.796876405321901, "learning_rate": 0.0002317331712219664, "loss": 2.2344, "step": 4235 }, { "epoch": 2.805298013245033, "grad_norm": 0.8038449890552815, "learning_rate": 0.00023167719457107327, "loss": 2.0, "step": 4236 }, { "epoch": 2.805960264900662, "grad_norm": 0.8296033059326662, "learning_rate": 0.00023162120174715918, "loss": 2.1094, "step": 4237 }, { "epoch": 2.8066225165562915, "grad_norm": 0.8307762074808884, "learning_rate": 0.00023156519276131137, "loss": 2.2031, "step": 4238 }, { "epoch": 2.8072847682119204, "grad_norm": 0.8177472708275646, "learning_rate": 0.00023150916762462032, "loss": 2.2344, "step": 4239 }, { "epoch": 2.80794701986755, "grad_norm": 0.7533977738077938, "learning_rate": 0.00023145312634817957, "loss": 1.7734, "step": 4240 }, { "epoch": 2.8086092715231787, "grad_norm": 0.7999699882788476, "learning_rate": 0.000231397068943086, "loss": 2.375, "step": 4241 }, { "epoch": 2.8092715231788077, "grad_norm": 0.8064547779666964, "learning_rate": 0.0002313409954204396, "loss": 2.5469, "step": 4242 }, { "epoch": 2.809933774834437, "grad_norm": 0.7755315750614865, "learning_rate": 0.00023128490579134352, "loss": 2.0469, "step": 4243 }, { "epoch": 2.8105960264900665, "grad_norm": 0.7084488353709306, "learning_rate": 0.00023122880006690424, "loss": 1.7344, "step": 4244 }, { "epoch": 2.8112582781456954, "grad_norm": 0.7947730780597543, "learning_rate": 0.0002311726782582313, "loss": 2.1406, "step": 4245 }, { "epoch": 2.8119205298013243, "grad_norm": 0.8110006656971359, "learning_rate": 0.0002311165403764374, "loss": 2.4062, "step": 4246 }, { "epoch": 2.8125827814569537, "grad_norm": 0.809472967421911, "learning_rate": 0.00023106038643263855, "loss": 2.3594, "step": 4247 }, { "epoch": 2.8132450331125827, "grad_norm": 0.7726614726711921, "learning_rate": 0.00023100421643795384, "loss": 2.1719, "step": 4248 }, { "epoch": 2.813907284768212, "grad_norm": 0.7781945326326388, "learning_rate": 0.0002309480304035056, "loss": 1.6953, "step": 4249 }, { "epoch": 2.814569536423841, "grad_norm": 0.7463990337871235, "learning_rate": 0.00023089182834041934, "loss": 1.7734, "step": 4250 }, { "epoch": 2.81523178807947, "grad_norm": 0.8109852964760289, "learning_rate": 0.00023083561025982367, "loss": 2.0938, "step": 4251 }, { "epoch": 2.8158940397350993, "grad_norm": 0.7577545593568166, "learning_rate": 0.0002307793761728504, "loss": 1.9766, "step": 4252 }, { "epoch": 2.8165562913907287, "grad_norm": 0.8677207931541057, "learning_rate": 0.00023072312609063455, "loss": 2.2344, "step": 4253 }, { "epoch": 2.8172185430463577, "grad_norm": 0.7908274967945879, "learning_rate": 0.00023066686002431422, "loss": 2.2031, "step": 4254 }, { "epoch": 2.8178807947019866, "grad_norm": 0.7375562409068035, "learning_rate": 0.00023061057798503087, "loss": 1.7656, "step": 4255 }, { "epoch": 2.818543046357616, "grad_norm": 0.8453015759524933, "learning_rate": 0.0002305542799839289, "loss": 2.1719, "step": 4256 }, { "epoch": 2.819205298013245, "grad_norm": 0.7843556437360832, "learning_rate": 0.00023049796603215593, "loss": 1.9922, "step": 4257 }, { "epoch": 2.8198675496688743, "grad_norm": 0.901672951311254, "learning_rate": 0.00023044163614086276, "loss": 2.3906, "step": 4258 }, { "epoch": 2.8205298013245033, "grad_norm": 0.8081337320036998, "learning_rate": 0.0002303852903212035, "loss": 2.3438, "step": 4259 }, { "epoch": 2.821192052980132, "grad_norm": 0.749074456780704, "learning_rate": 0.00023032892858433512, "loss": 2.0781, "step": 4260 }, { "epoch": 2.8218543046357616, "grad_norm": 0.7442823720580068, "learning_rate": 0.00023027255094141796, "loss": 2.0938, "step": 4261 }, { "epoch": 2.822516556291391, "grad_norm": 0.7446990067807711, "learning_rate": 0.00023021615740361542, "loss": 2.0312, "step": 4262 }, { "epoch": 2.82317880794702, "grad_norm": 0.748039169482607, "learning_rate": 0.00023015974798209406, "loss": 2.0625, "step": 4263 }, { "epoch": 2.823841059602649, "grad_norm": 0.772051645178878, "learning_rate": 0.0002301033226880236, "loss": 2.1875, "step": 4264 }, { "epoch": 2.8245033112582782, "grad_norm": 0.8062786589231989, "learning_rate": 0.00023004688153257686, "loss": 2.3594, "step": 4265 }, { "epoch": 2.825165562913907, "grad_norm": 0.7238089584695271, "learning_rate": 0.0002299904245269299, "loss": 2.0938, "step": 4266 }, { "epoch": 2.8258278145695366, "grad_norm": 0.7901511011357996, "learning_rate": 0.00022993395168226181, "loss": 2.2656, "step": 4267 }, { "epoch": 2.8264900662251655, "grad_norm": 0.7569493336175531, "learning_rate": 0.00022987746300975488, "loss": 2.1875, "step": 4268 }, { "epoch": 2.8271523178807945, "grad_norm": 0.7019069743014565, "learning_rate": 0.00022982095852059454, "loss": 1.9141, "step": 4269 }, { "epoch": 2.827814569536424, "grad_norm": 0.754147260268682, "learning_rate": 0.00022976443822596927, "loss": 2.1094, "step": 4270 }, { "epoch": 2.8284768211920532, "grad_norm": 0.7727647126383128, "learning_rate": 0.00022970790213707075, "loss": 2.1562, "step": 4271 }, { "epoch": 2.829139072847682, "grad_norm": 0.7663979618079061, "learning_rate": 0.0002296513502650938, "loss": 2.2188, "step": 4272 }, { "epoch": 2.829801324503311, "grad_norm": 0.7963869587440184, "learning_rate": 0.00022959478262123627, "loss": 2.4062, "step": 4273 }, { "epoch": 2.8304635761589405, "grad_norm": 0.7863170127716259, "learning_rate": 0.00022953819921669934, "loss": 2.1562, "step": 4274 }, { "epoch": 2.8311258278145695, "grad_norm": 0.7436850672127658, "learning_rate": 0.00022948160006268703, "loss": 2.0, "step": 4275 }, { "epoch": 2.831788079470199, "grad_norm": 0.8183350955159097, "learning_rate": 0.0002294249851704067, "loss": 2.3594, "step": 4276 }, { "epoch": 2.832450331125828, "grad_norm": 0.7268090199507281, "learning_rate": 0.00022936835455106876, "loss": 2.0469, "step": 4277 }, { "epoch": 2.8331125827814567, "grad_norm": 0.7554174818230153, "learning_rate": 0.00022931170821588665, "loss": 2.2031, "step": 4278 }, { "epoch": 2.833774834437086, "grad_norm": 0.8212927948745228, "learning_rate": 0.00022925504617607703, "loss": 2.4219, "step": 4279 }, { "epoch": 2.8344370860927155, "grad_norm": 0.8338430484338789, "learning_rate": 0.0002291983684428597, "loss": 2.1875, "step": 4280 }, { "epoch": 2.8350993377483444, "grad_norm": 0.8443267893319859, "learning_rate": 0.00022914167502745736, "loss": 2.0781, "step": 4281 }, { "epoch": 2.8357615894039734, "grad_norm": 0.7461390764597494, "learning_rate": 0.00022908496594109608, "loss": 2.0312, "step": 4282 }, { "epoch": 2.8364238410596028, "grad_norm": 0.847820641004548, "learning_rate": 0.00022902824119500485, "loss": 2.1875, "step": 4283 }, { "epoch": 2.8370860927152317, "grad_norm": 0.7877234222825876, "learning_rate": 0.00022897150080041585, "loss": 2.2188, "step": 4284 }, { "epoch": 2.837748344370861, "grad_norm": 0.7795510068562231, "learning_rate": 0.00022891474476856427, "loss": 2.1406, "step": 4285 }, { "epoch": 2.83841059602649, "grad_norm": 0.6357193554690472, "learning_rate": 0.00022885797311068855, "loss": 1.6406, "step": 4286 }, { "epoch": 2.839072847682119, "grad_norm": 2.7534272466246597, "learning_rate": 0.00022880118583803004, "loss": 2.0781, "step": 4287 }, { "epoch": 2.8397350993377484, "grad_norm": 0.7569681892451441, "learning_rate": 0.0002287443829618333, "loss": 1.6562, "step": 4288 }, { "epoch": 2.8403973509933778, "grad_norm": 0.8361652217296667, "learning_rate": 0.00022868756449334602, "loss": 2.0938, "step": 4289 }, { "epoch": 2.8410596026490067, "grad_norm": 0.6898048692278205, "learning_rate": 0.0002286307304438188, "loss": 1.7031, "step": 4290 }, { "epoch": 2.8417218543046356, "grad_norm": 0.7978115771443376, "learning_rate": 0.00022857388082450546, "loss": 2.2656, "step": 4291 }, { "epoch": 2.842384105960265, "grad_norm": 0.815597262414709, "learning_rate": 0.00022851701564666297, "loss": 2.1719, "step": 4292 }, { "epoch": 2.843046357615894, "grad_norm": 0.7722317800372288, "learning_rate": 0.00022846013492155116, "loss": 2.1406, "step": 4293 }, { "epoch": 2.8437086092715234, "grad_norm": 0.8040546933072937, "learning_rate": 0.00022840323866043315, "loss": 2.4375, "step": 4294 }, { "epoch": 2.8443708609271523, "grad_norm": 0.8414941405427296, "learning_rate": 0.000228346326874575, "loss": 2.2344, "step": 4295 }, { "epoch": 2.8450331125827812, "grad_norm": 0.7571066148674479, "learning_rate": 0.0002282893995752459, "loss": 2.1562, "step": 4296 }, { "epoch": 2.8456953642384106, "grad_norm": 0.8581498371455727, "learning_rate": 0.00022823245677371817, "loss": 2.4062, "step": 4297 }, { "epoch": 2.8463576158940396, "grad_norm": 0.7389811800639423, "learning_rate": 0.00022817549848126704, "loss": 2.1406, "step": 4298 }, { "epoch": 2.847019867549669, "grad_norm": 0.7816227614483723, "learning_rate": 0.000228118524709171, "loss": 2.2812, "step": 4299 }, { "epoch": 2.847682119205298, "grad_norm": 0.779748630393211, "learning_rate": 0.00022806153546871141, "loss": 2.2656, "step": 4300 }, { "epoch": 2.8483443708609273, "grad_norm": 0.8140458559140917, "learning_rate": 0.0002280045307711729, "loss": 2.2812, "step": 4301 }, { "epoch": 2.8490066225165562, "grad_norm": 0.7533871208436292, "learning_rate": 0.00022794751062784298, "loss": 2.2812, "step": 4302 }, { "epoch": 2.8496688741721856, "grad_norm": 0.8561658084528757, "learning_rate": 0.0002278904750500124, "loss": 2.2969, "step": 4303 }, { "epoch": 2.8503311258278146, "grad_norm": 0.9174690832625928, "learning_rate": 0.00022783342404897466, "loss": 2.4062, "step": 4304 }, { "epoch": 2.8509933774834435, "grad_norm": 0.7892232529139972, "learning_rate": 0.00022777635763602668, "loss": 2.125, "step": 4305 }, { "epoch": 2.851655629139073, "grad_norm": 0.7830108639125648, "learning_rate": 0.0002277192758224682, "loss": 2.1562, "step": 4306 }, { "epoch": 2.852317880794702, "grad_norm": 0.8714669452056019, "learning_rate": 0.0002276621786196021, "loss": 2.1562, "step": 4307 }, { "epoch": 2.852980132450331, "grad_norm": 0.8183496932179902, "learning_rate": 0.00022760506603873424, "loss": 2.4688, "step": 4308 }, { "epoch": 2.85364238410596, "grad_norm": 0.8637671451819452, "learning_rate": 0.00022754793809117362, "loss": 2.375, "step": 4309 }, { "epoch": 2.8543046357615895, "grad_norm": 0.7191367872969532, "learning_rate": 0.0002274907947882322, "loss": 2.0312, "step": 4310 }, { "epoch": 2.8549668874172185, "grad_norm": 0.9191924080651089, "learning_rate": 0.00022743363614122496, "loss": 2.2969, "step": 4311 }, { "epoch": 2.855629139072848, "grad_norm": 0.8355350713643267, "learning_rate": 0.0002273764621614701, "loss": 2.2656, "step": 4312 }, { "epoch": 2.856291390728477, "grad_norm": 0.8078895178755016, "learning_rate": 0.0002273192728602886, "loss": 2.4062, "step": 4313 }, { "epoch": 2.8569536423841058, "grad_norm": 0.7620700589753664, "learning_rate": 0.00022726206824900459, "loss": 2.1562, "step": 4314 }, { "epoch": 2.857615894039735, "grad_norm": 0.7995824727746912, "learning_rate": 0.00022720484833894534, "loss": 2.0156, "step": 4315 }, { "epoch": 2.858278145695364, "grad_norm": 0.7845277744047192, "learning_rate": 0.00022714761314144094, "loss": 2.1719, "step": 4316 }, { "epoch": 2.8589403973509935, "grad_norm": 0.7322900159291628, "learning_rate": 0.00022709036266782472, "loss": 2.125, "step": 4317 }, { "epoch": 2.8596026490066224, "grad_norm": 0.7694931190787067, "learning_rate": 0.0002270330969294329, "loss": 2.0625, "step": 4318 }, { "epoch": 2.860264900662252, "grad_norm": 0.8118855161875603, "learning_rate": 0.00022697581593760467, "loss": 2.0938, "step": 4319 }, { "epoch": 2.8609271523178808, "grad_norm": 0.8222520650845837, "learning_rate": 0.00022691851970368243, "loss": 2.1875, "step": 4320 }, { "epoch": 2.86158940397351, "grad_norm": 0.7920715466540987, "learning_rate": 0.00022686120823901148, "loss": 2.0156, "step": 4321 }, { "epoch": 2.862251655629139, "grad_norm": 0.7375070520555159, "learning_rate": 0.00022680388155494005, "loss": 2.0938, "step": 4322 }, { "epoch": 2.862913907284768, "grad_norm": 0.7413996166535582, "learning_rate": 0.00022674653966281956, "loss": 1.8672, "step": 4323 }, { "epoch": 2.8635761589403974, "grad_norm": 0.8045071123117419, "learning_rate": 0.00022668918257400436, "loss": 2.0938, "step": 4324 }, { "epoch": 2.8642384105960264, "grad_norm": 0.7838641917039068, "learning_rate": 0.00022663181029985175, "loss": 2.1094, "step": 4325 }, { "epoch": 2.8649006622516557, "grad_norm": 0.7712171496477863, "learning_rate": 0.0002265744228517222, "loss": 2.0469, "step": 4326 }, { "epoch": 2.8655629139072847, "grad_norm": 0.8891649412909731, "learning_rate": 0.00022651702024097903, "loss": 2.3594, "step": 4327 }, { "epoch": 2.866225165562914, "grad_norm": 0.80274163503277, "learning_rate": 0.00022645960247898858, "loss": 2.2031, "step": 4328 }, { "epoch": 2.866887417218543, "grad_norm": 0.765788969812281, "learning_rate": 0.00022640216957712025, "loss": 1.8594, "step": 4329 }, { "epoch": 2.8675496688741724, "grad_norm": 0.7909543757650036, "learning_rate": 0.00022634472154674646, "loss": 2.0625, "step": 4330 }, { "epoch": 2.8682119205298013, "grad_norm": 0.7820752640135349, "learning_rate": 0.0002262872583992425, "loss": 2.2031, "step": 4331 }, { "epoch": 2.8688741721854303, "grad_norm": 0.7748591209060456, "learning_rate": 0.00022622978014598684, "loss": 2.0312, "step": 4332 }, { "epoch": 2.8695364238410597, "grad_norm": 0.7784011873118347, "learning_rate": 0.00022617228679836068, "loss": 1.9766, "step": 4333 }, { "epoch": 2.8701986754966886, "grad_norm": 0.7875525626763957, "learning_rate": 0.0002261147783677485, "loss": 2.2656, "step": 4334 }, { "epoch": 2.870860927152318, "grad_norm": 0.6849342536124694, "learning_rate": 0.00022605725486553758, "loss": 1.8594, "step": 4335 }, { "epoch": 2.871523178807947, "grad_norm": 0.7565642297294322, "learning_rate": 0.00022599971630311822, "loss": 2.0312, "step": 4336 }, { "epoch": 2.872185430463576, "grad_norm": 0.6705695012665237, "learning_rate": 0.00022594216269188365, "loss": 1.7422, "step": 4337 }, { "epoch": 2.8728476821192053, "grad_norm": 0.8470494697029474, "learning_rate": 0.0002258845940432303, "loss": 2.4062, "step": 4338 }, { "epoch": 2.8735099337748347, "grad_norm": 0.764785123387304, "learning_rate": 0.00022582701036855734, "loss": 2.1406, "step": 4339 }, { "epoch": 2.8741721854304636, "grad_norm": 0.7943550482459245, "learning_rate": 0.00022576941167926697, "loss": 2.1719, "step": 4340 }, { "epoch": 2.8748344370860925, "grad_norm": 0.7530760275827229, "learning_rate": 0.00022571179798676444, "loss": 1.9922, "step": 4341 }, { "epoch": 2.875496688741722, "grad_norm": 0.730586421804568, "learning_rate": 0.00022565416930245792, "loss": 1.7109, "step": 4342 }, { "epoch": 2.876158940397351, "grad_norm": 0.8256585877735351, "learning_rate": 0.00022559652563775847, "loss": 2.125, "step": 4343 }, { "epoch": 2.8768211920529803, "grad_norm": 0.7843769049176272, "learning_rate": 0.0002255388670040803, "loss": 2.0469, "step": 4344 }, { "epoch": 2.877483443708609, "grad_norm": 0.7322742060131598, "learning_rate": 0.0002254811934128404, "loss": 2.1406, "step": 4345 }, { "epoch": 2.878145695364238, "grad_norm": 0.8760053975187727, "learning_rate": 0.00022542350487545892, "loss": 2.1875, "step": 4346 }, { "epoch": 2.8788079470198675, "grad_norm": 0.7857875978118242, "learning_rate": 0.0002253658014033587, "loss": 2.1875, "step": 4347 }, { "epoch": 2.879470198675497, "grad_norm": 0.7911363324108679, "learning_rate": 0.0002253080830079658, "loss": 2.1562, "step": 4348 }, { "epoch": 2.880132450331126, "grad_norm": 0.7964718063339266, "learning_rate": 0.00022525034970070906, "loss": 1.7031, "step": 4349 }, { "epoch": 2.880794701986755, "grad_norm": 0.736117924383515, "learning_rate": 0.00022519260149302038, "loss": 2.1094, "step": 4350 }, { "epoch": 2.881456953642384, "grad_norm": 0.8001213062960864, "learning_rate": 0.00022513483839633455, "loss": 1.9062, "step": 4351 }, { "epoch": 2.882119205298013, "grad_norm": 0.823698029731299, "learning_rate": 0.00022507706042208936, "loss": 2.2969, "step": 4352 }, { "epoch": 2.8827814569536425, "grad_norm": 0.765387513114179, "learning_rate": 0.00022501926758172538, "loss": 2.2188, "step": 4353 }, { "epoch": 2.8834437086092715, "grad_norm": 0.8016743427599137, "learning_rate": 0.00022496145988668643, "loss": 2.2031, "step": 4354 }, { "epoch": 2.8841059602649004, "grad_norm": 0.7342281739638434, "learning_rate": 0.00022490363734841898, "loss": 2.125, "step": 4355 }, { "epoch": 2.88476821192053, "grad_norm": 0.8330067617330862, "learning_rate": 0.0002248457999783726, "loss": 2.25, "step": 4356 }, { "epoch": 2.885430463576159, "grad_norm": 0.7497081607502295, "learning_rate": 0.00022478794778799978, "loss": 2.0938, "step": 4357 }, { "epoch": 2.886092715231788, "grad_norm": 0.8236186269902726, "learning_rate": 0.00022473008078875584, "loss": 2.25, "step": 4358 }, { "epoch": 2.886754966887417, "grad_norm": 0.8053080230822051, "learning_rate": 0.00022467219899209917, "loss": 2.0781, "step": 4359 }, { "epoch": 2.8874172185430464, "grad_norm": 0.7164664587738089, "learning_rate": 0.00022461430240949096, "loss": 1.8516, "step": 4360 }, { "epoch": 2.8880794701986754, "grad_norm": 0.8142001223518115, "learning_rate": 0.00022455639105239554, "loss": 2.25, "step": 4361 }, { "epoch": 2.888741721854305, "grad_norm": 0.7040437330515209, "learning_rate": 0.00022449846493227985, "loss": 1.8828, "step": 4362 }, { "epoch": 2.8894039735099337, "grad_norm": 0.7220505009445266, "learning_rate": 0.00022444052406061404, "loss": 1.8828, "step": 4363 }, { "epoch": 2.8900662251655627, "grad_norm": 0.7856552309358815, "learning_rate": 0.00022438256844887103, "loss": 2.2031, "step": 4364 }, { "epoch": 2.890728476821192, "grad_norm": 0.8502959740448913, "learning_rate": 0.0002243245981085267, "loss": 2.3281, "step": 4365 }, { "epoch": 2.8913907284768214, "grad_norm": 0.8097844518806273, "learning_rate": 0.00022426661305105982, "loss": 1.7422, "step": 4366 }, { "epoch": 2.8920529801324504, "grad_norm": 0.8382135619585039, "learning_rate": 0.0002242086132879522, "loss": 2.2812, "step": 4367 }, { "epoch": 2.8927152317880793, "grad_norm": 0.8040259502722577, "learning_rate": 0.00022415059883068827, "loss": 2.1562, "step": 4368 }, { "epoch": 2.8933774834437087, "grad_norm": 0.7487370394253613, "learning_rate": 0.00022409256969075573, "loss": 1.8984, "step": 4369 }, { "epoch": 2.8940397350993377, "grad_norm": 0.8121435968410278, "learning_rate": 0.00022403452587964491, "loss": 2.2188, "step": 4370 }, { "epoch": 2.894701986754967, "grad_norm": 0.8112797344685503, "learning_rate": 0.00022397646740884923, "loss": 2.3438, "step": 4371 }, { "epoch": 2.895364238410596, "grad_norm": 0.7445266467259728, "learning_rate": 0.00022391839428986482, "loss": 2.1094, "step": 4372 }, { "epoch": 2.896026490066225, "grad_norm": 0.7983038417800611, "learning_rate": 0.00022386030653419098, "loss": 2.3906, "step": 4373 }, { "epoch": 2.8966887417218543, "grad_norm": 0.8238803405875503, "learning_rate": 0.00022380220415332957, "loss": 2.3438, "step": 4374 }, { "epoch": 2.8973509933774837, "grad_norm": 0.7338443097363743, "learning_rate": 0.0002237440871587857, "loss": 1.7656, "step": 4375 }, { "epoch": 2.8980132450331126, "grad_norm": 0.7664758917022869, "learning_rate": 0.0002236859555620671, "loss": 1.7344, "step": 4376 }, { "epoch": 2.8986754966887416, "grad_norm": 0.7991264301220364, "learning_rate": 0.00022362780937468452, "loss": 2.2188, "step": 4377 }, { "epoch": 2.899337748344371, "grad_norm": 0.7684386742476127, "learning_rate": 0.00022356964860815155, "loss": 2.0938, "step": 4378 }, { "epoch": 2.9, "grad_norm": 0.8132228349696328, "learning_rate": 0.00022351147327398475, "loss": 1.8672, "step": 4379 }, { "epoch": 2.9006622516556293, "grad_norm": 0.7545679171857776, "learning_rate": 0.00022345328338370342, "loss": 2.2188, "step": 4380 }, { "epoch": 2.9013245033112582, "grad_norm": 0.8098721462421289, "learning_rate": 0.00022339507894882985, "loss": 2.0156, "step": 4381 }, { "epoch": 2.901986754966887, "grad_norm": 0.7250274957792093, "learning_rate": 0.00022333685998088924, "loss": 1.9922, "step": 4382 }, { "epoch": 2.9026490066225166, "grad_norm": 0.7199057703051756, "learning_rate": 0.00022327862649140957, "loss": 2.0938, "step": 4383 }, { "epoch": 2.903311258278146, "grad_norm": 0.804408619705093, "learning_rate": 0.00022322037849192172, "loss": 2.2656, "step": 4384 }, { "epoch": 2.903973509933775, "grad_norm": 0.7489591907282805, "learning_rate": 0.0002231621159939595, "loss": 2.1875, "step": 4385 }, { "epoch": 2.904635761589404, "grad_norm": 0.7652456168806213, "learning_rate": 0.00022310383900905957, "loss": 2.1719, "step": 4386 }, { "epoch": 2.9052980132450332, "grad_norm": 0.9299591974475737, "learning_rate": 0.00022304554754876135, "loss": 2.5781, "step": 4387 }, { "epoch": 2.905960264900662, "grad_norm": 0.7269200016588775, "learning_rate": 0.0002229872416246073, "loss": 2.2812, "step": 4388 }, { "epoch": 2.9066225165562916, "grad_norm": 0.7246332908206276, "learning_rate": 0.00022292892124814264, "loss": 2.1406, "step": 4389 }, { "epoch": 2.9072847682119205, "grad_norm": 0.750095856647597, "learning_rate": 0.00022287058643091548, "loss": 2.2031, "step": 4390 }, { "epoch": 2.9079470198675494, "grad_norm": 0.7689055995694445, "learning_rate": 0.00022281223718447672, "loss": 2.2344, "step": 4391 }, { "epoch": 2.908609271523179, "grad_norm": 0.7094383988468619, "learning_rate": 0.00022275387352038028, "loss": 2.0469, "step": 4392 }, { "epoch": 2.909271523178808, "grad_norm": 0.7671209839447317, "learning_rate": 0.00022269549545018283, "loss": 2.3594, "step": 4393 }, { "epoch": 2.909933774834437, "grad_norm": 0.8075845535906568, "learning_rate": 0.0002226371029854438, "loss": 2.1094, "step": 4394 }, { "epoch": 2.910596026490066, "grad_norm": 0.7566121971854343, "learning_rate": 0.00022257869613772556, "loss": 2.0938, "step": 4395 }, { "epoch": 2.9112582781456955, "grad_norm": 0.8101089447041504, "learning_rate": 0.0002225202749185935, "loss": 2.1094, "step": 4396 }, { "epoch": 2.9119205298013244, "grad_norm": 0.7826169549376614, "learning_rate": 0.00022246183933961555, "loss": 2.3906, "step": 4397 }, { "epoch": 2.912582781456954, "grad_norm": 0.7770810780765592, "learning_rate": 0.00022240338941236263, "loss": 2.2031, "step": 4398 }, { "epoch": 2.9132450331125828, "grad_norm": 0.7546483542485276, "learning_rate": 0.0002223449251484086, "loss": 1.7188, "step": 4399 }, { "epoch": 2.9139072847682117, "grad_norm": 0.9016660213290708, "learning_rate": 0.00022228644655932994, "loss": 2.25, "step": 4400 }, { "epoch": 2.914569536423841, "grad_norm": 0.7817928476538343, "learning_rate": 0.0002222279536567061, "loss": 2.1562, "step": 4401 }, { "epoch": 2.91523178807947, "grad_norm": 0.7832703143428481, "learning_rate": 0.00022216944645211942, "loss": 2.1875, "step": 4402 }, { "epoch": 2.9158940397350994, "grad_norm": 0.8236398574245598, "learning_rate": 0.0002221109249571549, "loss": 2.0, "step": 4403 }, { "epoch": 2.9165562913907284, "grad_norm": 0.8161552589500917, "learning_rate": 0.00022205238918340055, "loss": 2.25, "step": 4404 }, { "epoch": 2.9172185430463577, "grad_norm": 0.7605307896669022, "learning_rate": 0.00022199383914244707, "loss": 2.2812, "step": 4405 }, { "epoch": 2.9178807947019867, "grad_norm": 0.8444774078955247, "learning_rate": 0.00022193527484588806, "loss": 2.3125, "step": 4406 }, { "epoch": 2.918543046357616, "grad_norm": 0.787730212352518, "learning_rate": 0.00022187669630531994, "loss": 1.9844, "step": 4407 }, { "epoch": 2.919205298013245, "grad_norm": 0.7084323012307221, "learning_rate": 0.0002218181035323419, "loss": 2.0312, "step": 4408 }, { "epoch": 2.919867549668874, "grad_norm": 0.6715426835901356, "learning_rate": 0.00022175949653855596, "loss": 1.7578, "step": 4409 }, { "epoch": 2.9205298013245033, "grad_norm": 0.822227804796344, "learning_rate": 0.00022170087533556707, "loss": 2.1875, "step": 4410 }, { "epoch": 2.9211920529801323, "grad_norm": 0.7979246488949965, "learning_rate": 0.00022164223993498277, "loss": 2.4219, "step": 4411 }, { "epoch": 2.9218543046357617, "grad_norm": 0.7761589318783694, "learning_rate": 0.00022158359034841368, "loss": 2.1562, "step": 4412 }, { "epoch": 2.9225165562913906, "grad_norm": 0.800356755520214, "learning_rate": 0.00022152492658747297, "loss": 2.2188, "step": 4413 }, { "epoch": 2.92317880794702, "grad_norm": 0.6884868023473854, "learning_rate": 0.00022146624866377685, "loss": 1.875, "step": 4414 }, { "epoch": 2.923841059602649, "grad_norm": 0.8360567391001918, "learning_rate": 0.00022140755658894409, "loss": 2.3438, "step": 4415 }, { "epoch": 2.9245033112582783, "grad_norm": 0.8314932754420089, "learning_rate": 0.0002213488503745965, "loss": 2.2969, "step": 4416 }, { "epoch": 2.9251655629139073, "grad_norm": 0.772391265499767, "learning_rate": 0.00022129013003235857, "loss": 2.2188, "step": 4417 }, { "epoch": 2.9258278145695362, "grad_norm": 0.7565697862116512, "learning_rate": 0.0002212313955738575, "loss": 1.9844, "step": 4418 }, { "epoch": 2.9264900662251656, "grad_norm": 0.8194897347181506, "learning_rate": 0.00022117264701072354, "loss": 2.2969, "step": 4419 }, { "epoch": 2.9271523178807946, "grad_norm": 0.7623189541295161, "learning_rate": 0.0002211138843545895, "loss": 1.9922, "step": 4420 }, { "epoch": 2.927814569536424, "grad_norm": 0.8139847421132483, "learning_rate": 0.00022105510761709102, "loss": 2.1562, "step": 4421 }, { "epoch": 2.928476821192053, "grad_norm": 0.7486468355834555, "learning_rate": 0.00022099631680986667, "loss": 1.9219, "step": 4422 }, { "epoch": 2.9291390728476823, "grad_norm": 0.7684198728068913, "learning_rate": 0.0002209375119445576, "loss": 2.125, "step": 4423 }, { "epoch": 2.929801324503311, "grad_norm": 0.7402609538144354, "learning_rate": 0.0002208786930328079, "loss": 2.1875, "step": 4424 }, { "epoch": 2.9304635761589406, "grad_norm": 0.6872091441898548, "learning_rate": 0.00022081986008626442, "loss": 1.8516, "step": 4425 }, { "epoch": 2.9311258278145695, "grad_norm": 0.8060554889522482, "learning_rate": 0.00022076101311657665, "loss": 2.1406, "step": 4426 }, { "epoch": 2.9317880794701985, "grad_norm": 0.782523512720665, "learning_rate": 0.00022070215213539708, "loss": 2.1094, "step": 4427 }, { "epoch": 2.932450331125828, "grad_norm": 0.7797477108745386, "learning_rate": 0.0002206432771543808, "loss": 2.0156, "step": 4428 }, { "epoch": 2.933112582781457, "grad_norm": 0.7406389955235828, "learning_rate": 0.00022058438818518575, "loss": 2.0938, "step": 4429 }, { "epoch": 2.933774834437086, "grad_norm": 0.7917561317572084, "learning_rate": 0.00022052548523947258, "loss": 2.1875, "step": 4430 }, { "epoch": 2.934437086092715, "grad_norm": 0.7673806799573446, "learning_rate": 0.00022046656832890485, "loss": 1.9531, "step": 4431 }, { "epoch": 2.9350993377483445, "grad_norm": 0.7592415231873181, "learning_rate": 0.00022040763746514867, "loss": 2.25, "step": 4432 }, { "epoch": 2.9357615894039735, "grad_norm": 0.68968495370084, "learning_rate": 0.0002203486926598731, "loss": 1.7891, "step": 4433 }, { "epoch": 2.936423841059603, "grad_norm": 0.8189087607148038, "learning_rate": 0.00022028973392474986, "loss": 2.2031, "step": 4434 }, { "epoch": 2.937086092715232, "grad_norm": 0.7820649968477431, "learning_rate": 0.00022023076127145345, "loss": 2.0781, "step": 4435 }, { "epoch": 2.9377483443708607, "grad_norm": 0.8106576243652908, "learning_rate": 0.00022017177471166113, "loss": 2.3594, "step": 4436 }, { "epoch": 2.93841059602649, "grad_norm": 0.7665170914068747, "learning_rate": 0.00022011277425705293, "loss": 2.25, "step": 4437 }, { "epoch": 2.939072847682119, "grad_norm": 0.7489571250743352, "learning_rate": 0.00022005375991931162, "loss": 2.0938, "step": 4438 }, { "epoch": 2.9397350993377485, "grad_norm": 0.7709425541830434, "learning_rate": 0.00021999473171012272, "loss": 2.0625, "step": 4439 }, { "epoch": 2.9403973509933774, "grad_norm": 0.7076182706472636, "learning_rate": 0.00021993568964117445, "loss": 2.0625, "step": 4440 }, { "epoch": 2.941059602649007, "grad_norm": 0.768315105440346, "learning_rate": 0.00021987663372415785, "loss": 2.0938, "step": 4441 }, { "epoch": 2.9417218543046357, "grad_norm": 0.760324880109208, "learning_rate": 0.00021981756397076665, "loss": 2.1719, "step": 4442 }, { "epoch": 2.942384105960265, "grad_norm": 0.7820826706849016, "learning_rate": 0.00021975848039269738, "loss": 2.0781, "step": 4443 }, { "epoch": 2.943046357615894, "grad_norm": 0.8109947859070346, "learning_rate": 0.00021969938300164923, "loss": 2.1406, "step": 4444 }, { "epoch": 2.943708609271523, "grad_norm": 0.786749196896755, "learning_rate": 0.00021964027180932416, "loss": 2.0625, "step": 4445 }, { "epoch": 2.9443708609271524, "grad_norm": 0.7957003400491892, "learning_rate": 0.00021958114682742687, "loss": 2.3281, "step": 4446 }, { "epoch": 2.9450331125827813, "grad_norm": 0.8195127519653472, "learning_rate": 0.0002195220080676648, "loss": 2.3594, "step": 4447 }, { "epoch": 2.9456953642384107, "grad_norm": 0.7735375450136942, "learning_rate": 0.00021946285554174806, "loss": 2.0781, "step": 4448 }, { "epoch": 2.9463576158940397, "grad_norm": 0.7419608431128963, "learning_rate": 0.0002194036892613896, "loss": 2.1719, "step": 4449 }, { "epoch": 2.9470198675496686, "grad_norm": 0.7199208229335173, "learning_rate": 0.00021934450923830496, "loss": 2.0625, "step": 4450 }, { "epoch": 2.947682119205298, "grad_norm": 0.7840832086790385, "learning_rate": 0.00021928531548421246, "loss": 2.2812, "step": 4451 }, { "epoch": 2.9483443708609274, "grad_norm": 0.7604315819547849, "learning_rate": 0.0002192261080108332, "loss": 1.9609, "step": 4452 }, { "epoch": 2.9490066225165563, "grad_norm": 0.7922142817023269, "learning_rate": 0.00021916688682989085, "loss": 2.2812, "step": 4453 }, { "epoch": 2.9496688741721853, "grad_norm": 0.721428609954887, "learning_rate": 0.000219107651953112, "loss": 2.0469, "step": 4454 }, { "epoch": 2.9503311258278146, "grad_norm": 0.7598234997873866, "learning_rate": 0.00021904840339222572, "loss": 2.1406, "step": 4455 }, { "epoch": 2.9509933774834436, "grad_norm": 0.7188071465148412, "learning_rate": 0.000218989141158964, "loss": 2.0625, "step": 4456 }, { "epoch": 2.951655629139073, "grad_norm": 0.743277619605507, "learning_rate": 0.00021892986526506135, "loss": 2.2969, "step": 4457 }, { "epoch": 2.952317880794702, "grad_norm": 0.682162637320931, "learning_rate": 0.00021887057572225516, "loss": 1.75, "step": 4458 }, { "epoch": 2.952980132450331, "grad_norm": 0.7834358923324569, "learning_rate": 0.00021881127254228533, "loss": 2.0781, "step": 4459 }, { "epoch": 2.9536423841059603, "grad_norm": 0.8120382484396265, "learning_rate": 0.00021875195573689473, "loss": 2.1719, "step": 4460 }, { "epoch": 2.9543046357615896, "grad_norm": 0.844829226007287, "learning_rate": 0.00021869262531782864, "loss": 2.1094, "step": 4461 }, { "epoch": 2.9549668874172186, "grad_norm": 0.7011440957275106, "learning_rate": 0.0002186332812968352, "loss": 1.7969, "step": 4462 }, { "epoch": 2.9556291390728475, "grad_norm": 0.7489062805397397, "learning_rate": 0.0002185739236856652, "loss": 2.0781, "step": 4463 }, { "epoch": 2.956291390728477, "grad_norm": 0.7525872556271725, "learning_rate": 0.00021851455249607217, "loss": 2.0469, "step": 4464 }, { "epoch": 2.956953642384106, "grad_norm": 0.7351881522733897, "learning_rate": 0.00021845516773981224, "loss": 1.7969, "step": 4465 }, { "epoch": 2.9576158940397352, "grad_norm": 0.7571555038065063, "learning_rate": 0.00021839576942864428, "loss": 1.8438, "step": 4466 }, { "epoch": 2.958278145695364, "grad_norm": 0.8291127019506365, "learning_rate": 0.0002183363575743298, "loss": 1.7344, "step": 4467 }, { "epoch": 2.958940397350993, "grad_norm": 0.7421690997261874, "learning_rate": 0.00021827693218863314, "loss": 2.0781, "step": 4468 }, { "epoch": 2.9596026490066225, "grad_norm": 0.7572631436657897, "learning_rate": 0.0002182174932833211, "loss": 2.1562, "step": 4469 }, { "epoch": 2.960264900662252, "grad_norm": 0.8328869960295198, "learning_rate": 0.0002181580408701633, "loss": 2.3594, "step": 4470 }, { "epoch": 2.960927152317881, "grad_norm": 0.8274603927309834, "learning_rate": 0.00021809857496093199, "loss": 2.1406, "step": 4471 }, { "epoch": 2.96158940397351, "grad_norm": 0.8031250321268061, "learning_rate": 0.00021803909556740214, "loss": 2.1719, "step": 4472 }, { "epoch": 2.962251655629139, "grad_norm": 0.7429841355674442, "learning_rate": 0.00021797960270135132, "loss": 2.0781, "step": 4473 }, { "epoch": 2.962913907284768, "grad_norm": 0.757421386297501, "learning_rate": 0.00021792009637455984, "loss": 2.0781, "step": 4474 }, { "epoch": 2.9635761589403975, "grad_norm": 0.7973023639367308, "learning_rate": 0.0002178605765988105, "loss": 2.3906, "step": 4475 }, { "epoch": 2.9642384105960264, "grad_norm": 0.794854240603579, "learning_rate": 0.00021780104338588906, "loss": 2.1562, "step": 4476 }, { "epoch": 2.9649006622516554, "grad_norm": 0.7059275976064484, "learning_rate": 0.00021774149674758371, "loss": 1.7188, "step": 4477 }, { "epoch": 2.9655629139072848, "grad_norm": 0.7881509581186316, "learning_rate": 0.00021768193669568538, "loss": 2.1875, "step": 4478 }, { "epoch": 2.966225165562914, "grad_norm": 0.77250826076815, "learning_rate": 0.00021762236324198762, "loss": 2.0938, "step": 4479 }, { "epoch": 2.966887417218543, "grad_norm": 0.7604436987816658, "learning_rate": 0.0002175627763982867, "loss": 1.9375, "step": 4480 }, { "epoch": 2.967549668874172, "grad_norm": 0.7646219472725809, "learning_rate": 0.00021750317617638147, "loss": 2.2188, "step": 4481 }, { "epoch": 2.9682119205298014, "grad_norm": 0.8201232309200533, "learning_rate": 0.00021744356258807343, "loss": 2.0938, "step": 4482 }, { "epoch": 2.9688741721854304, "grad_norm": 0.7506324416273996, "learning_rate": 0.00021738393564516682, "loss": 2.0156, "step": 4483 }, { "epoch": 2.9695364238410598, "grad_norm": 0.726410201903873, "learning_rate": 0.00021732429535946838, "loss": 2.0312, "step": 4484 }, { "epoch": 2.9701986754966887, "grad_norm": 0.7807045033605472, "learning_rate": 0.00021726464174278763, "loss": 2.0, "step": 4485 }, { "epoch": 2.9708609271523176, "grad_norm": 0.7726129997291804, "learning_rate": 0.00021720497480693664, "loss": 1.7578, "step": 4486 }, { "epoch": 2.971523178807947, "grad_norm": 0.8420518665642285, "learning_rate": 0.00021714529456373015, "loss": 2.0469, "step": 4487 }, { "epoch": 2.9721854304635764, "grad_norm": 0.8070035576329213, "learning_rate": 0.00021708560102498556, "loss": 2.1875, "step": 4488 }, { "epoch": 2.9728476821192054, "grad_norm": 0.7524861357345887, "learning_rate": 0.00021702589420252288, "loss": 2.0469, "step": 4489 }, { "epoch": 2.9735099337748343, "grad_norm": 0.7677955518242366, "learning_rate": 0.00021696617410816465, "loss": 2.0312, "step": 4490 }, { "epoch": 2.9741721854304637, "grad_norm": 0.7504578366731383, "learning_rate": 0.0002169064407537363, "loss": 2.0469, "step": 4491 }, { "epoch": 2.9748344370860926, "grad_norm": 0.8033570538067956, "learning_rate": 0.00021684669415106554, "loss": 2.2656, "step": 4492 }, { "epoch": 2.975496688741722, "grad_norm": 0.7635119197812412, "learning_rate": 0.000216786934311983, "loss": 2.1094, "step": 4493 }, { "epoch": 2.976158940397351, "grad_norm": 0.7896517448651882, "learning_rate": 0.0002167271612483218, "loss": 2.5156, "step": 4494 }, { "epoch": 2.97682119205298, "grad_norm": 0.7779784166007824, "learning_rate": 0.00021666737497191767, "loss": 2.2031, "step": 4495 }, { "epoch": 2.9774834437086093, "grad_norm": 0.7818042099696694, "learning_rate": 0.00021660757549460893, "loss": 2.1406, "step": 4496 }, { "epoch": 2.9781456953642387, "grad_norm": 0.8344879510920917, "learning_rate": 0.00021654776282823663, "loss": 2.0469, "step": 4497 }, { "epoch": 2.9788079470198676, "grad_norm": 0.7479499473487909, "learning_rate": 0.00021648793698464435, "loss": 1.9922, "step": 4498 }, { "epoch": 2.9794701986754966, "grad_norm": 0.7103081948807429, "learning_rate": 0.00021642809797567828, "loss": 2.0781, "step": 4499 }, { "epoch": 2.980132450331126, "grad_norm": 0.7847505325973558, "learning_rate": 0.00021636824581318726, "loss": 1.7891, "step": 4500 }, { "epoch": 2.980794701986755, "grad_norm": 0.7530241136721917, "learning_rate": 0.0002163083805090227, "loss": 2.0781, "step": 4501 }, { "epoch": 2.9814569536423843, "grad_norm": 0.8347279485319191, "learning_rate": 0.00021624850207503852, "loss": 2.3906, "step": 4502 }, { "epoch": 2.982119205298013, "grad_norm": 0.7465069649434877, "learning_rate": 0.0002161886105230915, "loss": 2.0156, "step": 4503 }, { "epoch": 2.982781456953642, "grad_norm": 0.7524288090210923, "learning_rate": 0.0002161287058650407, "loss": 2.0469, "step": 4504 }, { "epoch": 2.9834437086092715, "grad_norm": 0.7707613736899056, "learning_rate": 0.00021606878811274806, "loss": 2.0312, "step": 4505 }, { "epoch": 2.984105960264901, "grad_norm": 0.7549465315258881, "learning_rate": 0.00021600885727807788, "loss": 2.0, "step": 4506 }, { "epoch": 2.98476821192053, "grad_norm": 0.793532980187094, "learning_rate": 0.0002159489133728972, "loss": 2.1562, "step": 4507 }, { "epoch": 2.985430463576159, "grad_norm": 0.7851455886902055, "learning_rate": 0.00021588895640907557, "loss": 1.9922, "step": 4508 }, { "epoch": 2.986092715231788, "grad_norm": 0.8073792135091857, "learning_rate": 0.00021582898639848524, "loss": 2.2656, "step": 4509 }, { "epoch": 2.986754966887417, "grad_norm": 0.80474386683991, "learning_rate": 0.00021576900335300086, "loss": 1.9297, "step": 4510 }, { "epoch": 2.9874172185430465, "grad_norm": 0.7546028011285714, "learning_rate": 0.00021570900728449976, "loss": 2.1406, "step": 4511 }, { "epoch": 2.9880794701986755, "grad_norm": 0.7605544670390771, "learning_rate": 0.00021564899820486195, "loss": 1.8438, "step": 4512 }, { "epoch": 2.9887417218543044, "grad_norm": 0.8689870558522502, "learning_rate": 0.00021558897612596983, "loss": 2.3125, "step": 4513 }, { "epoch": 2.989403973509934, "grad_norm": 0.8289683484887721, "learning_rate": 0.0002155289410597085, "loss": 2.0781, "step": 4514 }, { "epoch": 2.9900662251655628, "grad_norm": 0.7891927341014031, "learning_rate": 0.00021546889301796556, "loss": 2.3906, "step": 4515 }, { "epoch": 2.990728476821192, "grad_norm": 0.7293865657255246, "learning_rate": 0.0002154088320126312, "loss": 1.9844, "step": 4516 }, { "epoch": 2.991390728476821, "grad_norm": 0.7793330282525184, "learning_rate": 0.00021534875805559825, "loss": 2.0938, "step": 4517 }, { "epoch": 2.9920529801324505, "grad_norm": 0.7307010919065026, "learning_rate": 0.00021528867115876203, "loss": 1.8438, "step": 4518 }, { "epoch": 2.9927152317880794, "grad_norm": 0.8255527419719854, "learning_rate": 0.00021522857133402033, "loss": 2.2188, "step": 4519 }, { "epoch": 2.993377483443709, "grad_norm": 0.7635729103734045, "learning_rate": 0.0002151684585932738, "loss": 1.6406, "step": 4520 }, { "epoch": 2.9940397350993377, "grad_norm": 0.7534430832238876, "learning_rate": 0.00021510833294842522, "loss": 2.0938, "step": 4521 }, { "epoch": 2.9947019867549667, "grad_norm": 0.7715012642016528, "learning_rate": 0.00021504819441138032, "loss": 2.1094, "step": 4522 }, { "epoch": 2.995364238410596, "grad_norm": 0.8744439778229881, "learning_rate": 0.0002149880429940472, "loss": 2.3438, "step": 4523 }, { "epoch": 2.996026490066225, "grad_norm": 0.7186175424265849, "learning_rate": 0.0002149278787083365, "loss": 2.0469, "step": 4524 }, { "epoch": 2.9966887417218544, "grad_norm": 0.6673242627878568, "learning_rate": 0.00021486770156616138, "loss": 1.6719, "step": 4525 }, { "epoch": 2.9973509933774833, "grad_norm": 0.737447693511495, "learning_rate": 0.00021480751157943777, "loss": 2.2188, "step": 4526 }, { "epoch": 2.9980132450331127, "grad_norm": 0.7752458871831162, "learning_rate": 0.00021474730876008377, "loss": 2.1406, "step": 4527 }, { "epoch": 2.9986754966887417, "grad_norm": 0.7023238930648998, "learning_rate": 0.00021468709312002035, "loss": 2.1094, "step": 4528 }, { "epoch": 2.999337748344371, "grad_norm": 0.7547587852404817, "learning_rate": 0.00021462686467117094, "loss": 2.125, "step": 4529 }, { "epoch": 3.0, "grad_norm": 0.7491532614618133, "learning_rate": 0.00021456662342546137, "loss": 2.0781, "step": 4530 }, { "epoch": 3.0, "eval_loss": 2.2021772861480713, "eval_runtime": 33.9942, "eval_samples_per_second": 9.943, "eval_steps_per_second": 9.943, "step": 4530 }, { "epoch": 3.000662251655629, "grad_norm": 0.7131464399924161, "learning_rate": 0.00021450636939482007, "loss": 1.8281, "step": 4531 }, { "epoch": 3.0013245033112583, "grad_norm": 0.7315825488786015, "learning_rate": 0.0002144461025911782, "loss": 1.8438, "step": 4532 }, { "epoch": 3.0019867549668873, "grad_norm": 0.7627443914973644, "learning_rate": 0.00021438582302646907, "loss": 1.8984, "step": 4533 }, { "epoch": 3.0026490066225167, "grad_norm": 0.6372938112303863, "learning_rate": 0.00021432553071262886, "loss": 1.3594, "step": 4534 }, { "epoch": 3.0033112582781456, "grad_norm": 0.7312805323907999, "learning_rate": 0.0002142652256615961, "loss": 1.7031, "step": 4535 }, { "epoch": 3.003973509933775, "grad_norm": 0.8102738333912007, "learning_rate": 0.0002142049078853119, "loss": 1.7188, "step": 4536 }, { "epoch": 3.004635761589404, "grad_norm": 0.733655360612381, "learning_rate": 0.00021414457739571985, "loss": 1.5938, "step": 4537 }, { "epoch": 3.0052980132450333, "grad_norm": 0.8222375491316457, "learning_rate": 0.00021408423420476606, "loss": 1.8047, "step": 4538 }, { "epoch": 3.0059602649006623, "grad_norm": 0.9004397607782434, "learning_rate": 0.00021402387832439922, "loss": 1.75, "step": 4539 }, { "epoch": 3.006622516556291, "grad_norm": 0.9069935129797175, "learning_rate": 0.0002139635097665704, "loss": 1.9453, "step": 4540 }, { "epoch": 3.0072847682119206, "grad_norm": 0.8466033725269645, "learning_rate": 0.0002139031285432334, "loss": 1.6797, "step": 4541 }, { "epoch": 3.0079470198675495, "grad_norm": 0.8723546670868306, "learning_rate": 0.00021384273466634428, "loss": 1.7656, "step": 4542 }, { "epoch": 3.008609271523179, "grad_norm": 0.8375098653039175, "learning_rate": 0.00021378232814786178, "loss": 1.6797, "step": 4543 }, { "epoch": 3.009271523178808, "grad_norm": 0.6548608054206255, "learning_rate": 0.0002137219089997471, "loss": 1.0469, "step": 4544 }, { "epoch": 3.0099337748344372, "grad_norm": 0.9014710478179863, "learning_rate": 0.00021366147723396385, "loss": 1.9531, "step": 4545 }, { "epoch": 3.010596026490066, "grad_norm": 0.8049453506205523, "learning_rate": 0.00021360103286247826, "loss": 1.4375, "step": 4546 }, { "epoch": 3.0112582781456956, "grad_norm": 0.8959935929308002, "learning_rate": 0.000213540575897259, "loss": 1.9688, "step": 4547 }, { "epoch": 3.0119205298013245, "grad_norm": 0.7435247641972823, "learning_rate": 0.00021348010635027724, "loss": 1.3125, "step": 4548 }, { "epoch": 3.0125827814569535, "grad_norm": 0.923209184977155, "learning_rate": 0.00021341962423350667, "loss": 1.8594, "step": 4549 }, { "epoch": 3.013245033112583, "grad_norm": 0.9202244523983142, "learning_rate": 0.0002133591295589234, "loss": 1.9375, "step": 4550 }, { "epoch": 3.013907284768212, "grad_norm": 0.844216680828011, "learning_rate": 0.00021329862233850606, "loss": 1.7031, "step": 4551 }, { "epoch": 3.014569536423841, "grad_norm": 0.8039576780111451, "learning_rate": 0.00021323810258423585, "loss": 1.625, "step": 4552 }, { "epoch": 3.01523178807947, "grad_norm": 0.8756934912306449, "learning_rate": 0.00021317757030809632, "loss": 1.8281, "step": 4553 }, { "epoch": 3.0158940397350995, "grad_norm": 0.8519920743999857, "learning_rate": 0.00021311702552207352, "loss": 1.8672, "step": 4554 }, { "epoch": 3.0165562913907285, "grad_norm": 0.8499042001838856, "learning_rate": 0.0002130564682381561, "loss": 1.5703, "step": 4555 }, { "epoch": 3.017218543046358, "grad_norm": 0.8222492742116871, "learning_rate": 0.000212995898468335, "loss": 1.5312, "step": 4556 }, { "epoch": 3.017880794701987, "grad_norm": 0.8052348626261567, "learning_rate": 0.00021293531622460386, "loss": 1.6094, "step": 4557 }, { "epoch": 3.0185430463576157, "grad_norm": 0.8028596088386369, "learning_rate": 0.00021287472151895853, "loss": 1.6484, "step": 4558 }, { "epoch": 3.019205298013245, "grad_norm": 0.8643759578492328, "learning_rate": 0.0002128141143633975, "loss": 2.0, "step": 4559 }, { "epoch": 3.019867549668874, "grad_norm": 0.9014997843574897, "learning_rate": 0.00021275349476992166, "loss": 1.7969, "step": 4560 }, { "epoch": 3.0205298013245034, "grad_norm": 0.7359853546995025, "learning_rate": 0.00021269286275053446, "loss": 1.3906, "step": 4561 }, { "epoch": 3.0211920529801324, "grad_norm": 0.884493549163444, "learning_rate": 0.00021263221831724165, "loss": 1.8438, "step": 4562 }, { "epoch": 3.0218543046357618, "grad_norm": 0.8785724865075689, "learning_rate": 0.00021257156148205158, "loss": 1.8828, "step": 4563 }, { "epoch": 3.0225165562913907, "grad_norm": 0.8061396655751379, "learning_rate": 0.00021251089225697497, "loss": 1.5469, "step": 4564 }, { "epoch": 3.0231788079470197, "grad_norm": 0.8747469273803027, "learning_rate": 0.00021245021065402503, "loss": 1.6719, "step": 4565 }, { "epoch": 3.023841059602649, "grad_norm": 0.9443017460997651, "learning_rate": 0.0002123895166852174, "loss": 1.9375, "step": 4566 }, { "epoch": 3.024503311258278, "grad_norm": 0.7679332518947736, "learning_rate": 0.00021232881036257022, "loss": 1.4688, "step": 4567 }, { "epoch": 3.0251655629139074, "grad_norm": 0.8406131648650397, "learning_rate": 0.000212268091698104, "loss": 1.6016, "step": 4568 }, { "epoch": 3.0258278145695363, "grad_norm": 0.9384893661620362, "learning_rate": 0.00021220736070384172, "loss": 1.9375, "step": 4569 }, { "epoch": 3.0264900662251657, "grad_norm": 0.7703089869103229, "learning_rate": 0.00021214661739180884, "loss": 1.3906, "step": 4570 }, { "epoch": 3.0271523178807946, "grad_norm": 0.8066307329530482, "learning_rate": 0.00021208586177403328, "loss": 1.4844, "step": 4571 }, { "epoch": 3.027814569536424, "grad_norm": 1.1132677778480813, "learning_rate": 0.00021202509386254525, "loss": 1.9219, "step": 4572 }, { "epoch": 3.028476821192053, "grad_norm": 0.890771902168676, "learning_rate": 0.0002119643136693776, "loss": 1.8359, "step": 4573 }, { "epoch": 3.029139072847682, "grad_norm": 0.7472995897435486, "learning_rate": 0.0002119035212065654, "loss": 1.2656, "step": 4574 }, { "epoch": 3.0298013245033113, "grad_norm": 0.8828345557916857, "learning_rate": 0.00021184271648614625, "loss": 1.7969, "step": 4575 }, { "epoch": 3.0304635761589402, "grad_norm": 0.841822873400544, "learning_rate": 0.00021178189952016034, "loss": 1.6328, "step": 4576 }, { "epoch": 3.0311258278145696, "grad_norm": 0.9198778704679523, "learning_rate": 0.0002117210703206499, "loss": 1.7891, "step": 4577 }, { "epoch": 3.0317880794701986, "grad_norm": 0.8884872629891888, "learning_rate": 0.00021166022889966004, "loss": 1.8438, "step": 4578 }, { "epoch": 3.032450331125828, "grad_norm": 0.7570907170547456, "learning_rate": 0.00021159937526923786, "loss": 1.3281, "step": 4579 }, { "epoch": 3.033112582781457, "grad_norm": 0.7928908586603545, "learning_rate": 0.00021153850944143323, "loss": 1.6953, "step": 4580 }, { "epoch": 3.0337748344370863, "grad_norm": 0.8569221305524763, "learning_rate": 0.00021147763142829817, "loss": 1.7969, "step": 4581 }, { "epoch": 3.0344370860927152, "grad_norm": 0.8089661198031223, "learning_rate": 0.00021141674124188728, "loss": 1.4219, "step": 4582 }, { "epoch": 3.035099337748344, "grad_norm": 0.973653013340457, "learning_rate": 0.00021135583889425748, "loss": 1.8516, "step": 4583 }, { "epoch": 3.0357615894039736, "grad_norm": 0.9640560781507477, "learning_rate": 0.0002112949243974682, "loss": 1.9922, "step": 4584 }, { "epoch": 3.0364238410596025, "grad_norm": 0.7471467931816759, "learning_rate": 0.00021123399776358114, "loss": 1.4141, "step": 4585 }, { "epoch": 3.037086092715232, "grad_norm": 0.8675873300238508, "learning_rate": 0.00021117305900466052, "loss": 1.7422, "step": 4586 }, { "epoch": 3.037748344370861, "grad_norm": 0.7666140223996569, "learning_rate": 0.0002111121081327729, "loss": 1.3984, "step": 4587 }, { "epoch": 3.03841059602649, "grad_norm": 0.9673253935338466, "learning_rate": 0.00021105114515998722, "loss": 1.7578, "step": 4588 }, { "epoch": 3.039072847682119, "grad_norm": 0.8238607407814207, "learning_rate": 0.00021099017009837485, "loss": 1.4141, "step": 4589 }, { "epoch": 3.0397350993377485, "grad_norm": 0.799145687643931, "learning_rate": 0.0002109291829600096, "loss": 1.3594, "step": 4590 }, { "epoch": 3.0403973509933775, "grad_norm": 0.9106099219160904, "learning_rate": 0.00021086818375696756, "loss": 1.6328, "step": 4591 }, { "epoch": 3.0410596026490064, "grad_norm": 0.8384710172189688, "learning_rate": 0.00021080717250132735, "loss": 1.2969, "step": 4592 }, { "epoch": 3.041721854304636, "grad_norm": 0.8832237967610664, "learning_rate": 0.0002107461492051698, "loss": 1.4531, "step": 4593 }, { "epoch": 3.0423841059602648, "grad_norm": 0.9935895311907262, "learning_rate": 0.00021068511388057837, "loss": 1.7266, "step": 4594 }, { "epoch": 3.043046357615894, "grad_norm": 0.8499970704099936, "learning_rate": 0.0002106240665396385, "loss": 1.3984, "step": 4595 }, { "epoch": 3.043708609271523, "grad_norm": 0.9769769336712841, "learning_rate": 0.0002105630071944386, "loss": 1.6328, "step": 4596 }, { "epoch": 3.0443708609271525, "grad_norm": 1.0832507525221649, "learning_rate": 0.00021050193585706882, "loss": 2.0156, "step": 4597 }, { "epoch": 3.0450331125827814, "grad_norm": 0.9635711526587869, "learning_rate": 0.00021044085253962217, "loss": 1.6797, "step": 4598 }, { "epoch": 3.045695364238411, "grad_norm": 0.8859375874305845, "learning_rate": 0.00021037975725419377, "loss": 1.4844, "step": 4599 }, { "epoch": 3.0463576158940397, "grad_norm": 0.8787299114979397, "learning_rate": 0.0002103186500128812, "loss": 1.5859, "step": 4600 }, { "epoch": 3.0470198675496687, "grad_norm": 0.8284845545333706, "learning_rate": 0.00021025753082778441, "loss": 1.6172, "step": 4601 }, { "epoch": 3.047682119205298, "grad_norm": 0.900813420277025, "learning_rate": 0.00021019639971100567, "loss": 1.7031, "step": 4602 }, { "epoch": 3.048344370860927, "grad_norm": 0.8666937738309455, "learning_rate": 0.00021013525667464965, "loss": 1.6016, "step": 4603 }, { "epoch": 3.0490066225165564, "grad_norm": 0.7283491076238269, "learning_rate": 0.00021007410173082337, "loss": 1.3984, "step": 4604 }, { "epoch": 3.0496688741721854, "grad_norm": 0.9705945905861213, "learning_rate": 0.00021001293489163628, "loss": 1.7656, "step": 4605 }, { "epoch": 3.0503311258278147, "grad_norm": 0.7777081469681147, "learning_rate": 0.0002099517561692, "loss": 1.4219, "step": 4606 }, { "epoch": 3.0509933774834437, "grad_norm": 0.9182666753062396, "learning_rate": 0.00020989056557562871, "loss": 1.8359, "step": 4607 }, { "epoch": 3.0516556291390726, "grad_norm": 0.9007214848063517, "learning_rate": 0.00020982936312303877, "loss": 1.6719, "step": 4608 }, { "epoch": 3.052317880794702, "grad_norm": 0.9561937945465591, "learning_rate": 0.00020976814882354907, "loss": 1.625, "step": 4609 }, { "epoch": 3.052980132450331, "grad_norm": 0.9039706530854671, "learning_rate": 0.00020970692268928063, "loss": 1.5781, "step": 4610 }, { "epoch": 3.0536423841059603, "grad_norm": 0.8564175378020064, "learning_rate": 0.00020964568473235703, "loss": 1.4375, "step": 4611 }, { "epoch": 3.0543046357615893, "grad_norm": 0.9223629094108217, "learning_rate": 0.00020958443496490396, "loss": 1.7734, "step": 4612 }, { "epoch": 3.0549668874172187, "grad_norm": 0.9474363510730253, "learning_rate": 0.00020952317339904974, "loss": 1.8359, "step": 4613 }, { "epoch": 3.0556291390728476, "grad_norm": 0.9071396190587373, "learning_rate": 0.0002094619000469247, "loss": 1.7578, "step": 4614 }, { "epoch": 3.056291390728477, "grad_norm": 0.8504704858586286, "learning_rate": 0.0002094006149206618, "loss": 1.4766, "step": 4615 }, { "epoch": 3.056953642384106, "grad_norm": 0.7386474272079273, "learning_rate": 0.0002093393180323961, "loss": 1.2812, "step": 4616 }, { "epoch": 3.057615894039735, "grad_norm": 0.8537485674543925, "learning_rate": 0.00020927800939426518, "loss": 1.7031, "step": 4617 }, { "epoch": 3.0582781456953643, "grad_norm": 0.8304351279541017, "learning_rate": 0.00020921668901840873, "loss": 1.5078, "step": 4618 }, { "epoch": 3.058940397350993, "grad_norm": 0.9152434371276499, "learning_rate": 0.000209155356916969, "loss": 1.5, "step": 4619 }, { "epoch": 3.0596026490066226, "grad_norm": 1.0246764449055963, "learning_rate": 0.00020909401310209038, "loss": 2.125, "step": 4620 }, { "epoch": 3.0602649006622515, "grad_norm": 0.8619681759175868, "learning_rate": 0.00020903265758591972, "loss": 1.4609, "step": 4621 }, { "epoch": 3.060927152317881, "grad_norm": 0.986774357536471, "learning_rate": 0.00020897129038060602, "loss": 1.8984, "step": 4622 }, { "epoch": 3.06158940397351, "grad_norm": 0.8832867703865735, "learning_rate": 0.0002089099114983008, "loss": 1.7422, "step": 4623 }, { "epoch": 3.0622516556291393, "grad_norm": 1.1136145066037855, "learning_rate": 0.00020884852095115762, "loss": 2.0156, "step": 4624 }, { "epoch": 3.062913907284768, "grad_norm": 0.7701724040348383, "learning_rate": 0.0002087871187513327, "loss": 1.2344, "step": 4625 }, { "epoch": 3.063576158940397, "grad_norm": 1.0510210699670206, "learning_rate": 0.00020872570491098428, "loss": 1.8984, "step": 4626 }, { "epoch": 3.0642384105960265, "grad_norm": 0.7827687804137577, "learning_rate": 0.00020866427944227304, "loss": 1.3906, "step": 4627 }, { "epoch": 3.0649006622516555, "grad_norm": 0.9443186185881881, "learning_rate": 0.00020860284235736188, "loss": 2.0156, "step": 4628 }, { "epoch": 3.065562913907285, "grad_norm": 0.8229279260942582, "learning_rate": 0.00020854139366841606, "loss": 1.5938, "step": 4629 }, { "epoch": 3.066225165562914, "grad_norm": 0.7990088074615567, "learning_rate": 0.00020847993338760318, "loss": 1.3672, "step": 4630 }, { "epoch": 3.066887417218543, "grad_norm": 0.8550011324768562, "learning_rate": 0.000208418461527093, "loss": 1.7188, "step": 4631 }, { "epoch": 3.067549668874172, "grad_norm": 0.8590525042001437, "learning_rate": 0.00020835697809905773, "loss": 1.5, "step": 4632 }, { "epoch": 3.0682119205298015, "grad_norm": 0.8336930317846128, "learning_rate": 0.00020829548311567168, "loss": 1.5391, "step": 4633 }, { "epoch": 3.0688741721854305, "grad_norm": 1.012098809369901, "learning_rate": 0.00020823397658911169, "loss": 1.8359, "step": 4634 }, { "epoch": 3.0695364238410594, "grad_norm": 0.963966179292962, "learning_rate": 0.0002081724585315567, "loss": 1.8125, "step": 4635 }, { "epoch": 3.070198675496689, "grad_norm": 0.9021894892752844, "learning_rate": 0.000208110928955188, "loss": 1.7969, "step": 4636 }, { "epoch": 3.0708609271523177, "grad_norm": 0.780321266236924, "learning_rate": 0.00020804938787218917, "loss": 1.1953, "step": 4637 }, { "epoch": 3.071523178807947, "grad_norm": 0.9471282440675518, "learning_rate": 0.000207987835294746, "loss": 1.7656, "step": 4638 }, { "epoch": 3.072185430463576, "grad_norm": 0.8910302017289745, "learning_rate": 0.00020792627123504667, "loss": 1.4531, "step": 4639 }, { "epoch": 3.0728476821192054, "grad_norm": 0.9561560252734238, "learning_rate": 0.00020786469570528148, "loss": 1.7656, "step": 4640 }, { "epoch": 3.0735099337748344, "grad_norm": 0.8374932546815229, "learning_rate": 0.00020780310871764313, "loss": 1.3125, "step": 4641 }, { "epoch": 3.0741721854304638, "grad_norm": 0.9404439357163489, "learning_rate": 0.00020774151028432662, "loss": 1.7812, "step": 4642 }, { "epoch": 3.0748344370860927, "grad_norm": 0.810624391049294, "learning_rate": 0.00020767990041752906, "loss": 1.375, "step": 4643 }, { "epoch": 3.0754966887417217, "grad_norm": 0.8775971798246673, "learning_rate": 0.00020761827912944993, "loss": 1.4531, "step": 4644 }, { "epoch": 3.076158940397351, "grad_norm": 0.8160371525473371, "learning_rate": 0.00020755664643229099, "loss": 1.4453, "step": 4645 }, { "epoch": 3.07682119205298, "grad_norm": 0.9290192120049136, "learning_rate": 0.00020749500233825617, "loss": 1.4922, "step": 4646 }, { "epoch": 3.0774834437086094, "grad_norm": 0.9942809664727316, "learning_rate": 0.0002074333468595517, "loss": 1.8438, "step": 4647 }, { "epoch": 3.0781456953642383, "grad_norm": 0.9910832449205357, "learning_rate": 0.00020737168000838617, "loss": 1.625, "step": 4648 }, { "epoch": 3.0788079470198677, "grad_norm": 0.9002561202049684, "learning_rate": 0.00020731000179697017, "loss": 1.5469, "step": 4649 }, { "epoch": 3.0794701986754967, "grad_norm": 0.8982270775150784, "learning_rate": 0.00020724831223751684, "loss": 1.75, "step": 4650 }, { "epoch": 3.080132450331126, "grad_norm": 1.0866889880308153, "learning_rate": 0.00020718661134224136, "loss": 1.8672, "step": 4651 }, { "epoch": 3.080794701986755, "grad_norm": 0.933369095818204, "learning_rate": 0.00020712489912336118, "loss": 1.6562, "step": 4652 }, { "epoch": 3.081456953642384, "grad_norm": 1.0057196713202659, "learning_rate": 0.00020706317559309603, "loss": 1.7734, "step": 4653 }, { "epoch": 3.0821192052980133, "grad_norm": 0.8813842125980536, "learning_rate": 0.00020700144076366797, "loss": 1.4688, "step": 4654 }, { "epoch": 3.0827814569536423, "grad_norm": 0.9609692115211225, "learning_rate": 0.00020693969464730108, "loss": 1.7188, "step": 4655 }, { "epoch": 3.0834437086092716, "grad_norm": 0.8972580430866657, "learning_rate": 0.00020687793725622188, "loss": 1.5859, "step": 4656 }, { "epoch": 3.0841059602649006, "grad_norm": 0.9109002493638131, "learning_rate": 0.00020681616860265906, "loss": 1.6641, "step": 4657 }, { "epoch": 3.08476821192053, "grad_norm": 0.8807301057891866, "learning_rate": 0.00020675438869884342, "loss": 1.6875, "step": 4658 }, { "epoch": 3.085430463576159, "grad_norm": 0.9106604670290821, "learning_rate": 0.00020669259755700818, "loss": 1.7422, "step": 4659 }, { "epoch": 3.0860927152317883, "grad_norm": 0.8628574401047973, "learning_rate": 0.00020663079518938867, "loss": 1.5703, "step": 4660 }, { "epoch": 3.0867549668874172, "grad_norm": 1.0409016032812368, "learning_rate": 0.00020656898160822249, "loss": 2.0625, "step": 4661 }, { "epoch": 3.087417218543046, "grad_norm": 0.8719933766753399, "learning_rate": 0.00020650715682574936, "loss": 1.6562, "step": 4662 }, { "epoch": 3.0880794701986756, "grad_norm": 0.9706143624576691, "learning_rate": 0.00020644532085421132, "loss": 1.7891, "step": 4663 }, { "epoch": 3.0887417218543045, "grad_norm": 0.8872837828024954, "learning_rate": 0.00020638347370585265, "loss": 1.7344, "step": 4664 }, { "epoch": 3.089403973509934, "grad_norm": 0.9966242375420337, "learning_rate": 0.00020632161539291982, "loss": 1.8281, "step": 4665 }, { "epoch": 3.090066225165563, "grad_norm": 0.9232214697770519, "learning_rate": 0.0002062597459276614, "loss": 1.7266, "step": 4666 }, { "epoch": 3.0907284768211922, "grad_norm": 0.8506549686225917, "learning_rate": 0.00020619786532232828, "loss": 1.5547, "step": 4667 }, { "epoch": 3.091390728476821, "grad_norm": 0.8510794844870859, "learning_rate": 0.00020613597358917359, "loss": 1.4688, "step": 4668 }, { "epoch": 3.0920529801324506, "grad_norm": 0.8539137687031026, "learning_rate": 0.0002060740707404525, "loss": 1.7578, "step": 4669 }, { "epoch": 3.0927152317880795, "grad_norm": 0.7947955576786425, "learning_rate": 0.0002060121567884225, "loss": 1.4688, "step": 4670 }, { "epoch": 3.0933774834437084, "grad_norm": 0.7698068357013594, "learning_rate": 0.0002059502317453434, "loss": 1.3281, "step": 4671 }, { "epoch": 3.094039735099338, "grad_norm": 0.9037490549810397, "learning_rate": 0.00020588829562347683, "loss": 1.6797, "step": 4672 }, { "epoch": 3.0947019867549668, "grad_norm": 0.9421652208257584, "learning_rate": 0.00020582634843508712, "loss": 1.7188, "step": 4673 }, { "epoch": 3.095364238410596, "grad_norm": 0.9059003261302294, "learning_rate": 0.00020576439019244033, "loss": 1.6797, "step": 4674 }, { "epoch": 3.096026490066225, "grad_norm": 0.9401419305067541, "learning_rate": 0.00020570242090780496, "loss": 1.6797, "step": 4675 }, { "epoch": 3.0966887417218545, "grad_norm": 0.8776207593802571, "learning_rate": 0.00020564044059345163, "loss": 1.5625, "step": 4676 }, { "epoch": 3.0973509933774834, "grad_norm": 0.8427510199871955, "learning_rate": 0.00020557844926165323, "loss": 1.4375, "step": 4677 }, { "epoch": 3.0980132450331124, "grad_norm": 0.9095356597994694, "learning_rate": 0.00020551644692468464, "loss": 1.6328, "step": 4678 }, { "epoch": 3.0986754966887418, "grad_norm": 1.0074786021084423, "learning_rate": 0.00020545443359482307, "loss": 1.8594, "step": 4679 }, { "epoch": 3.0993377483443707, "grad_norm": 0.7677534564652018, "learning_rate": 0.0002053924092843479, "loss": 1.3047, "step": 4680 }, { "epoch": 3.1, "grad_norm": 0.8658906503570044, "learning_rate": 0.0002053303740055407, "loss": 1.5156, "step": 4681 }, { "epoch": 3.100662251655629, "grad_norm": 0.9739836221842648, "learning_rate": 0.000205268327770685, "loss": 1.8984, "step": 4682 }, { "epoch": 3.1013245033112584, "grad_norm": 0.9467927576376061, "learning_rate": 0.00020520627059206686, "loss": 1.7656, "step": 4683 }, { "epoch": 3.1019867549668874, "grad_norm": 0.8780570662627344, "learning_rate": 0.00020514420248197415, "loss": 1.6562, "step": 4684 }, { "epoch": 3.1026490066225167, "grad_norm": 0.9108640336754732, "learning_rate": 0.0002050821234526972, "loss": 1.6484, "step": 4685 }, { "epoch": 3.1033112582781457, "grad_norm": 0.9921169495990961, "learning_rate": 0.00020502003351652827, "loss": 1.8125, "step": 4686 }, { "epoch": 3.1039735099337746, "grad_norm": 0.9812881323981125, "learning_rate": 0.00020495793268576193, "loss": 1.7891, "step": 4687 }, { "epoch": 3.104635761589404, "grad_norm": 0.9528464449426903, "learning_rate": 0.00020489582097269485, "loss": 1.8047, "step": 4688 }, { "epoch": 3.105298013245033, "grad_norm": 0.8829663942487612, "learning_rate": 0.00020483369838962588, "loss": 1.7109, "step": 4689 }, { "epoch": 3.1059602649006623, "grad_norm": 0.9429364684072947, "learning_rate": 0.00020477156494885592, "loss": 1.9062, "step": 4690 }, { "epoch": 3.1066225165562913, "grad_norm": 0.743973398140737, "learning_rate": 0.00020470942066268827, "loss": 1.4766, "step": 4691 }, { "epoch": 3.1072847682119207, "grad_norm": 0.8439504905370184, "learning_rate": 0.00020464726554342798, "loss": 1.6484, "step": 4692 }, { "epoch": 3.1079470198675496, "grad_norm": 0.8708859799859928, "learning_rate": 0.00020458509960338266, "loss": 1.6953, "step": 4693 }, { "epoch": 3.108609271523179, "grad_norm": 0.8758578885670828, "learning_rate": 0.00020452292285486184, "loss": 1.8047, "step": 4694 }, { "epoch": 3.109271523178808, "grad_norm": 0.8138431188831488, "learning_rate": 0.0002044607353101772, "loss": 1.5312, "step": 4695 }, { "epoch": 3.109933774834437, "grad_norm": 0.7454445131733027, "learning_rate": 0.00020439853698164259, "loss": 1.4062, "step": 4696 }, { "epoch": 3.1105960264900663, "grad_norm": 0.8581127802867281, "learning_rate": 0.00020433632788157403, "loss": 1.6719, "step": 4697 }, { "epoch": 3.111258278145695, "grad_norm": 0.995488758376394, "learning_rate": 0.00020427410802228958, "loss": 2.0469, "step": 4698 }, { "epoch": 3.1119205298013246, "grad_norm": 0.9041309430174965, "learning_rate": 0.00020421187741610945, "loss": 1.5234, "step": 4699 }, { "epoch": 3.1125827814569536, "grad_norm": 0.8969536875233088, "learning_rate": 0.00020414963607535616, "loss": 1.6328, "step": 4700 }, { "epoch": 3.113245033112583, "grad_norm": 0.9864085304226945, "learning_rate": 0.00020408738401235405, "loss": 1.9453, "step": 4701 }, { "epoch": 3.113907284768212, "grad_norm": 1.00770183925664, "learning_rate": 0.00020402512123942989, "loss": 1.9922, "step": 4702 }, { "epoch": 3.1145695364238413, "grad_norm": 0.9464208092604262, "learning_rate": 0.00020396284776891228, "loss": 1.7812, "step": 4703 }, { "epoch": 3.11523178807947, "grad_norm": 0.9373626234723549, "learning_rate": 0.0002039005636131322, "loss": 1.6641, "step": 4704 }, { "epoch": 3.115894039735099, "grad_norm": 0.8730635001507833, "learning_rate": 0.00020383826878442253, "loss": 1.5078, "step": 4705 }, { "epoch": 3.1165562913907285, "grad_norm": 0.9852556376590144, "learning_rate": 0.00020377596329511843, "loss": 1.8125, "step": 4706 }, { "epoch": 3.1172185430463575, "grad_norm": 0.9307282502607125, "learning_rate": 0.00020371364715755707, "loss": 1.9297, "step": 4707 }, { "epoch": 3.117880794701987, "grad_norm": 0.9247479426248992, "learning_rate": 0.00020365132038407775, "loss": 1.9141, "step": 4708 }, { "epoch": 3.118543046357616, "grad_norm": 0.9707305264364012, "learning_rate": 0.00020358898298702193, "loss": 1.8359, "step": 4709 }, { "epoch": 3.119205298013245, "grad_norm": 0.8132560641434111, "learning_rate": 0.0002035266349787331, "loss": 1.625, "step": 4710 }, { "epoch": 3.119867549668874, "grad_norm": 0.983083712009347, "learning_rate": 0.0002034642763715568, "loss": 1.8828, "step": 4711 }, { "epoch": 3.120529801324503, "grad_norm": 0.8893176483203356, "learning_rate": 0.00020340190717784099, "loss": 1.6172, "step": 4712 }, { "epoch": 3.1211920529801325, "grad_norm": 0.9781223684790548, "learning_rate": 0.0002033395274099352, "loss": 1.9219, "step": 4713 }, { "epoch": 3.1218543046357614, "grad_norm": 0.8302557960962095, "learning_rate": 0.0002032771370801915, "loss": 1.5391, "step": 4714 }, { "epoch": 3.122516556291391, "grad_norm": 0.939152866317544, "learning_rate": 0.00020321473620096386, "loss": 1.9062, "step": 4715 }, { "epoch": 3.1231788079470197, "grad_norm": 0.8070751496203296, "learning_rate": 0.00020315232478460837, "loss": 1.4688, "step": 4716 }, { "epoch": 3.123841059602649, "grad_norm": 0.8605305543412525, "learning_rate": 0.00020308990284348315, "loss": 1.6719, "step": 4717 }, { "epoch": 3.124503311258278, "grad_norm": 1.488738921551124, "learning_rate": 0.0002030274703899486, "loss": 1.5703, "step": 4718 }, { "epoch": 3.1251655629139075, "grad_norm": 0.9093921971000104, "learning_rate": 0.00020296502743636692, "loss": 1.7109, "step": 4719 }, { "epoch": 3.1258278145695364, "grad_norm": 0.8938371419431302, "learning_rate": 0.00020290257399510262, "loss": 1.7344, "step": 4720 }, { "epoch": 3.1264900662251653, "grad_norm": 0.8135085818442961, "learning_rate": 0.00020284011007852212, "loss": 1.3984, "step": 4721 }, { "epoch": 3.1271523178807947, "grad_norm": 0.9250476231939163, "learning_rate": 0.00020277763569899403, "loss": 1.7891, "step": 4722 }, { "epoch": 3.1278145695364237, "grad_norm": 1.0292301280420675, "learning_rate": 0.000202715150868889, "loss": 1.7812, "step": 4723 }, { "epoch": 3.128476821192053, "grad_norm": 0.8435942079855326, "learning_rate": 0.0002026526556005798, "loss": 1.4766, "step": 4724 }, { "epoch": 3.129139072847682, "grad_norm": 0.8479999093634796, "learning_rate": 0.00020259014990644107, "loss": 1.4531, "step": 4725 }, { "epoch": 3.1298013245033114, "grad_norm": 0.9081233401945791, "learning_rate": 0.00020252763379884978, "loss": 1.5391, "step": 4726 }, { "epoch": 3.1304635761589403, "grad_norm": 0.8418863332855729, "learning_rate": 0.00020246510729018477, "loss": 1.4297, "step": 4727 }, { "epoch": 3.1311258278145697, "grad_norm": 0.9531005290172077, "learning_rate": 0.00020240257039282698, "loss": 1.6562, "step": 4728 }, { "epoch": 3.1317880794701987, "grad_norm": 0.9498994217447143, "learning_rate": 0.00020234002311915955, "loss": 1.7031, "step": 4729 }, { "epoch": 3.1324503311258276, "grad_norm": 0.9122095478336788, "learning_rate": 0.00020227746548156744, "loss": 1.6016, "step": 4730 }, { "epoch": 3.133112582781457, "grad_norm": 0.8667728389547882, "learning_rate": 0.00020221489749243783, "loss": 1.6562, "step": 4731 }, { "epoch": 3.133774834437086, "grad_norm": 0.8730820325350501, "learning_rate": 0.00020215231916415988, "loss": 1.5625, "step": 4732 }, { "epoch": 3.1344370860927153, "grad_norm": 0.8724884287334662, "learning_rate": 0.00020208973050912487, "loss": 1.5391, "step": 4733 }, { "epoch": 3.1350993377483443, "grad_norm": 0.8976583527234036, "learning_rate": 0.00020202713153972597, "loss": 1.5312, "step": 4734 }, { "epoch": 3.1357615894039736, "grad_norm": 1.0609405924304658, "learning_rate": 0.00020196452226835864, "loss": 2.1406, "step": 4735 }, { "epoch": 3.1364238410596026, "grad_norm": 0.9866013172667113, "learning_rate": 0.00020190190270742006, "loss": 1.9062, "step": 4736 }, { "epoch": 3.137086092715232, "grad_norm": 1.0038374344526333, "learning_rate": 0.00020183927286930974, "loss": 1.8672, "step": 4737 }, { "epoch": 3.137748344370861, "grad_norm": 0.926491647796575, "learning_rate": 0.00020177663276642907, "loss": 1.6641, "step": 4738 }, { "epoch": 3.13841059602649, "grad_norm": 0.9604939795846617, "learning_rate": 0.00020171398241118155, "loss": 1.7812, "step": 4739 }, { "epoch": 3.1390728476821192, "grad_norm": 0.7811555641962808, "learning_rate": 0.00020165132181597256, "loss": 1.4062, "step": 4740 }, { "epoch": 3.139735099337748, "grad_norm": 0.8726268745451699, "learning_rate": 0.00020158865099320975, "loss": 1.625, "step": 4741 }, { "epoch": 3.1403973509933776, "grad_norm": 0.9145235732264867, "learning_rate": 0.00020152596995530255, "loss": 1.7031, "step": 4742 }, { "epoch": 3.1410596026490065, "grad_norm": 0.949005804819896, "learning_rate": 0.0002014632787146626, "loss": 1.9062, "step": 4743 }, { "epoch": 3.141721854304636, "grad_norm": 0.9539903473673941, "learning_rate": 0.0002014005772837035, "loss": 1.8516, "step": 4744 }, { "epoch": 3.142384105960265, "grad_norm": 0.78762212482454, "learning_rate": 0.00020133786567484076, "loss": 1.3125, "step": 4745 }, { "epoch": 3.1430463576158942, "grad_norm": 0.8680657528549294, "learning_rate": 0.00020127514390049203, "loss": 1.5781, "step": 4746 }, { "epoch": 3.143708609271523, "grad_norm": 0.9599630427991712, "learning_rate": 0.000201212411973077, "loss": 1.7188, "step": 4747 }, { "epoch": 3.144370860927152, "grad_norm": 1.0601707812336376, "learning_rate": 0.00020114966990501722, "loss": 1.8125, "step": 4748 }, { "epoch": 3.1450331125827815, "grad_norm": 0.9135483200718882, "learning_rate": 0.00020108691770873646, "loss": 1.5312, "step": 4749 }, { "epoch": 3.1456953642384105, "grad_norm": 0.9371944369931705, "learning_rate": 0.00020102415539666023, "loss": 1.7812, "step": 4750 }, { "epoch": 3.14635761589404, "grad_norm": 0.9380896744405436, "learning_rate": 0.00020096138298121628, "loss": 1.6719, "step": 4751 }, { "epoch": 3.147019867549669, "grad_norm": 1.0001339059853116, "learning_rate": 0.00020089860047483427, "loss": 1.7422, "step": 4752 }, { "epoch": 3.147682119205298, "grad_norm": 0.943004838918222, "learning_rate": 0.00020083580788994582, "loss": 1.5859, "step": 4753 }, { "epoch": 3.148344370860927, "grad_norm": 0.9761495831594614, "learning_rate": 0.00020077300523898458, "loss": 1.7422, "step": 4754 }, { "epoch": 3.1490066225165565, "grad_norm": 0.9450951009824943, "learning_rate": 0.00020071019253438621, "loss": 1.5938, "step": 4755 }, { "epoch": 3.1496688741721854, "grad_norm": 0.9445845698512724, "learning_rate": 0.00020064736978858832, "loss": 1.7109, "step": 4756 }, { "epoch": 3.1503311258278144, "grad_norm": 0.8032985757460158, "learning_rate": 0.00020058453701403062, "loss": 1.3906, "step": 4757 }, { "epoch": 3.1509933774834438, "grad_norm": 0.9568367853722892, "learning_rate": 0.00020052169422315466, "loss": 1.5312, "step": 4758 }, { "epoch": 3.1516556291390727, "grad_norm": 0.9080326510309713, "learning_rate": 0.00020045884142840396, "loss": 1.5469, "step": 4759 }, { "epoch": 3.152317880794702, "grad_norm": 0.9668411272037408, "learning_rate": 0.00020039597864222422, "loss": 1.6953, "step": 4760 }, { "epoch": 3.152980132450331, "grad_norm": 0.951547293110932, "learning_rate": 0.00020033310587706294, "loss": 1.5938, "step": 4761 }, { "epoch": 3.1536423841059604, "grad_norm": 0.8661842339621213, "learning_rate": 0.00020027022314536962, "loss": 1.4453, "step": 4762 }, { "epoch": 3.1543046357615894, "grad_norm": 0.9888373280493069, "learning_rate": 0.0002002073304595958, "loss": 1.6484, "step": 4763 }, { "epoch": 3.1549668874172188, "grad_norm": 0.9665658068523724, "learning_rate": 0.00020014442783219496, "loss": 1.6484, "step": 4764 }, { "epoch": 3.1556291390728477, "grad_norm": 0.9573797818516224, "learning_rate": 0.00020008151527562247, "loss": 1.6719, "step": 4765 }, { "epoch": 3.1562913907284766, "grad_norm": 0.8399673916289535, "learning_rate": 0.00020001859280233583, "loss": 1.3047, "step": 4766 }, { "epoch": 3.156953642384106, "grad_norm": 0.9025927986643074, "learning_rate": 0.00019995566042479437, "loss": 1.5859, "step": 4767 }, { "epoch": 3.157615894039735, "grad_norm": 0.9912661131344919, "learning_rate": 0.00019989271815545943, "loss": 1.7656, "step": 4768 }, { "epoch": 3.1582781456953644, "grad_norm": 0.875713571107221, "learning_rate": 0.00019982976600679424, "loss": 1.4688, "step": 4769 }, { "epoch": 3.1589403973509933, "grad_norm": 0.8986893238647694, "learning_rate": 0.00019976680399126417, "loss": 1.5703, "step": 4770 }, { "epoch": 3.1596026490066227, "grad_norm": 0.9639289276921501, "learning_rate": 0.0001997038321213363, "loss": 1.6641, "step": 4771 }, { "epoch": 3.1602649006622516, "grad_norm": 0.8914560321893221, "learning_rate": 0.00019964085040947988, "loss": 1.6094, "step": 4772 }, { "epoch": 3.160927152317881, "grad_norm": 0.9993083039168085, "learning_rate": 0.00019957785886816592, "loss": 1.7656, "step": 4773 }, { "epoch": 3.16158940397351, "grad_norm": 0.8849672561838005, "learning_rate": 0.00019951485750986753, "loss": 1.5938, "step": 4774 }, { "epoch": 3.162251655629139, "grad_norm": 1.2790459220560648, "learning_rate": 0.00019945184634705967, "loss": 1.8359, "step": 4775 }, { "epoch": 3.1629139072847683, "grad_norm": 0.8416389287272015, "learning_rate": 0.00019938882539221936, "loss": 1.4062, "step": 4776 }, { "epoch": 3.1635761589403972, "grad_norm": 0.9469331724518958, "learning_rate": 0.00019932579465782528, "loss": 1.7344, "step": 4777 }, { "epoch": 3.1642384105960266, "grad_norm": 0.9152258051597071, "learning_rate": 0.00019926275415635847, "loss": 1.6094, "step": 4778 }, { "epoch": 3.1649006622516556, "grad_norm": 0.9925922054764895, "learning_rate": 0.00019919970390030146, "loss": 1.875, "step": 4779 }, { "epoch": 3.165562913907285, "grad_norm": 0.8513714862427627, "learning_rate": 0.00019913664390213909, "loss": 1.6875, "step": 4780 }, { "epoch": 3.166225165562914, "grad_norm": 0.8285205626472368, "learning_rate": 0.00019907357417435788, "loss": 1.4453, "step": 4781 }, { "epoch": 3.1668874172185433, "grad_norm": 0.9512535072851382, "learning_rate": 0.00019901049472944635, "loss": 1.75, "step": 4782 }, { "epoch": 3.167549668874172, "grad_norm": 1.0026635643529767, "learning_rate": 0.00019894740557989496, "loss": 1.8828, "step": 4783 }, { "epoch": 3.168211920529801, "grad_norm": 0.9973596357614611, "learning_rate": 0.0001988843067381962, "loss": 1.8672, "step": 4784 }, { "epoch": 3.1688741721854305, "grad_norm": 1.017454552496059, "learning_rate": 0.00019882119821684416, "loss": 1.7578, "step": 4785 }, { "epoch": 3.1695364238410595, "grad_norm": 0.951756919833561, "learning_rate": 0.00019875808002833522, "loss": 1.8594, "step": 4786 }, { "epoch": 3.170198675496689, "grad_norm": 0.8446962452081238, "learning_rate": 0.0001986949521851674, "loss": 1.625, "step": 4787 }, { "epoch": 3.170860927152318, "grad_norm": 0.9277982495163843, "learning_rate": 0.0001986318146998408, "loss": 1.8125, "step": 4788 }, { "epoch": 3.171523178807947, "grad_norm": 0.8757118669745091, "learning_rate": 0.00019856866758485734, "loss": 1.5391, "step": 4789 }, { "epoch": 3.172185430463576, "grad_norm": 0.9288180630813149, "learning_rate": 0.00019850551085272085, "loss": 1.7109, "step": 4790 }, { "epoch": 3.172847682119205, "grad_norm": 0.9044629273475588, "learning_rate": 0.00019844234451593716, "loss": 1.5859, "step": 4791 }, { "epoch": 3.1735099337748345, "grad_norm": 0.9763842476388541, "learning_rate": 0.00019837916858701386, "loss": 1.6406, "step": 4792 }, { "epoch": 3.1741721854304634, "grad_norm": 0.9899297525287235, "learning_rate": 0.00019831598307846056, "loss": 1.7969, "step": 4793 }, { "epoch": 3.174834437086093, "grad_norm": 0.9008710636987658, "learning_rate": 0.00019825278800278864, "loss": 1.4297, "step": 4794 }, { "epoch": 3.1754966887417218, "grad_norm": 0.9612942292523843, "learning_rate": 0.0001981895833725115, "loss": 1.6562, "step": 4795 }, { "epoch": 3.176158940397351, "grad_norm": 0.9149025571555022, "learning_rate": 0.00019812636920014438, "loss": 1.5547, "step": 4796 }, { "epoch": 3.17682119205298, "grad_norm": 0.9577596693959226, "learning_rate": 0.00019806314549820442, "loss": 1.6172, "step": 4797 }, { "epoch": 3.1774834437086095, "grad_norm": 0.9773002412922633, "learning_rate": 0.0001979999122792106, "loss": 1.6797, "step": 4798 }, { "epoch": 3.1781456953642384, "grad_norm": 0.9553474025727391, "learning_rate": 0.0001979366695556839, "loss": 1.5781, "step": 4799 }, { "epoch": 3.1788079470198674, "grad_norm": 0.9840394769545622, "learning_rate": 0.000197873417340147, "loss": 1.6641, "step": 4800 }, { "epoch": 3.1794701986754967, "grad_norm": 0.8771929524206465, "learning_rate": 0.00019781015564512464, "loss": 1.4922, "step": 4801 }, { "epoch": 3.1801324503311257, "grad_norm": 0.8007004073227335, "learning_rate": 0.00019774688448314336, "loss": 1.3984, "step": 4802 }, { "epoch": 3.180794701986755, "grad_norm": 0.9827267918991061, "learning_rate": 0.0001976836038667315, "loss": 1.9297, "step": 4803 }, { "epoch": 3.181456953642384, "grad_norm": 1.198526836331655, "learning_rate": 0.00019762031380841943, "loss": 1.6797, "step": 4804 }, { "epoch": 3.1821192052980134, "grad_norm": 0.9509696245265932, "learning_rate": 0.0001975570143207393, "loss": 1.7266, "step": 4805 }, { "epoch": 3.1827814569536423, "grad_norm": 0.9104250293045867, "learning_rate": 0.00019749370541622503, "loss": 1.7031, "step": 4806 }, { "epoch": 3.1834437086092717, "grad_norm": 0.8189778202197172, "learning_rate": 0.00019743038710741268, "loss": 1.4844, "step": 4807 }, { "epoch": 3.1841059602649007, "grad_norm": 0.8190698815898578, "learning_rate": 0.00019736705940683985, "loss": 1.5469, "step": 4808 }, { "epoch": 3.1847682119205296, "grad_norm": 0.9366395358577256, "learning_rate": 0.00019730372232704626, "loss": 1.6875, "step": 4809 }, { "epoch": 3.185430463576159, "grad_norm": 0.9400742396935188, "learning_rate": 0.0001972403758805733, "loss": 1.7188, "step": 4810 }, { "epoch": 3.186092715231788, "grad_norm": 0.9493753619356124, "learning_rate": 0.00019717702007996432, "loss": 1.625, "step": 4811 }, { "epoch": 3.1867549668874173, "grad_norm": 0.9398627229211451, "learning_rate": 0.0001971136549377645, "loss": 1.5781, "step": 4812 }, { "epoch": 3.1874172185430463, "grad_norm": 0.8928465578514219, "learning_rate": 0.0001970502804665209, "loss": 1.4922, "step": 4813 }, { "epoch": 3.1880794701986757, "grad_norm": 0.9960723709747267, "learning_rate": 0.0001969868966787823, "loss": 1.6641, "step": 4814 }, { "epoch": 3.1887417218543046, "grad_norm": 1.0131731735292189, "learning_rate": 0.00019692350358709948, "loss": 1.7969, "step": 4815 }, { "epoch": 3.1894039735099335, "grad_norm": 0.9652077287315285, "learning_rate": 0.00019686010120402501, "loss": 1.5, "step": 4816 }, { "epoch": 3.190066225165563, "grad_norm": 1.0335359633173542, "learning_rate": 0.00019679668954211327, "loss": 1.8047, "step": 4817 }, { "epoch": 3.190728476821192, "grad_norm": 1.0330314983797757, "learning_rate": 0.00019673326861392048, "loss": 1.8438, "step": 4818 }, { "epoch": 3.1913907284768213, "grad_norm": 1.0485981242686409, "learning_rate": 0.00019666983843200472, "loss": 2.0156, "step": 4819 }, { "epoch": 3.19205298013245, "grad_norm": 1.017382082816668, "learning_rate": 0.0001966063990089259, "loss": 1.7812, "step": 4820 }, { "epoch": 3.1927152317880796, "grad_norm": 0.7998309235828397, "learning_rate": 0.00019654295035724577, "loss": 1.4844, "step": 4821 }, { "epoch": 3.1933774834437085, "grad_norm": 0.7705291381186155, "learning_rate": 0.0001964794924895279, "loss": 1.3281, "step": 4822 }, { "epoch": 3.194039735099338, "grad_norm": 0.9038680546967081, "learning_rate": 0.00019641602541833759, "loss": 1.4688, "step": 4823 }, { "epoch": 3.194701986754967, "grad_norm": 0.922974383023784, "learning_rate": 0.0001963525491562421, "loss": 1.7969, "step": 4824 }, { "epoch": 3.195364238410596, "grad_norm": 0.9750477134622206, "learning_rate": 0.0001962890637158105, "loss": 1.6953, "step": 4825 }, { "epoch": 3.196026490066225, "grad_norm": 0.9725357180803608, "learning_rate": 0.0001962255691096136, "loss": 1.6953, "step": 4826 }, { "epoch": 3.196688741721854, "grad_norm": 0.9464508723550148, "learning_rate": 0.000196162065350224, "loss": 1.7422, "step": 4827 }, { "epoch": 3.1973509933774835, "grad_norm": 0.9764154860961823, "learning_rate": 0.0001960985524502163, "loss": 1.875, "step": 4828 }, { "epoch": 3.1980132450331125, "grad_norm": 0.9582627945669965, "learning_rate": 0.00019603503042216666, "loss": 1.6797, "step": 4829 }, { "epoch": 3.198675496688742, "grad_norm": 0.9346992952607505, "learning_rate": 0.00019597149927865325, "loss": 1.5859, "step": 4830 }, { "epoch": 3.199337748344371, "grad_norm": 0.9215032683868045, "learning_rate": 0.00019590795903225592, "loss": 1.6406, "step": 4831 }, { "epoch": 3.2, "grad_norm": 0.8530564093005187, "learning_rate": 0.0001958444096955564, "loss": 1.4375, "step": 4832 }, { "epoch": 3.200662251655629, "grad_norm": 0.9649602027120932, "learning_rate": 0.0001957808512811382, "loss": 1.7656, "step": 4833 }, { "epoch": 3.201324503311258, "grad_norm": 0.8746722212516136, "learning_rate": 0.00019571728380158652, "loss": 1.4844, "step": 4834 }, { "epoch": 3.2019867549668874, "grad_norm": 1.1491697946409087, "learning_rate": 0.0001956537072694885, "loss": 1.7109, "step": 4835 }, { "epoch": 3.2026490066225164, "grad_norm": 0.9876531246334254, "learning_rate": 0.00019559012169743315, "loss": 1.5703, "step": 4836 }, { "epoch": 3.203311258278146, "grad_norm": 0.9253618621675296, "learning_rate": 0.00019552652709801092, "loss": 1.5, "step": 4837 }, { "epoch": 3.2039735099337747, "grad_norm": 0.9842876306317241, "learning_rate": 0.00019546292348381444, "loss": 1.8359, "step": 4838 }, { "epoch": 3.204635761589404, "grad_norm": 0.9477480348255984, "learning_rate": 0.0001953993108674379, "loss": 1.6562, "step": 4839 }, { "epoch": 3.205298013245033, "grad_norm": 0.9077160498000002, "learning_rate": 0.00019533568926147728, "loss": 1.5781, "step": 4840 }, { "epoch": 3.2059602649006624, "grad_norm": 0.9533461266022754, "learning_rate": 0.00019527205867853043, "loss": 1.7109, "step": 4841 }, { "epoch": 3.2066225165562914, "grad_norm": 7.053061669844967, "learning_rate": 0.000195208419131197, "loss": 1.5078, "step": 4842 }, { "epoch": 3.2072847682119203, "grad_norm": 0.9584480507770043, "learning_rate": 0.0001951447706320782, "loss": 1.7656, "step": 4843 }, { "epoch": 3.2079470198675497, "grad_norm": 0.9185393859922065, "learning_rate": 0.0001950811131937773, "loss": 1.6484, "step": 4844 }, { "epoch": 3.2086092715231787, "grad_norm": 0.9244197866272448, "learning_rate": 0.00019501744682889915, "loss": 1.5391, "step": 4845 }, { "epoch": 3.209271523178808, "grad_norm": 0.865482180345394, "learning_rate": 0.0001949537715500504, "loss": 1.6172, "step": 4846 }, { "epoch": 3.209933774834437, "grad_norm": 0.7972364654115054, "learning_rate": 0.00019489008736983953, "loss": 1.4375, "step": 4847 }, { "epoch": 3.2105960264900664, "grad_norm": 0.9292399844848587, "learning_rate": 0.0001948263943008767, "loss": 1.6562, "step": 4848 }, { "epoch": 3.2112582781456953, "grad_norm": 0.9460124097628286, "learning_rate": 0.00019476269235577388, "loss": 1.6484, "step": 4849 }, { "epoch": 3.2119205298013247, "grad_norm": 1.0260904401138844, "learning_rate": 0.00019469898154714474, "loss": 1.8984, "step": 4850 }, { "epoch": 3.2125827814569536, "grad_norm": 0.9544954952010148, "learning_rate": 0.00019463526188760483, "loss": 1.7188, "step": 4851 }, { "epoch": 3.2132450331125826, "grad_norm": 0.9408366216465428, "learning_rate": 0.00019457153338977136, "loss": 1.6797, "step": 4852 }, { "epoch": 3.213907284768212, "grad_norm": 0.952872436657239, "learning_rate": 0.00019450779606626327, "loss": 1.5234, "step": 4853 }, { "epoch": 3.214569536423841, "grad_norm": 0.9296375596840306, "learning_rate": 0.0001944440499297013, "loss": 1.6797, "step": 4854 }, { "epoch": 3.2152317880794703, "grad_norm": 1.1188789260980818, "learning_rate": 0.0001943802949927079, "loss": 1.8672, "step": 4855 }, { "epoch": 3.2158940397350992, "grad_norm": 1.0000600783203344, "learning_rate": 0.00019431653126790726, "loss": 1.6172, "step": 4856 }, { "epoch": 3.2165562913907286, "grad_norm": 0.8707557676502273, "learning_rate": 0.00019425275876792545, "loss": 1.5156, "step": 4857 }, { "epoch": 3.2172185430463576, "grad_norm": 0.9132577220630183, "learning_rate": 0.00019418897750538998, "loss": 1.4766, "step": 4858 }, { "epoch": 3.217880794701987, "grad_norm": 0.9260817991994212, "learning_rate": 0.00019412518749293038, "loss": 1.6562, "step": 4859 }, { "epoch": 3.218543046357616, "grad_norm": 0.9708688863840168, "learning_rate": 0.00019406138874317774, "loss": 1.7891, "step": 4860 }, { "epoch": 3.219205298013245, "grad_norm": 1.041455062859568, "learning_rate": 0.00019399758126876503, "loss": 1.8359, "step": 4861 }, { "epoch": 3.2198675496688742, "grad_norm": 0.8751066218062601, "learning_rate": 0.00019393376508232679, "loss": 1.5078, "step": 4862 }, { "epoch": 3.220529801324503, "grad_norm": 0.8636007075942181, "learning_rate": 0.0001938699401964994, "loss": 1.4141, "step": 4863 }, { "epoch": 3.2211920529801326, "grad_norm": 1.027546781883396, "learning_rate": 0.00019380610662392085, "loss": 1.7578, "step": 4864 }, { "epoch": 3.2218543046357615, "grad_norm": 0.868202554367135, "learning_rate": 0.00019374226437723105, "loss": 1.5312, "step": 4865 }, { "epoch": 3.222516556291391, "grad_norm": 1.055183901110289, "learning_rate": 0.00019367841346907134, "loss": 1.8047, "step": 4866 }, { "epoch": 3.22317880794702, "grad_norm": 1.018924114009745, "learning_rate": 0.00019361455391208503, "loss": 1.7656, "step": 4867 }, { "epoch": 3.223841059602649, "grad_norm": 1.0403146046277116, "learning_rate": 0.00019355068571891702, "loss": 2.0156, "step": 4868 }, { "epoch": 3.224503311258278, "grad_norm": 0.9036726133713865, "learning_rate": 0.00019348680890221397, "loss": 1.5547, "step": 4869 }, { "epoch": 3.225165562913907, "grad_norm": 0.8921035589227718, "learning_rate": 0.00019342292347462414, "loss": 1.7031, "step": 4870 }, { "epoch": 3.2258278145695365, "grad_norm": 0.9171277730736541, "learning_rate": 0.00019335902944879772, "loss": 1.8438, "step": 4871 }, { "epoch": 3.2264900662251654, "grad_norm": 0.9393102022935743, "learning_rate": 0.0001932951268373863, "loss": 2.0469, "step": 4872 }, { "epoch": 3.227152317880795, "grad_norm": 0.8816303229639194, "learning_rate": 0.00019323121565304342, "loss": 1.75, "step": 4873 }, { "epoch": 3.2278145695364238, "grad_norm": 0.9452546115838456, "learning_rate": 0.00019316729590842423, "loss": 2.0, "step": 4874 }, { "epoch": 3.228476821192053, "grad_norm": 0.7631561615764948, "learning_rate": 0.00019310336761618554, "loss": 1.4609, "step": 4875 }, { "epoch": 3.229139072847682, "grad_norm": 0.816914132084238, "learning_rate": 0.0001930394307889859, "loss": 1.6094, "step": 4876 }, { "epoch": 3.2298013245033115, "grad_norm": 0.8185009443276723, "learning_rate": 0.00019297548543948555, "loss": 1.6953, "step": 4877 }, { "epoch": 3.2304635761589404, "grad_norm": 0.9336564258908694, "learning_rate": 0.00019291153158034635, "loss": 1.9766, "step": 4878 }, { "epoch": 3.2311258278145694, "grad_norm": 0.9364367756976485, "learning_rate": 0.00019284756922423196, "loss": 1.6875, "step": 4879 }, { "epoch": 3.2317880794701987, "grad_norm": 0.9525706359250368, "learning_rate": 0.00019278359838380762, "loss": 1.8516, "step": 4880 }, { "epoch": 3.2324503311258277, "grad_norm": 0.9759224580732908, "learning_rate": 0.00019271961907174032, "loss": 1.9219, "step": 4881 }, { "epoch": 3.233112582781457, "grad_norm": 0.9100532853953787, "learning_rate": 0.00019265563130069867, "loss": 1.7344, "step": 4882 }, { "epoch": 3.233774834437086, "grad_norm": 0.8145191419596924, "learning_rate": 0.00019259163508335297, "loss": 1.4453, "step": 4883 }, { "epoch": 3.2344370860927154, "grad_norm": 0.8990821578691979, "learning_rate": 0.00019252763043237526, "loss": 1.4922, "step": 4884 }, { "epoch": 3.2350993377483444, "grad_norm": 1.0125083867996316, "learning_rate": 0.0001924636173604391, "loss": 1.9688, "step": 4885 }, { "epoch": 3.2357615894039737, "grad_norm": 1.0307807269538847, "learning_rate": 0.00019239959588021998, "loss": 1.8828, "step": 4886 }, { "epoch": 3.2364238410596027, "grad_norm": 1.0119292700872482, "learning_rate": 0.00019233556600439466, "loss": 1.8281, "step": 4887 }, { "epoch": 3.2370860927152316, "grad_norm": 0.9640582265653731, "learning_rate": 0.000192271527745642, "loss": 1.75, "step": 4888 }, { "epoch": 3.237748344370861, "grad_norm": 0.9785363733124639, "learning_rate": 0.00019220748111664217, "loss": 1.7422, "step": 4889 }, { "epoch": 3.23841059602649, "grad_norm": 0.8596427652146855, "learning_rate": 0.00019214342613007722, "loss": 1.6172, "step": 4890 }, { "epoch": 3.2390728476821193, "grad_norm": 0.888870099348229, "learning_rate": 0.0001920793627986307, "loss": 1.7109, "step": 4891 }, { "epoch": 3.2397350993377483, "grad_norm": 0.9808046427816547, "learning_rate": 0.00019201529113498797, "loss": 1.6719, "step": 4892 }, { "epoch": 3.2403973509933777, "grad_norm": 0.8934102956514723, "learning_rate": 0.00019195121115183584, "loss": 1.7812, "step": 4893 }, { "epoch": 3.2410596026490066, "grad_norm": 0.9304499820236418, "learning_rate": 0.00019188712286186303, "loss": 1.8359, "step": 4894 }, { "epoch": 3.241721854304636, "grad_norm": 0.9409039542920326, "learning_rate": 0.0001918230262777596, "loss": 1.6719, "step": 4895 }, { "epoch": 3.242384105960265, "grad_norm": 0.9542723786867001, "learning_rate": 0.00019175892141221748, "loss": 1.7891, "step": 4896 }, { "epoch": 3.243046357615894, "grad_norm": 0.9535822380521166, "learning_rate": 0.00019169480827793022, "loss": 1.7266, "step": 4897 }, { "epoch": 3.2437086092715233, "grad_norm": 0.9435524082120601, "learning_rate": 0.0001916306868875929, "loss": 1.7188, "step": 4898 }, { "epoch": 3.244370860927152, "grad_norm": 0.9908033868567754, "learning_rate": 0.00019156655725390226, "loss": 2.0156, "step": 4899 }, { "epoch": 3.2450331125827816, "grad_norm": 1.0063589420231038, "learning_rate": 0.00019150241938955677, "loss": 1.8594, "step": 4900 }, { "epoch": 3.2456953642384105, "grad_norm": 0.8974342434456295, "learning_rate": 0.0001914382733072564, "loss": 1.6094, "step": 4901 }, { "epoch": 3.24635761589404, "grad_norm": 0.9373137441108849, "learning_rate": 0.00019137411901970288, "loss": 1.9297, "step": 4902 }, { "epoch": 3.247019867549669, "grad_norm": 0.9536229261645489, "learning_rate": 0.00019130995653959942, "loss": 1.7578, "step": 4903 }, { "epoch": 3.247682119205298, "grad_norm": 0.9185011624949272, "learning_rate": 0.000191245785879651, "loss": 1.7422, "step": 4904 }, { "epoch": 3.248344370860927, "grad_norm": 0.7804035594968938, "learning_rate": 0.00019118160705256407, "loss": 1.3281, "step": 4905 }, { "epoch": 3.249006622516556, "grad_norm": 0.8481157369974262, "learning_rate": 0.00019111742007104686, "loss": 1.4688, "step": 4906 }, { "epoch": 3.2496688741721855, "grad_norm": 0.885660031580941, "learning_rate": 0.000191053224947809, "loss": 1.4844, "step": 4907 }, { "epoch": 3.2503311258278145, "grad_norm": 1.0517835858787712, "learning_rate": 0.000190989021695562, "loss": 1.8672, "step": 4908 }, { "epoch": 3.250993377483444, "grad_norm": 0.8942583848239999, "learning_rate": 0.0001909248103270188, "loss": 1.6094, "step": 4909 }, { "epoch": 3.251655629139073, "grad_norm": 0.9875719823218626, "learning_rate": 0.00019086059085489393, "loss": 1.7266, "step": 4910 }, { "epoch": 3.2523178807947017, "grad_norm": 0.9240386474487559, "learning_rate": 0.0001907963632919036, "loss": 1.7266, "step": 4911 }, { "epoch": 3.252980132450331, "grad_norm": 0.8921565048279185, "learning_rate": 0.00019073212765076565, "loss": 1.6094, "step": 4912 }, { "epoch": 3.25364238410596, "grad_norm": 0.9130339713731725, "learning_rate": 0.0001906678839441995, "loss": 1.6406, "step": 4913 }, { "epoch": 3.2543046357615895, "grad_norm": 1.0209129976535385, "learning_rate": 0.00019060363218492598, "loss": 1.8906, "step": 4914 }, { "epoch": 3.2549668874172184, "grad_norm": 0.9560073960212097, "learning_rate": 0.0001905393723856679, "loss": 1.6641, "step": 4915 }, { "epoch": 3.255629139072848, "grad_norm": 1.0409209189966806, "learning_rate": 0.0001904751045591492, "loss": 1.875, "step": 4916 }, { "epoch": 3.2562913907284767, "grad_norm": 0.8815186191235089, "learning_rate": 0.00019041082871809582, "loss": 1.625, "step": 4917 }, { "epoch": 3.256953642384106, "grad_norm": 0.9658119113973413, "learning_rate": 0.00019034654487523508, "loss": 1.625, "step": 4918 }, { "epoch": 3.257615894039735, "grad_norm": 0.9434182707562291, "learning_rate": 0.00019028225304329588, "loss": 1.7812, "step": 4919 }, { "epoch": 3.258278145695364, "grad_norm": 0.9784490209456781, "learning_rate": 0.00019021795323500873, "loss": 1.7656, "step": 4920 }, { "epoch": 3.2589403973509934, "grad_norm": 0.7987677486872828, "learning_rate": 0.0001901536454631058, "loss": 1.2734, "step": 4921 }, { "epoch": 3.2596026490066223, "grad_norm": 1.2162966490012375, "learning_rate": 0.00019008932974032066, "loss": 1.5078, "step": 4922 }, { "epoch": 3.2602649006622517, "grad_norm": 1.0779355197509937, "learning_rate": 0.0001900250060793887, "loss": 1.8359, "step": 4923 }, { "epoch": 3.2609271523178807, "grad_norm": 1.0704686680880526, "learning_rate": 0.0001899606744930466, "loss": 1.8984, "step": 4924 }, { "epoch": 3.26158940397351, "grad_norm": 0.9026616374879265, "learning_rate": 0.00018989633499403284, "loss": 1.625, "step": 4925 }, { "epoch": 3.262251655629139, "grad_norm": 1.049832751285563, "learning_rate": 0.00018983198759508737, "loss": 1.75, "step": 4926 }, { "epoch": 3.2629139072847684, "grad_norm": 0.8690844459982967, "learning_rate": 0.00018976763230895167, "loss": 1.4219, "step": 4927 }, { "epoch": 3.2635761589403973, "grad_norm": 0.9560964681118392, "learning_rate": 0.00018970326914836884, "loss": 1.5859, "step": 4928 }, { "epoch": 3.2642384105960263, "grad_norm": 0.9878997188060116, "learning_rate": 0.0001896388981260836, "loss": 1.8047, "step": 4929 }, { "epoch": 3.2649006622516556, "grad_norm": 0.9672244983198531, "learning_rate": 0.00018957451925484203, "loss": 1.6797, "step": 4930 }, { "epoch": 3.2655629139072846, "grad_norm": 0.9635703770644034, "learning_rate": 0.00018951013254739195, "loss": 1.8047, "step": 4931 }, { "epoch": 3.266225165562914, "grad_norm": 0.9019249306808154, "learning_rate": 0.00018944573801648267, "loss": 1.625, "step": 4932 }, { "epoch": 3.266887417218543, "grad_norm": 1.025381551015555, "learning_rate": 0.00018938133567486501, "loss": 1.8516, "step": 4933 }, { "epoch": 3.2675496688741723, "grad_norm": 1.003662708592068, "learning_rate": 0.00018931692553529142, "loss": 1.8594, "step": 4934 }, { "epoch": 3.2682119205298013, "grad_norm": 1.0074175809342523, "learning_rate": 0.00018925250761051582, "loss": 1.8672, "step": 4935 }, { "epoch": 3.2688741721854306, "grad_norm": 0.8485323414378746, "learning_rate": 0.00018918808191329366, "loss": 1.4922, "step": 4936 }, { "epoch": 3.2695364238410596, "grad_norm": 1.02746066209781, "learning_rate": 0.00018912364845638204, "loss": 2.0156, "step": 4937 }, { "epoch": 3.2701986754966885, "grad_norm": 1.036354689109393, "learning_rate": 0.0001890592072525395, "loss": 2.0156, "step": 4938 }, { "epoch": 3.270860927152318, "grad_norm": 0.8701163608368833, "learning_rate": 0.0001889947583145261, "loss": 1.6094, "step": 4939 }, { "epoch": 3.271523178807947, "grad_norm": 0.8703096632149003, "learning_rate": 0.00018893030165510353, "loss": 1.5312, "step": 4940 }, { "epoch": 3.2721854304635762, "grad_norm": 0.9407916867551112, "learning_rate": 0.00018886583728703487, "loss": 1.7109, "step": 4941 }, { "epoch": 3.272847682119205, "grad_norm": 0.9323798438595865, "learning_rate": 0.00018880136522308484, "loss": 1.6719, "step": 4942 }, { "epoch": 3.2735099337748346, "grad_norm": 0.9184087167045276, "learning_rate": 0.00018873688547601966, "loss": 1.5391, "step": 4943 }, { "epoch": 3.2741721854304635, "grad_norm": 0.8492989005262241, "learning_rate": 0.00018867239805860707, "loss": 1.4375, "step": 4944 }, { "epoch": 3.274834437086093, "grad_norm": 1.0555789870980972, "learning_rate": 0.0001886079029836163, "loss": 1.8281, "step": 4945 }, { "epoch": 3.275496688741722, "grad_norm": 0.952399223300368, "learning_rate": 0.00018854340026381808, "loss": 1.5859, "step": 4946 }, { "epoch": 3.276158940397351, "grad_norm": 0.9521886941476095, "learning_rate": 0.00018847888991198477, "loss": 1.6406, "step": 4947 }, { "epoch": 3.27682119205298, "grad_norm": 1.0454151990248994, "learning_rate": 0.00018841437194089006, "loss": 1.8281, "step": 4948 }, { "epoch": 3.277483443708609, "grad_norm": 1.0383014133253852, "learning_rate": 0.00018834984636330936, "loss": 1.8281, "step": 4949 }, { "epoch": 3.2781456953642385, "grad_norm": 0.9871398546157205, "learning_rate": 0.0001882853131920194, "loss": 1.7734, "step": 4950 }, { "epoch": 3.2788079470198674, "grad_norm": 0.8235358582185949, "learning_rate": 0.00018822077243979844, "loss": 1.4375, "step": 4951 }, { "epoch": 3.279470198675497, "grad_norm": 0.8652282082550177, "learning_rate": 0.0001881562241194264, "loss": 1.5234, "step": 4952 }, { "epoch": 3.2801324503311258, "grad_norm": 0.8796517015548668, "learning_rate": 0.00018809166824368452, "loss": 1.5938, "step": 4953 }, { "epoch": 3.280794701986755, "grad_norm": 0.8918248184392334, "learning_rate": 0.00018802710482535562, "loss": 1.6484, "step": 4954 }, { "epoch": 3.281456953642384, "grad_norm": 0.9027303931693318, "learning_rate": 0.00018796253387722402, "loss": 1.6406, "step": 4955 }, { "epoch": 3.282119205298013, "grad_norm": 0.9102768669930302, "learning_rate": 0.00018789795541207548, "loss": 1.7109, "step": 4956 }, { "epoch": 3.2827814569536424, "grad_norm": 0.8809029837344639, "learning_rate": 0.00018783336944269723, "loss": 1.5469, "step": 4957 }, { "epoch": 3.2834437086092714, "grad_norm": 0.8798101989204969, "learning_rate": 0.00018776877598187817, "loss": 1.6016, "step": 4958 }, { "epoch": 3.2841059602649008, "grad_norm": 0.9967923148959271, "learning_rate": 0.0001877041750424084, "loss": 1.8438, "step": 4959 }, { "epoch": 3.2847682119205297, "grad_norm": 0.9344468974112051, "learning_rate": 0.00018763956663707976, "loss": 1.8203, "step": 4960 }, { "epoch": 3.285430463576159, "grad_norm": 1.0223131026847796, "learning_rate": 0.00018757495077868536, "loss": 1.8203, "step": 4961 }, { "epoch": 3.286092715231788, "grad_norm": 0.943235459345669, "learning_rate": 0.0001875103274800199, "loss": 1.5781, "step": 4962 }, { "epoch": 3.2867549668874174, "grad_norm": 0.8851655098542037, "learning_rate": 0.0001874456967538796, "loss": 1.5625, "step": 4963 }, { "epoch": 3.2874172185430464, "grad_norm": 0.9680180748877525, "learning_rate": 0.00018738105861306208, "loss": 1.5312, "step": 4964 }, { "epoch": 3.2880794701986753, "grad_norm": 0.9269914596287598, "learning_rate": 0.00018731641307036628, "loss": 1.7188, "step": 4965 }, { "epoch": 3.2887417218543047, "grad_norm": 0.8303317714731553, "learning_rate": 0.00018725176013859295, "loss": 1.4062, "step": 4966 }, { "epoch": 3.2894039735099336, "grad_norm": 0.8663931058472737, "learning_rate": 0.00018718709983054404, "loss": 1.5156, "step": 4967 }, { "epoch": 3.290066225165563, "grad_norm": 0.903862199090456, "learning_rate": 0.000187122432159023, "loss": 1.5234, "step": 4968 }, { "epoch": 3.290728476821192, "grad_norm": 0.9895259617786025, "learning_rate": 0.00018705775713683483, "loss": 1.7188, "step": 4969 }, { "epoch": 3.2913907284768213, "grad_norm": 0.9833364379227905, "learning_rate": 0.00018699307477678588, "loss": 1.7188, "step": 4970 }, { "epoch": 3.2920529801324503, "grad_norm": 0.940702014814214, "learning_rate": 0.000186928385091684, "loss": 1.75, "step": 4971 }, { "epoch": 3.2927152317880797, "grad_norm": 0.9986038652067661, "learning_rate": 0.0001868636880943385, "loss": 1.6094, "step": 4972 }, { "epoch": 3.2933774834437086, "grad_norm": 0.9622274174238085, "learning_rate": 0.00018679898379756014, "loss": 1.6328, "step": 4973 }, { "epoch": 3.2940397350993376, "grad_norm": 1.042295730814121, "learning_rate": 0.0001867342722141611, "loss": 1.8125, "step": 4974 }, { "epoch": 3.294701986754967, "grad_norm": 1.0940875053193069, "learning_rate": 0.00018666955335695502, "loss": 2.0156, "step": 4975 }, { "epoch": 3.295364238410596, "grad_norm": 1.0595773742522536, "learning_rate": 0.00018660482723875698, "loss": 1.8281, "step": 4976 }, { "epoch": 3.2960264900662253, "grad_norm": 0.8607914897807912, "learning_rate": 0.00018654009387238349, "loss": 1.5078, "step": 4977 }, { "epoch": 3.296688741721854, "grad_norm": 0.9447447461810932, "learning_rate": 0.00018647535327065247, "loss": 1.7031, "step": 4978 }, { "epoch": 3.2973509933774836, "grad_norm": 0.9803853128274533, "learning_rate": 0.00018641060544638332, "loss": 1.8125, "step": 4979 }, { "epoch": 3.2980132450331126, "grad_norm": 0.9776586241238924, "learning_rate": 0.00018634585041239684, "loss": 1.8281, "step": 4980 }, { "epoch": 3.298675496688742, "grad_norm": 0.8806533123957256, "learning_rate": 0.0001862810881815154, "loss": 1.8203, "step": 4981 }, { "epoch": 3.299337748344371, "grad_norm": 0.821539990769179, "learning_rate": 0.00018621631876656242, "loss": 1.375, "step": 4982 }, { "epoch": 3.3, "grad_norm": 0.954521860008864, "learning_rate": 0.00018615154218036317, "loss": 1.8984, "step": 4983 }, { "epoch": 3.300662251655629, "grad_norm": 0.9274993237580643, "learning_rate": 0.00018608675843574413, "loss": 1.7891, "step": 4984 }, { "epoch": 3.301324503311258, "grad_norm": 0.8524896713381565, "learning_rate": 0.00018602196754553314, "loss": 1.5625, "step": 4985 }, { "epoch": 3.3019867549668875, "grad_norm": 0.8518998129475894, "learning_rate": 0.0001859571695225596, "loss": 1.4844, "step": 4986 }, { "epoch": 3.3026490066225165, "grad_norm": 0.985641121847216, "learning_rate": 0.00018589236437965435, "loss": 1.8516, "step": 4987 }, { "epoch": 3.303311258278146, "grad_norm": 0.9162685674783501, "learning_rate": 0.00018582755212964935, "loss": 1.6641, "step": 4988 }, { "epoch": 3.303973509933775, "grad_norm": 0.8836736930529665, "learning_rate": 0.00018576273278537837, "loss": 1.6484, "step": 4989 }, { "epoch": 3.304635761589404, "grad_norm": 0.7997712518198936, "learning_rate": 0.00018569790635967624, "loss": 1.3672, "step": 4990 }, { "epoch": 3.305298013245033, "grad_norm": 0.9483711102590494, "learning_rate": 0.0001856330728653794, "loss": 1.5078, "step": 4991 }, { "epoch": 3.305960264900662, "grad_norm": 0.8960855638603586, "learning_rate": 0.00018556823231532556, "loss": 1.625, "step": 4992 }, { "epoch": 3.3066225165562915, "grad_norm": 1.0905356204584076, "learning_rate": 0.00018550338472235407, "loss": 1.8984, "step": 4993 }, { "epoch": 3.3072847682119204, "grad_norm": 1.049593545490903, "learning_rate": 0.00018543853009930525, "loss": 1.7109, "step": 4994 }, { "epoch": 3.30794701986755, "grad_norm": 0.9577978630492061, "learning_rate": 0.00018537366845902125, "loss": 1.7344, "step": 4995 }, { "epoch": 3.3086092715231787, "grad_norm": 0.9651349669733016, "learning_rate": 0.0001853087998143453, "loss": 1.7266, "step": 4996 }, { "epoch": 3.309271523178808, "grad_norm": 0.9237230639236741, "learning_rate": 0.00018524392417812222, "loss": 1.7109, "step": 4997 }, { "epoch": 3.309933774834437, "grad_norm": 0.9145469599746658, "learning_rate": 0.00018517904156319808, "loss": 1.6641, "step": 4998 }, { "epoch": 3.3105960264900665, "grad_norm": 0.9335655630951304, "learning_rate": 0.0001851141519824204, "loss": 1.7812, "step": 4999 }, { "epoch": 3.3112582781456954, "grad_norm": 0.9339079484818458, "learning_rate": 0.00018504925544863806, "loss": 1.75, "step": 5000 }, { "epoch": 3.3119205298013243, "grad_norm": 0.9574596902728908, "learning_rate": 0.0001849843519747013, "loss": 1.8047, "step": 5001 }, { "epoch": 3.3125827814569537, "grad_norm": 0.9622339369788938, "learning_rate": 0.0001849194415734618, "loss": 1.8281, "step": 5002 }, { "epoch": 3.3132450331125827, "grad_norm": 1.0108825522391516, "learning_rate": 0.0001848545242577725, "loss": 1.8047, "step": 5003 }, { "epoch": 3.313907284768212, "grad_norm": 0.9059615359449151, "learning_rate": 0.00018478960004048785, "loss": 1.7266, "step": 5004 }, { "epoch": 3.314569536423841, "grad_norm": 0.9031193684197732, "learning_rate": 0.00018472466893446347, "loss": 1.7109, "step": 5005 }, { "epoch": 3.3152317880794704, "grad_norm": 0.8582760031650488, "learning_rate": 0.00018465973095255656, "loss": 1.5547, "step": 5006 }, { "epoch": 3.3158940397350993, "grad_norm": 0.8941598387946792, "learning_rate": 0.00018459478610762558, "loss": 1.5312, "step": 5007 }, { "epoch": 3.3165562913907287, "grad_norm": 0.9574608327299471, "learning_rate": 0.00018452983441253034, "loss": 1.8281, "step": 5008 }, { "epoch": 3.3172185430463577, "grad_norm": 0.8337914799636005, "learning_rate": 0.000184464875880132, "loss": 1.5391, "step": 5009 }, { "epoch": 3.3178807947019866, "grad_norm": 0.9120848933452215, "learning_rate": 0.00018439991052329315, "loss": 1.6953, "step": 5010 }, { "epoch": 3.318543046357616, "grad_norm": 0.9481178581214507, "learning_rate": 0.00018433493835487755, "loss": 1.6484, "step": 5011 }, { "epoch": 3.319205298013245, "grad_norm": 0.880103436228372, "learning_rate": 0.00018426995938775057, "loss": 1.4844, "step": 5012 }, { "epoch": 3.3198675496688743, "grad_norm": 0.9941958887331018, "learning_rate": 0.00018420497363477874, "loss": 1.6875, "step": 5013 }, { "epoch": 3.3205298013245033, "grad_norm": 0.9811146122730885, "learning_rate": 0.00018413998110882997, "loss": 1.6328, "step": 5014 }, { "epoch": 3.321192052980132, "grad_norm": 0.8690144622538178, "learning_rate": 0.0001840749818227735, "loss": 1.5469, "step": 5015 }, { "epoch": 3.3218543046357616, "grad_norm": 1.0421103787115655, "learning_rate": 0.00018400997578948004, "loss": 1.7188, "step": 5016 }, { "epoch": 3.322516556291391, "grad_norm": 0.9776795557393647, "learning_rate": 0.00018394496302182143, "loss": 1.7344, "step": 5017 }, { "epoch": 3.32317880794702, "grad_norm": 0.8167499405481764, "learning_rate": 0.00018387994353267096, "loss": 1.3828, "step": 5018 }, { "epoch": 3.323841059602649, "grad_norm": 1.0179867303789103, "learning_rate": 0.00018381491733490332, "loss": 1.7422, "step": 5019 }, { "epoch": 3.3245033112582782, "grad_norm": 0.8771750048790115, "learning_rate": 0.00018374988444139431, "loss": 1.5156, "step": 5020 }, { "epoch": 3.325165562913907, "grad_norm": 1.0817401158917053, "learning_rate": 0.00018368484486502124, "loss": 2.1094, "step": 5021 }, { "epoch": 3.3258278145695366, "grad_norm": 0.9188472060843526, "learning_rate": 0.00018361979861866278, "loss": 1.8125, "step": 5022 }, { "epoch": 3.3264900662251655, "grad_norm": 0.8255974302938327, "learning_rate": 0.00018355474571519868, "loss": 1.5234, "step": 5023 }, { "epoch": 3.3271523178807945, "grad_norm": 0.88238951083261, "learning_rate": 0.00018348968616751024, "loss": 1.6094, "step": 5024 }, { "epoch": 3.327814569536424, "grad_norm": 0.8026687114985431, "learning_rate": 0.00018342461998848003, "loss": 1.4609, "step": 5025 }, { "epoch": 3.328476821192053, "grad_norm": 1.1027280149205263, "learning_rate": 0.0001833595471909918, "loss": 2.1562, "step": 5026 }, { "epoch": 3.329139072847682, "grad_norm": 0.9345685778614579, "learning_rate": 0.0001832944677879308, "loss": 1.6094, "step": 5027 }, { "epoch": 3.329801324503311, "grad_norm": 0.9346015360278925, "learning_rate": 0.0001832293817921835, "loss": 1.8672, "step": 5028 }, { "epoch": 3.3304635761589405, "grad_norm": 0.9400707482711643, "learning_rate": 0.00018316428921663763, "loss": 1.8359, "step": 5029 }, { "epoch": 3.3311258278145695, "grad_norm": 0.987169996816523, "learning_rate": 0.0001830991900741823, "loss": 2.0625, "step": 5030 }, { "epoch": 3.331788079470199, "grad_norm": 0.9834782079646976, "learning_rate": 0.00018303408437770783, "loss": 1.8047, "step": 5031 }, { "epoch": 3.332450331125828, "grad_norm": 0.9914640845494984, "learning_rate": 0.00018296897214010595, "loss": 1.8203, "step": 5032 }, { "epoch": 3.3331125827814567, "grad_norm": 0.8976719812447882, "learning_rate": 0.0001829038533742696, "loss": 1.6016, "step": 5033 }, { "epoch": 3.333774834437086, "grad_norm": 0.7800758534756888, "learning_rate": 0.00018283872809309303, "loss": 1.3984, "step": 5034 }, { "epoch": 3.334437086092715, "grad_norm": 0.8929383388945731, "learning_rate": 0.00018277359630947186, "loss": 1.6797, "step": 5035 }, { "epoch": 3.3350993377483444, "grad_norm": 0.9837256524164946, "learning_rate": 0.00018270845803630286, "loss": 1.7109, "step": 5036 }, { "epoch": 3.3357615894039734, "grad_norm": 0.9833429636484584, "learning_rate": 0.00018264331328648415, "loss": 1.8359, "step": 5037 }, { "epoch": 3.3364238410596028, "grad_norm": 0.9992347770867407, "learning_rate": 0.0001825781620729152, "loss": 1.6172, "step": 5038 }, { "epoch": 3.3370860927152317, "grad_norm": 0.992486990087195, "learning_rate": 0.00018251300440849666, "loss": 1.7109, "step": 5039 }, { "epoch": 3.337748344370861, "grad_norm": 0.948886943612022, "learning_rate": 0.0001824478403061305, "loss": 1.6641, "step": 5040 }, { "epoch": 3.33841059602649, "grad_norm": 0.8743937954867568, "learning_rate": 0.00018238266977871996, "loss": 1.3516, "step": 5041 }, { "epoch": 3.339072847682119, "grad_norm": 0.948388915531691, "learning_rate": 0.00018231749283916952, "loss": 1.5547, "step": 5042 }, { "epoch": 3.3397350993377484, "grad_norm": 1.084187145765075, "learning_rate": 0.00018225230950038503, "loss": 1.8516, "step": 5043 }, { "epoch": 3.3403973509933773, "grad_norm": 0.9419551206500083, "learning_rate": 0.00018218711977527346, "loss": 1.5859, "step": 5044 }, { "epoch": 3.3410596026490067, "grad_norm": 1.0638050780973562, "learning_rate": 0.0001821219236767432, "loss": 1.9609, "step": 5045 }, { "epoch": 3.3417218543046356, "grad_norm": 0.9221734484489416, "learning_rate": 0.00018205672121770372, "loss": 1.7344, "step": 5046 }, { "epoch": 3.342384105960265, "grad_norm": 0.8810407645169892, "learning_rate": 0.00018199151241106597, "loss": 1.4453, "step": 5047 }, { "epoch": 3.343046357615894, "grad_norm": 1.031099908174573, "learning_rate": 0.000181926297269742, "loss": 1.8438, "step": 5048 }, { "epoch": 3.3437086092715234, "grad_norm": 0.8860284071959196, "learning_rate": 0.00018186107580664515, "loss": 1.5859, "step": 5049 }, { "epoch": 3.3443708609271523, "grad_norm": 0.9649460197630083, "learning_rate": 0.00018179584803468995, "loss": 1.7266, "step": 5050 }, { "epoch": 3.3450331125827812, "grad_norm": 0.9124202943425093, "learning_rate": 0.0001817306139667924, "loss": 1.5859, "step": 5051 }, { "epoch": 3.3456953642384106, "grad_norm": 0.9037979728665405, "learning_rate": 0.00018166537361586943, "loss": 1.5781, "step": 5052 }, { "epoch": 3.3463576158940396, "grad_norm": 1.0290571187327713, "learning_rate": 0.0001816001269948395, "loss": 1.8984, "step": 5053 }, { "epoch": 3.347019867549669, "grad_norm": 1.0449616638772539, "learning_rate": 0.00018153487411662214, "loss": 1.7656, "step": 5054 }, { "epoch": 3.347682119205298, "grad_norm": 1.0316468345319405, "learning_rate": 0.00018146961499413816, "loss": 1.9844, "step": 5055 }, { "epoch": 3.3483443708609273, "grad_norm": 0.8319563223450991, "learning_rate": 0.0001814043496403096, "loss": 1.375, "step": 5056 }, { "epoch": 3.3490066225165562, "grad_norm": 0.9343107230003916, "learning_rate": 0.00018133907806805984, "loss": 1.7344, "step": 5057 }, { "epoch": 3.3496688741721856, "grad_norm": 1.022443823532356, "learning_rate": 0.00018127380029031324, "loss": 1.7344, "step": 5058 }, { "epoch": 3.3503311258278146, "grad_norm": 1.01762922722523, "learning_rate": 0.0001812085163199957, "loss": 1.7734, "step": 5059 }, { "epoch": 3.3509933774834435, "grad_norm": 0.9968573220230004, "learning_rate": 0.00018114322617003413, "loss": 1.5469, "step": 5060 }, { "epoch": 3.351655629139073, "grad_norm": 0.9751367972087437, "learning_rate": 0.00018107792985335672, "loss": 1.7656, "step": 5061 }, { "epoch": 3.352317880794702, "grad_norm": 0.9640972340488173, "learning_rate": 0.0001810126273828929, "loss": 1.7578, "step": 5062 }, { "epoch": 3.352980132450331, "grad_norm": 0.9802542266216752, "learning_rate": 0.00018094731877157333, "loss": 1.7422, "step": 5063 }, { "epoch": 3.35364238410596, "grad_norm": 0.9388307632989934, "learning_rate": 0.00018088200403232983, "loss": 1.6484, "step": 5064 }, { "epoch": 3.3543046357615895, "grad_norm": 0.9609260919424933, "learning_rate": 0.0001808166831780955, "loss": 1.7188, "step": 5065 }, { "epoch": 3.3549668874172185, "grad_norm": 1.0610135633655273, "learning_rate": 0.00018075135622180457, "loss": 1.8828, "step": 5066 }, { "epoch": 3.355629139072848, "grad_norm": 0.8991894359222248, "learning_rate": 0.00018068602317639255, "loss": 1.5, "step": 5067 }, { "epoch": 3.356291390728477, "grad_norm": 1.0465344277467956, "learning_rate": 0.00018062068405479614, "loss": 1.8125, "step": 5068 }, { "epoch": 3.3569536423841058, "grad_norm": 1.0180421713451087, "learning_rate": 0.00018055533886995325, "loss": 1.6641, "step": 5069 }, { "epoch": 3.357615894039735, "grad_norm": 0.960124802528361, "learning_rate": 0.00018048998763480295, "loss": 1.6719, "step": 5070 }, { "epoch": 3.358278145695364, "grad_norm": 0.9502964674719154, "learning_rate": 0.00018042463036228556, "loss": 1.6875, "step": 5071 }, { "epoch": 3.3589403973509935, "grad_norm": 1.0749719657070982, "learning_rate": 0.00018035926706534255, "loss": 1.9766, "step": 5072 }, { "epoch": 3.3596026490066224, "grad_norm": 0.8406925620681531, "learning_rate": 0.00018029389775691657, "loss": 1.4375, "step": 5073 }, { "epoch": 3.360264900662252, "grad_norm": 0.9780892615733453, "learning_rate": 0.00018022852244995157, "loss": 1.875, "step": 5074 }, { "epoch": 3.3609271523178808, "grad_norm": 0.8682228548621989, "learning_rate": 0.00018016314115739253, "loss": 1.5391, "step": 5075 }, { "epoch": 3.36158940397351, "grad_norm": 0.9309727136669333, "learning_rate": 0.00018009775389218573, "loss": 1.7734, "step": 5076 }, { "epoch": 3.362251655629139, "grad_norm": 0.8893599767171835, "learning_rate": 0.00018003236066727862, "loss": 1.7266, "step": 5077 }, { "epoch": 3.362913907284768, "grad_norm": 0.9350769335309351, "learning_rate": 0.00017996696149561977, "loss": 1.8594, "step": 5078 }, { "epoch": 3.3635761589403974, "grad_norm": 0.8558348576534965, "learning_rate": 0.000179901556390159, "loss": 1.5234, "step": 5079 }, { "epoch": 3.3642384105960264, "grad_norm": 0.9071987013320861, "learning_rate": 0.00017983614536384727, "loss": 1.6797, "step": 5080 }, { "epoch": 3.3649006622516557, "grad_norm": 0.8668205201882796, "learning_rate": 0.00017977072842963665, "loss": 1.6562, "step": 5081 }, { "epoch": 3.3655629139072847, "grad_norm": 0.9364387986560406, "learning_rate": 0.00017970530560048052, "loss": 1.8047, "step": 5082 }, { "epoch": 3.366225165562914, "grad_norm": 0.941052400467578, "learning_rate": 0.00017963987688933333, "loss": 1.6953, "step": 5083 }, { "epoch": 3.366887417218543, "grad_norm": 0.9277656406501404, "learning_rate": 0.0001795744423091507, "loss": 1.8203, "step": 5084 }, { "epoch": 3.3675496688741724, "grad_norm": 0.9449287247129142, "learning_rate": 0.0001795090018728894, "loss": 1.8828, "step": 5085 }, { "epoch": 3.3682119205298013, "grad_norm": 0.9672444783816346, "learning_rate": 0.0001794435555935075, "loss": 1.8594, "step": 5086 }, { "epoch": 3.3688741721854303, "grad_norm": 0.9957896559548454, "learning_rate": 0.00017937810348396394, "loss": 1.9453, "step": 5087 }, { "epoch": 3.3695364238410597, "grad_norm": 0.986517051079045, "learning_rate": 0.0001793126455572191, "loss": 1.7812, "step": 5088 }, { "epoch": 3.3701986754966886, "grad_norm": 0.8636512093455239, "learning_rate": 0.00017924718182623443, "loss": 1.6094, "step": 5089 }, { "epoch": 3.370860927152318, "grad_norm": 0.9110108067689228, "learning_rate": 0.00017918171230397244, "loss": 1.6797, "step": 5090 }, { "epoch": 3.371523178807947, "grad_norm": 1.165778743061743, "learning_rate": 0.00017911623700339683, "loss": 1.8828, "step": 5091 }, { "epoch": 3.3721854304635763, "grad_norm": 0.9212506783652876, "learning_rate": 0.0001790507559374725, "loss": 1.6484, "step": 5092 }, { "epoch": 3.3728476821192053, "grad_norm": 0.8754487821143622, "learning_rate": 0.0001789852691191654, "loss": 1.625, "step": 5093 }, { "epoch": 3.3735099337748347, "grad_norm": 0.9159833008898494, "learning_rate": 0.0001789197765614428, "loss": 1.6562, "step": 5094 }, { "epoch": 3.3741721854304636, "grad_norm": 0.8783513636468169, "learning_rate": 0.0001788542782772728, "loss": 1.5234, "step": 5095 }, { "epoch": 3.3748344370860925, "grad_norm": 0.9197782442711238, "learning_rate": 0.00017878877427962494, "loss": 1.6328, "step": 5096 }, { "epoch": 3.375496688741722, "grad_norm": 0.9554336499177547, "learning_rate": 0.0001787232645814697, "loss": 1.6641, "step": 5097 }, { "epoch": 3.376158940397351, "grad_norm": 0.9125179128302443, "learning_rate": 0.0001786577491957788, "loss": 1.5938, "step": 5098 }, { "epoch": 3.3768211920529803, "grad_norm": 1.0084577877741088, "learning_rate": 0.00017859222813552497, "loss": 1.6562, "step": 5099 }, { "epoch": 3.377483443708609, "grad_norm": 1.0410624958083203, "learning_rate": 0.0001785267014136822, "loss": 1.8828, "step": 5100 }, { "epoch": 3.3781456953642386, "grad_norm": 1.0131893977822088, "learning_rate": 0.0001784611690432255, "loss": 1.7109, "step": 5101 }, { "epoch": 3.3788079470198675, "grad_norm": 0.9458835609804696, "learning_rate": 0.000178395631037131, "loss": 1.5859, "step": 5102 }, { "epoch": 3.379470198675497, "grad_norm": 1.0156015632569515, "learning_rate": 0.00017833008740837606, "loss": 1.8984, "step": 5103 }, { "epoch": 3.380132450331126, "grad_norm": 0.9756965648383552, "learning_rate": 0.00017826453816993898, "loss": 1.7031, "step": 5104 }, { "epoch": 3.380794701986755, "grad_norm": 0.8956392749755535, "learning_rate": 0.00017819898333479932, "loss": 1.5703, "step": 5105 }, { "epoch": 3.381456953642384, "grad_norm": 0.9963481266964629, "learning_rate": 0.00017813342291593765, "loss": 1.8594, "step": 5106 }, { "epoch": 3.382119205298013, "grad_norm": 1.0091932522462537, "learning_rate": 0.00017806785692633573, "loss": 1.9844, "step": 5107 }, { "epoch": 3.3827814569536425, "grad_norm": 0.8878873652868379, "learning_rate": 0.0001780022853789763, "loss": 1.6562, "step": 5108 }, { "epoch": 3.3834437086092715, "grad_norm": 0.9670764897878065, "learning_rate": 0.00017793670828684339, "loss": 1.875, "step": 5109 }, { "epoch": 3.384105960264901, "grad_norm": 0.8255076547604595, "learning_rate": 0.00017787112566292193, "loss": 1.6094, "step": 5110 }, { "epoch": 3.38476821192053, "grad_norm": 0.8818807173340291, "learning_rate": 0.00017780553752019804, "loss": 1.6641, "step": 5111 }, { "epoch": 3.385430463576159, "grad_norm": 0.9090337399738706, "learning_rate": 0.00017773994387165895, "loss": 1.625, "step": 5112 }, { "epoch": 3.386092715231788, "grad_norm": 0.8490336544998451, "learning_rate": 0.00017767434473029298, "loss": 1.4453, "step": 5113 }, { "epoch": 3.386754966887417, "grad_norm": 0.9893413967551508, "learning_rate": 0.0001776087401090894, "loss": 1.8125, "step": 5114 }, { "epoch": 3.3874172185430464, "grad_norm": 0.9641637277538008, "learning_rate": 0.00017754313002103883, "loss": 1.6406, "step": 5115 }, { "epoch": 3.3880794701986754, "grad_norm": 0.9318992993731228, "learning_rate": 0.00017747751447913269, "loss": 1.6016, "step": 5116 }, { "epoch": 3.388741721854305, "grad_norm": 0.9101877559318672, "learning_rate": 0.00017741189349636374, "loss": 1.5547, "step": 5117 }, { "epoch": 3.3894039735099337, "grad_norm": 0.974597500722578, "learning_rate": 0.00017734626708572554, "loss": 1.7422, "step": 5118 }, { "epoch": 3.390066225165563, "grad_norm": 1.0820275836193503, "learning_rate": 0.000177280635260213, "loss": 1.7812, "step": 5119 }, { "epoch": 3.390728476821192, "grad_norm": 0.9185536801502157, "learning_rate": 0.00017721499803282186, "loss": 1.5469, "step": 5120 }, { "epoch": 3.3913907284768214, "grad_norm": 1.0103915650128934, "learning_rate": 0.00017714935541654912, "loss": 1.7812, "step": 5121 }, { "epoch": 3.3920529801324504, "grad_norm": 1.054022166284303, "learning_rate": 0.00017708370742439272, "loss": 1.9688, "step": 5122 }, { "epoch": 3.3927152317880793, "grad_norm": 0.9219987100440208, "learning_rate": 0.0001770180540693518, "loss": 1.625, "step": 5123 }, { "epoch": 3.3933774834437087, "grad_norm": 1.0058784411001633, "learning_rate": 0.0001769523953644264, "loss": 1.7188, "step": 5124 }, { "epoch": 3.3940397350993377, "grad_norm": 0.9165941477436577, "learning_rate": 0.0001768867313226177, "loss": 1.6484, "step": 5125 }, { "epoch": 3.394701986754967, "grad_norm": 0.9888203613782737, "learning_rate": 0.000176821061956928, "loss": 1.7422, "step": 5126 }, { "epoch": 3.395364238410596, "grad_norm": 0.9129513486955185, "learning_rate": 0.00017675538728036052, "loss": 1.6328, "step": 5127 }, { "epoch": 3.396026490066225, "grad_norm": 0.8710059058136744, "learning_rate": 0.0001766897073059196, "loss": 1.5703, "step": 5128 }, { "epoch": 3.3966887417218543, "grad_norm": 0.8423861063369497, "learning_rate": 0.00017662402204661064, "loss": 1.4844, "step": 5129 }, { "epoch": 3.3973509933774833, "grad_norm": 1.0017613191912975, "learning_rate": 0.0001765583315154401, "loss": 1.7422, "step": 5130 }, { "epoch": 3.3980132450331126, "grad_norm": 1.0220260167071629, "learning_rate": 0.00017649263572541542, "loss": 1.7344, "step": 5131 }, { "epoch": 3.3986754966887416, "grad_norm": 1.013103145234632, "learning_rate": 0.00017642693468954517, "loss": 1.8516, "step": 5132 }, { "epoch": 3.399337748344371, "grad_norm": 0.9769217142885969, "learning_rate": 0.00017636122842083882, "loss": 1.7891, "step": 5133 }, { "epoch": 3.4, "grad_norm": 1.0383267855881078, "learning_rate": 0.00017629551693230708, "loss": 1.8906, "step": 5134 }, { "epoch": 3.4006622516556293, "grad_norm": 0.9269581507474266, "learning_rate": 0.00017622980023696152, "loss": 1.5234, "step": 5135 }, { "epoch": 3.4013245033112582, "grad_norm": 0.9226033334110777, "learning_rate": 0.00017616407834781476, "loss": 1.5938, "step": 5136 }, { "epoch": 3.401986754966887, "grad_norm": 0.9081231875994198, "learning_rate": 0.0001760983512778805, "loss": 1.5859, "step": 5137 }, { "epoch": 3.4026490066225166, "grad_norm": 1.0738185585901567, "learning_rate": 0.00017603261904017357, "loss": 1.9531, "step": 5138 }, { "epoch": 3.4033112582781455, "grad_norm": 0.8832305354231387, "learning_rate": 0.00017596688164770954, "loss": 1.5156, "step": 5139 }, { "epoch": 3.403973509933775, "grad_norm": 1.0400026640035083, "learning_rate": 0.00017590113911350528, "loss": 1.8984, "step": 5140 }, { "epoch": 3.404635761589404, "grad_norm": 0.8944807683302377, "learning_rate": 0.00017583539145057852, "loss": 1.6016, "step": 5141 }, { "epoch": 3.4052980132450332, "grad_norm": 0.9260783917645355, "learning_rate": 0.00017576963867194805, "loss": 1.6797, "step": 5142 }, { "epoch": 3.405960264900662, "grad_norm": 0.8835608765662691, "learning_rate": 0.00017570388079063366, "loss": 1.6641, "step": 5143 }, { "epoch": 3.4066225165562916, "grad_norm": 0.8816293366083017, "learning_rate": 0.00017563811781965628, "loss": 1.5156, "step": 5144 }, { "epoch": 3.4072847682119205, "grad_norm": 0.8811289852256803, "learning_rate": 0.00017557234977203754, "loss": 1.5859, "step": 5145 }, { "epoch": 3.4079470198675494, "grad_norm": 1.0573460701966, "learning_rate": 0.00017550657666080046, "loss": 1.8672, "step": 5146 }, { "epoch": 3.408609271523179, "grad_norm": 0.8225149793388778, "learning_rate": 0.0001754407984989687, "loss": 1.4375, "step": 5147 }, { "epoch": 3.4092715231788078, "grad_norm": 0.7935449828389942, "learning_rate": 0.00017537501529956716, "loss": 1.4062, "step": 5148 }, { "epoch": 3.409933774834437, "grad_norm": 1.0137609996665957, "learning_rate": 0.00017530922707562172, "loss": 1.7344, "step": 5149 }, { "epoch": 3.410596026490066, "grad_norm": 0.9846339830912962, "learning_rate": 0.00017524343384015914, "loss": 1.6953, "step": 5150 }, { "epoch": 3.4112582781456955, "grad_norm": 0.8512413079628269, "learning_rate": 0.00017517763560620725, "loss": 1.4375, "step": 5151 }, { "epoch": 3.4119205298013244, "grad_norm": 1.0022531353037152, "learning_rate": 0.00017511183238679492, "loss": 1.7969, "step": 5152 }, { "epoch": 3.412582781456954, "grad_norm": 0.9778620353582559, "learning_rate": 0.00017504602419495178, "loss": 1.7812, "step": 5153 }, { "epoch": 3.4132450331125828, "grad_norm": 0.8350125507052268, "learning_rate": 0.00017498021104370878, "loss": 1.3594, "step": 5154 }, { "epoch": 3.4139072847682117, "grad_norm": 0.9690444733007956, "learning_rate": 0.00017491439294609756, "loss": 1.6641, "step": 5155 }, { "epoch": 3.414569536423841, "grad_norm": 0.9627171325812037, "learning_rate": 0.00017484856991515095, "loss": 1.6875, "step": 5156 }, { "epoch": 3.41523178807947, "grad_norm": 0.851727492951156, "learning_rate": 0.00017478274196390257, "loss": 1.5625, "step": 5157 }, { "epoch": 3.4158940397350994, "grad_norm": 1.0453021078697722, "learning_rate": 0.00017471690910538716, "loss": 2.0312, "step": 5158 }, { "epoch": 3.4165562913907284, "grad_norm": 0.9830777749073366, "learning_rate": 0.0001746510713526404, "loss": 1.6016, "step": 5159 }, { "epoch": 3.4172185430463577, "grad_norm": 0.9971480024196377, "learning_rate": 0.0001745852287186989, "loss": 1.875, "step": 5160 }, { "epoch": 3.4178807947019867, "grad_norm": 0.9942757053527151, "learning_rate": 0.00017451938121660024, "loss": 1.9219, "step": 5161 }, { "epoch": 3.418543046357616, "grad_norm": 1.0050534685934427, "learning_rate": 0.000174453528859383, "loss": 1.9375, "step": 5162 }, { "epoch": 3.419205298013245, "grad_norm": 0.9301307388383542, "learning_rate": 0.0001743876716600867, "loss": 1.7969, "step": 5163 }, { "epoch": 3.419867549668874, "grad_norm": 1.0910609991504707, "learning_rate": 0.0001743218096317518, "loss": 1.9375, "step": 5164 }, { "epoch": 3.4205298013245033, "grad_norm": 0.9431145080024445, "learning_rate": 0.00017425594278741975, "loss": 1.6094, "step": 5165 }, { "epoch": 3.4211920529801323, "grad_norm": 0.9253719953549182, "learning_rate": 0.00017419007114013294, "loss": 1.7578, "step": 5166 }, { "epoch": 3.4218543046357617, "grad_norm": 0.8843643292777894, "learning_rate": 0.00017412419470293474, "loss": 1.7109, "step": 5167 }, { "epoch": 3.4225165562913906, "grad_norm": 0.8593766949201391, "learning_rate": 0.00017405831348886938, "loss": 1.7109, "step": 5168 }, { "epoch": 3.42317880794702, "grad_norm": 0.9578827339813413, "learning_rate": 0.00017399242751098214, "loss": 1.8047, "step": 5169 }, { "epoch": 3.423841059602649, "grad_norm": 0.8219353680274456, "learning_rate": 0.00017392653678231922, "loss": 1.4297, "step": 5170 }, { "epoch": 3.4245033112582783, "grad_norm": 0.937618782490708, "learning_rate": 0.00017386064131592768, "loss": 1.7188, "step": 5171 }, { "epoch": 3.4251655629139073, "grad_norm": 0.8926713982758337, "learning_rate": 0.0001737947411248556, "loss": 1.5625, "step": 5172 }, { "epoch": 3.4258278145695362, "grad_norm": 0.8275875525709161, "learning_rate": 0.00017372883622215202, "loss": 1.4219, "step": 5173 }, { "epoch": 3.4264900662251656, "grad_norm": 0.9529976715954195, "learning_rate": 0.00017366292662086675, "loss": 1.6719, "step": 5174 }, { "epoch": 3.4271523178807946, "grad_norm": 0.9124424848710261, "learning_rate": 0.0001735970123340508, "loss": 1.5781, "step": 5175 }, { "epoch": 3.427814569536424, "grad_norm": 0.9632432252481405, "learning_rate": 0.00017353109337475583, "loss": 1.6484, "step": 5176 }, { "epoch": 3.428476821192053, "grad_norm": 0.962037185201734, "learning_rate": 0.00017346516975603462, "loss": 1.6953, "step": 5177 }, { "epoch": 3.4291390728476823, "grad_norm": 1.0312288289682048, "learning_rate": 0.00017339924149094077, "loss": 1.7891, "step": 5178 }, { "epoch": 3.429801324503311, "grad_norm": 0.9986741960623842, "learning_rate": 0.00017333330859252885, "loss": 1.6719, "step": 5179 }, { "epoch": 3.4304635761589406, "grad_norm": 0.9407574450157301, "learning_rate": 0.00017326737107385432, "loss": 1.6562, "step": 5180 }, { "epoch": 3.4311258278145695, "grad_norm": 1.1083206945456445, "learning_rate": 0.00017320142894797357, "loss": 1.9844, "step": 5181 }, { "epoch": 3.4317880794701985, "grad_norm": 0.9130389249446954, "learning_rate": 0.00017313548222794384, "loss": 1.5625, "step": 5182 }, { "epoch": 3.432450331125828, "grad_norm": 0.9985952866795484, "learning_rate": 0.00017306953092682345, "loss": 1.7969, "step": 5183 }, { "epoch": 3.433112582781457, "grad_norm": 0.9345722211104205, "learning_rate": 0.00017300357505767145, "loss": 1.6797, "step": 5184 }, { "epoch": 3.433774834437086, "grad_norm": 0.9942148398657046, "learning_rate": 0.00017293761463354782, "loss": 1.8125, "step": 5185 }, { "epoch": 3.434437086092715, "grad_norm": 0.9175002432868609, "learning_rate": 0.00017287164966751356, "loss": 1.7812, "step": 5186 }, { "epoch": 3.4350993377483445, "grad_norm": 0.8078161634006769, "learning_rate": 0.00017280568017263045, "loss": 1.4531, "step": 5187 }, { "epoch": 3.4357615894039735, "grad_norm": 0.9462517171189211, "learning_rate": 0.00017273970616196117, "loss": 1.7344, "step": 5188 }, { "epoch": 3.436423841059603, "grad_norm": 0.9940237190080945, "learning_rate": 0.00017267372764856941, "loss": 1.7812, "step": 5189 }, { "epoch": 3.437086092715232, "grad_norm": 0.8548880566761075, "learning_rate": 0.0001726077446455196, "loss": 1.5156, "step": 5190 }, { "epoch": 3.4377483443708607, "grad_norm": 0.9833874378071286, "learning_rate": 0.00017254175716587718, "loss": 1.9766, "step": 5191 }, { "epoch": 3.43841059602649, "grad_norm": 0.9044997317576567, "learning_rate": 0.00017247576522270842, "loss": 1.6016, "step": 5192 }, { "epoch": 3.439072847682119, "grad_norm": 0.9245921145990159, "learning_rate": 0.00017240976882908045, "loss": 1.6328, "step": 5193 }, { "epoch": 3.4397350993377485, "grad_norm": 0.8612237107283524, "learning_rate": 0.00017234376799806132, "loss": 1.3438, "step": 5194 }, { "epoch": 3.4403973509933774, "grad_norm": 1.0544629423580019, "learning_rate": 0.00017227776274271995, "loss": 1.9141, "step": 5195 }, { "epoch": 3.441059602649007, "grad_norm": 0.9283501469804176, "learning_rate": 0.00017221175307612625, "loss": 1.5703, "step": 5196 }, { "epoch": 3.4417218543046357, "grad_norm": 0.9652977305624869, "learning_rate": 0.00017214573901135066, "loss": 1.6484, "step": 5197 }, { "epoch": 3.442384105960265, "grad_norm": 1.0340862167885854, "learning_rate": 0.00017207972056146494, "loss": 1.8203, "step": 5198 }, { "epoch": 3.443046357615894, "grad_norm": 0.9776205777261562, "learning_rate": 0.00017201369773954138, "loss": 1.7109, "step": 5199 }, { "epoch": 3.443708609271523, "grad_norm": 0.992653793883981, "learning_rate": 0.0001719476705586533, "loss": 1.8828, "step": 5200 }, { "epoch": 3.4443708609271524, "grad_norm": 0.9493260288157698, "learning_rate": 0.00017188163903187478, "loss": 1.6328, "step": 5201 }, { "epoch": 3.4450331125827813, "grad_norm": 1.1148339790373614, "learning_rate": 0.00017181560317228097, "loss": 1.7656, "step": 5202 }, { "epoch": 3.4456953642384107, "grad_norm": 0.9340067036068305, "learning_rate": 0.00017174956299294752, "loss": 1.7578, "step": 5203 }, { "epoch": 3.4463576158940397, "grad_norm": 0.9394075405131492, "learning_rate": 0.00017168351850695133, "loss": 1.6562, "step": 5204 }, { "epoch": 3.447019867549669, "grad_norm": 0.9813082441785328, "learning_rate": 0.00017161746972736981, "loss": 1.5078, "step": 5205 }, { "epoch": 3.447682119205298, "grad_norm": 0.9214886875005742, "learning_rate": 0.0001715514166672815, "loss": 1.5781, "step": 5206 }, { "epoch": 3.4483443708609274, "grad_norm": 0.9286119216719376, "learning_rate": 0.0001714853593397656, "loss": 1.7812, "step": 5207 }, { "epoch": 3.4490066225165563, "grad_norm": 0.9926078091220786, "learning_rate": 0.00017141929775790223, "loss": 1.9609, "step": 5208 }, { "epoch": 3.4496688741721853, "grad_norm": 0.9505181992828995, "learning_rate": 0.00017135323193477226, "loss": 1.7344, "step": 5209 }, { "epoch": 3.4503311258278146, "grad_norm": 0.8921344413666605, "learning_rate": 0.0001712871618834577, "loss": 1.5312, "step": 5210 }, { "epoch": 3.4509933774834436, "grad_norm": 0.9719192118167114, "learning_rate": 0.00017122108761704092, "loss": 1.7109, "step": 5211 }, { "epoch": 3.451655629139073, "grad_norm": 0.9122406435169303, "learning_rate": 0.00017115500914860555, "loss": 1.6797, "step": 5212 }, { "epoch": 3.452317880794702, "grad_norm": 0.9868875464505232, "learning_rate": 0.00017108892649123585, "loss": 1.7266, "step": 5213 }, { "epoch": 3.4529801324503313, "grad_norm": 0.9799445831968059, "learning_rate": 0.0001710228396580169, "loss": 1.8203, "step": 5214 }, { "epoch": 3.4536423841059603, "grad_norm": 0.9462054016228678, "learning_rate": 0.00017095674866203468, "loss": 1.7188, "step": 5215 }, { "epoch": 3.4543046357615896, "grad_norm": 0.9829135808918863, "learning_rate": 0.000170890653516376, "loss": 1.8203, "step": 5216 }, { "epoch": 3.4549668874172186, "grad_norm": 0.9758074887367897, "learning_rate": 0.00017082455423412838, "loss": 1.8203, "step": 5217 }, { "epoch": 3.4556291390728475, "grad_norm": 0.8931033412785244, "learning_rate": 0.0001707584508283803, "loss": 1.625, "step": 5218 }, { "epoch": 3.456291390728477, "grad_norm": 0.8894440899934166, "learning_rate": 0.00017069234331222095, "loss": 1.6094, "step": 5219 }, { "epoch": 3.456953642384106, "grad_norm": 0.9354605166093602, "learning_rate": 0.00017062623169874038, "loss": 1.5859, "step": 5220 }, { "epoch": 3.4576158940397352, "grad_norm": 0.8595179034065843, "learning_rate": 0.0001705601160010295, "loss": 1.5859, "step": 5221 }, { "epoch": 3.458278145695364, "grad_norm": 0.9767206693405877, "learning_rate": 0.00017049399623217994, "loss": 1.7109, "step": 5222 }, { "epoch": 3.4589403973509936, "grad_norm": 0.9874006782889105, "learning_rate": 0.00017042787240528412, "loss": 1.7734, "step": 5223 }, { "epoch": 3.4596026490066225, "grad_norm": 1.0040262306897425, "learning_rate": 0.0001703617445334354, "loss": 1.7188, "step": 5224 }, { "epoch": 3.460264900662252, "grad_norm": 0.9445113353407504, "learning_rate": 0.00017029561262972784, "loss": 1.7422, "step": 5225 }, { "epoch": 3.460927152317881, "grad_norm": 0.9494424848810213, "learning_rate": 0.00017022947670725626, "loss": 1.5781, "step": 5226 }, { "epoch": 3.46158940397351, "grad_norm": 0.9718869461790585, "learning_rate": 0.00017016333677911637, "loss": 1.6797, "step": 5227 }, { "epoch": 3.462251655629139, "grad_norm": 0.9498185102336367, "learning_rate": 0.00017009719285840467, "loss": 1.7422, "step": 5228 }, { "epoch": 3.462913907284768, "grad_norm": 0.8194112847223924, "learning_rate": 0.00017003104495821834, "loss": 1.4062, "step": 5229 }, { "epoch": 3.4635761589403975, "grad_norm": 0.9250246431600145, "learning_rate": 0.00016996489309165544, "loss": 1.5234, "step": 5230 }, { "epoch": 3.4642384105960264, "grad_norm": 1.0198790943345215, "learning_rate": 0.00016989873727181492, "loss": 1.9297, "step": 5231 }, { "epoch": 3.4649006622516554, "grad_norm": 0.9842678224561896, "learning_rate": 0.00016983257751179617, "loss": 1.6875, "step": 5232 }, { "epoch": 3.4655629139072848, "grad_norm": 0.8330272488671963, "learning_rate": 0.00016976641382469978, "loss": 1.3672, "step": 5233 }, { "epoch": 3.466225165562914, "grad_norm": 0.9792468844088593, "learning_rate": 0.0001697002462236268, "loss": 1.7031, "step": 5234 }, { "epoch": 3.466887417218543, "grad_norm": 1.002084953663449, "learning_rate": 0.00016963407472167922, "loss": 1.9062, "step": 5235 }, { "epoch": 3.467549668874172, "grad_norm": 1.0603825795026105, "learning_rate": 0.00016956789933195976, "loss": 2.0156, "step": 5236 }, { "epoch": 3.4682119205298014, "grad_norm": 0.8962450222557493, "learning_rate": 0.00016950172006757188, "loss": 1.6094, "step": 5237 }, { "epoch": 3.4688741721854304, "grad_norm": 0.9944748601312259, "learning_rate": 0.0001694355369416198, "loss": 1.8359, "step": 5238 }, { "epoch": 3.4695364238410598, "grad_norm": 0.928621485633392, "learning_rate": 0.0001693693499672087, "loss": 1.7656, "step": 5239 }, { "epoch": 3.4701986754966887, "grad_norm": 0.9554765753132255, "learning_rate": 0.00016930315915744413, "loss": 1.7891, "step": 5240 }, { "epoch": 3.4708609271523176, "grad_norm": 0.8309663100079769, "learning_rate": 0.0001692369645254328, "loss": 1.4844, "step": 5241 }, { "epoch": 3.471523178807947, "grad_norm": 0.8459899206018462, "learning_rate": 0.00016917076608428196, "loss": 1.6328, "step": 5242 }, { "epoch": 3.472185430463576, "grad_norm": 0.9249338020546705, "learning_rate": 0.00016910456384709962, "loss": 1.7109, "step": 5243 }, { "epoch": 3.4728476821192054, "grad_norm": 0.9420128900420639, "learning_rate": 0.0001690383578269946, "loss": 1.7344, "step": 5244 }, { "epoch": 3.4735099337748343, "grad_norm": 0.9154703425442362, "learning_rate": 0.0001689721480370765, "loss": 1.8516, "step": 5245 }, { "epoch": 3.4741721854304637, "grad_norm": 0.9115509457711516, "learning_rate": 0.00016890593449045554, "loss": 1.5547, "step": 5246 }, { "epoch": 3.4748344370860926, "grad_norm": 0.9195113380865291, "learning_rate": 0.00016883971720024283, "loss": 1.6328, "step": 5247 }, { "epoch": 3.475496688741722, "grad_norm": 1.0300629991813193, "learning_rate": 0.00016877349617955013, "loss": 1.9609, "step": 5248 }, { "epoch": 3.476158940397351, "grad_norm": 0.9200860492406189, "learning_rate": 0.00016870727144148995, "loss": 1.75, "step": 5249 }, { "epoch": 3.47682119205298, "grad_norm": 1.0466688254251262, "learning_rate": 0.00016864104299917556, "loss": 1.9141, "step": 5250 }, { "epoch": 3.4774834437086093, "grad_norm": 0.9571153679732961, "learning_rate": 0.00016857481086572092, "loss": 1.7422, "step": 5251 }, { "epoch": 3.4781456953642382, "grad_norm": 1.005558675131107, "learning_rate": 0.0001685085750542408, "loss": 1.9688, "step": 5252 }, { "epoch": 3.4788079470198676, "grad_norm": 0.8650513105357885, "learning_rate": 0.00016844233557785057, "loss": 1.5234, "step": 5253 }, { "epoch": 3.4794701986754966, "grad_norm": 0.8834245347261634, "learning_rate": 0.00016837609244966652, "loss": 1.7109, "step": 5254 }, { "epoch": 3.480132450331126, "grad_norm": 0.9949494662009583, "learning_rate": 0.00016830984568280547, "loss": 1.8047, "step": 5255 }, { "epoch": 3.480794701986755, "grad_norm": 0.9711205775089836, "learning_rate": 0.00016824359529038507, "loss": 1.8984, "step": 5256 }, { "epoch": 3.4814569536423843, "grad_norm": 0.8947138616107438, "learning_rate": 0.00016817734128552364, "loss": 1.6094, "step": 5257 }, { "epoch": 3.482119205298013, "grad_norm": 0.8553176349555313, "learning_rate": 0.00016811108368134026, "loss": 1.6875, "step": 5258 }, { "epoch": 3.482781456953642, "grad_norm": 0.895860607198179, "learning_rate": 0.0001680448224909546, "loss": 1.7109, "step": 5259 }, { "epoch": 3.4834437086092715, "grad_norm": 0.9551462077379637, "learning_rate": 0.00016797855772748732, "loss": 1.6875, "step": 5260 }, { "epoch": 3.4841059602649005, "grad_norm": 0.9585888795253112, "learning_rate": 0.00016791228940405938, "loss": 1.875, "step": 5261 }, { "epoch": 3.48476821192053, "grad_norm": 0.8771332527273271, "learning_rate": 0.00016784601753379286, "loss": 1.5312, "step": 5262 }, { "epoch": 3.485430463576159, "grad_norm": 0.8746484470103016, "learning_rate": 0.00016777974212981022, "loss": 1.5859, "step": 5263 }, { "epoch": 3.486092715231788, "grad_norm": 0.9598038288933977, "learning_rate": 0.00016771346320523485, "loss": 1.8281, "step": 5264 }, { "epoch": 3.486754966887417, "grad_norm": 0.9392528817749131, "learning_rate": 0.00016764718077319068, "loss": 1.6641, "step": 5265 }, { "epoch": 3.4874172185430465, "grad_norm": 0.9503590662789897, "learning_rate": 0.00016758089484680236, "loss": 1.5625, "step": 5266 }, { "epoch": 3.4880794701986755, "grad_norm": 1.0724450285528642, "learning_rate": 0.0001675146054391953, "loss": 1.9297, "step": 5267 }, { "epoch": 3.4887417218543044, "grad_norm": 0.9400965140843992, "learning_rate": 0.00016744831256349565, "loss": 1.6641, "step": 5268 }, { "epoch": 3.489403973509934, "grad_norm": 0.9756525924498628, "learning_rate": 0.00016738201623282996, "loss": 1.6406, "step": 5269 }, { "epoch": 3.4900662251655628, "grad_norm": 0.969980022614054, "learning_rate": 0.00016731571646032585, "loss": 1.5781, "step": 5270 }, { "epoch": 3.490728476821192, "grad_norm": 0.8971813730908372, "learning_rate": 0.0001672494132591113, "loss": 1.5234, "step": 5271 }, { "epoch": 3.491390728476821, "grad_norm": 0.9517732045252427, "learning_rate": 0.00016718310664231523, "loss": 1.7188, "step": 5272 }, { "epoch": 3.4920529801324505, "grad_norm": 0.9126961195759552, "learning_rate": 0.00016711679662306702, "loss": 1.5, "step": 5273 }, { "epoch": 3.4927152317880794, "grad_norm": 0.9204594854959108, "learning_rate": 0.00016705048321449687, "loss": 1.6016, "step": 5274 }, { "epoch": 3.493377483443709, "grad_norm": 0.9655831877074936, "learning_rate": 0.0001669841664297355, "loss": 1.5234, "step": 5275 }, { "epoch": 3.4940397350993377, "grad_norm": 0.9461758735625779, "learning_rate": 0.00016691784628191452, "loss": 1.7734, "step": 5276 }, { "epoch": 3.4947019867549667, "grad_norm": 0.9420351792903754, "learning_rate": 0.00016685152278416598, "loss": 1.625, "step": 5277 }, { "epoch": 3.495364238410596, "grad_norm": 0.8992624539972025, "learning_rate": 0.00016678519594962276, "loss": 1.6094, "step": 5278 }, { "epoch": 3.496026490066225, "grad_norm": 1.012086155907385, "learning_rate": 0.00016671886579141832, "loss": 1.7109, "step": 5279 }, { "epoch": 3.4966887417218544, "grad_norm": 0.9666842373900281, "learning_rate": 0.00016665253232268676, "loss": 1.5938, "step": 5280 }, { "epoch": 3.4973509933774833, "grad_norm": 1.0646123567243013, "learning_rate": 0.00016658619555656287, "loss": 1.9609, "step": 5281 }, { "epoch": 3.4980132450331127, "grad_norm": 0.9925466748554195, "learning_rate": 0.00016651985550618213, "loss": 1.7734, "step": 5282 }, { "epoch": 3.4986754966887417, "grad_norm": 0.9918108701806201, "learning_rate": 0.0001664535121846806, "loss": 1.8281, "step": 5283 }, { "epoch": 3.499337748344371, "grad_norm": 0.9261380881931536, "learning_rate": 0.000166387165605195, "loss": 1.6562, "step": 5284 }, { "epoch": 3.5, "grad_norm": 0.8741274561890802, "learning_rate": 0.00016632081578086275, "loss": 1.5078, "step": 5285 }, { "epoch": 3.500662251655629, "grad_norm": 1.0060911809836188, "learning_rate": 0.00016625446272482188, "loss": 1.8281, "step": 5286 }, { "epoch": 3.5013245033112583, "grad_norm": 0.9569351129318284, "learning_rate": 0.00016618810645021103, "loss": 1.6875, "step": 5287 }, { "epoch": 3.5019867549668873, "grad_norm": 0.9321104000352731, "learning_rate": 0.00016612174697016948, "loss": 1.6172, "step": 5288 }, { "epoch": 3.5026490066225167, "grad_norm": 0.9651616673486624, "learning_rate": 0.00016605538429783728, "loss": 1.8516, "step": 5289 }, { "epoch": 3.5033112582781456, "grad_norm": 0.933445569041152, "learning_rate": 0.00016598901844635484, "loss": 1.6953, "step": 5290 }, { "epoch": 3.503973509933775, "grad_norm": 0.901934301030569, "learning_rate": 0.00016592264942886348, "loss": 1.6328, "step": 5291 }, { "epoch": 3.504635761589404, "grad_norm": 0.8250077250213438, "learning_rate": 0.00016585627725850498, "loss": 1.5234, "step": 5292 }, { "epoch": 3.5052980132450333, "grad_norm": 0.8320573737159552, "learning_rate": 0.0001657899019484218, "loss": 1.4531, "step": 5293 }, { "epoch": 3.5059602649006623, "grad_norm": 0.9353399210580502, "learning_rate": 0.00016572352351175697, "loss": 1.6797, "step": 5294 }, { "epoch": 3.506622516556291, "grad_norm": 1.003243494706625, "learning_rate": 0.00016565714196165428, "loss": 1.8516, "step": 5295 }, { "epoch": 3.5072847682119206, "grad_norm": 0.9401139521924904, "learning_rate": 0.0001655907573112579, "loss": 1.8203, "step": 5296 }, { "epoch": 3.5079470198675495, "grad_norm": 1.018118386901662, "learning_rate": 0.00016552436957371292, "loss": 1.8438, "step": 5297 }, { "epoch": 3.508609271523179, "grad_norm": 0.9213721937148905, "learning_rate": 0.0001654579787621647, "loss": 1.5859, "step": 5298 }, { "epoch": 3.509271523178808, "grad_norm": 0.8482252684075946, "learning_rate": 0.0001653915848897595, "loss": 1.4844, "step": 5299 }, { "epoch": 3.5099337748344372, "grad_norm": 0.9403256391577341, "learning_rate": 0.00016532518796964408, "loss": 1.6562, "step": 5300 }, { "epoch": 3.510596026490066, "grad_norm": 1.0416924243612968, "learning_rate": 0.0001652587880149657, "loss": 2.0469, "step": 5301 }, { "epoch": 3.5112582781456956, "grad_norm": 0.9416614362980242, "learning_rate": 0.00016519238503887235, "loss": 1.6875, "step": 5302 }, { "epoch": 3.5119205298013245, "grad_norm": 0.9406254505140788, "learning_rate": 0.0001651259790545126, "loss": 1.7188, "step": 5303 }, { "epoch": 3.5125827814569535, "grad_norm": 0.9774319316061385, "learning_rate": 0.0001650595700750356, "loss": 1.8906, "step": 5304 }, { "epoch": 3.513245033112583, "grad_norm": 0.9867103464541125, "learning_rate": 0.00016499315811359104, "loss": 1.7031, "step": 5305 }, { "epoch": 3.513907284768212, "grad_norm": 0.9589847928558287, "learning_rate": 0.00016492674318332933, "loss": 1.7188, "step": 5306 }, { "epoch": 3.514569536423841, "grad_norm": 0.8966152008905183, "learning_rate": 0.00016486032529740128, "loss": 1.5859, "step": 5307 }, { "epoch": 3.51523178807947, "grad_norm": 0.7386399050134128, "learning_rate": 0.00016479390446895852, "loss": 1.3359, "step": 5308 }, { "epoch": 3.515894039735099, "grad_norm": 0.823873002611102, "learning_rate": 0.00016472748071115302, "loss": 1.4219, "step": 5309 }, { "epoch": 3.5165562913907285, "grad_norm": 0.9215419416073001, "learning_rate": 0.00016466105403713748, "loss": 1.625, "step": 5310 }, { "epoch": 3.517218543046358, "grad_norm": 0.9229146485782297, "learning_rate": 0.00016459462446006515, "loss": 1.6094, "step": 5311 }, { "epoch": 3.517880794701987, "grad_norm": 1.0142740466052895, "learning_rate": 0.00016452819199308993, "loss": 1.7969, "step": 5312 }, { "epoch": 3.5185430463576157, "grad_norm": 1.0373773911987614, "learning_rate": 0.0001644617566493661, "loss": 1.7734, "step": 5313 }, { "epoch": 3.519205298013245, "grad_norm": 1.03877460628737, "learning_rate": 0.00016439531844204862, "loss": 2.0156, "step": 5314 }, { "epoch": 3.519867549668874, "grad_norm": 0.9573624856950872, "learning_rate": 0.00016432887738429308, "loss": 1.7578, "step": 5315 }, { "epoch": 3.5205298013245034, "grad_norm": 0.972271586718286, "learning_rate": 0.00016426243348925554, "loss": 1.8594, "step": 5316 }, { "epoch": 3.5211920529801324, "grad_norm": 1.0027442672793094, "learning_rate": 0.0001641959867700927, "loss": 1.7188, "step": 5317 }, { "epoch": 3.5218543046357613, "grad_norm": 0.9117405587575127, "learning_rate": 0.00016412953723996166, "loss": 1.6875, "step": 5318 }, { "epoch": 3.5225165562913907, "grad_norm": 0.9127624287298804, "learning_rate": 0.00016406308491202028, "loss": 1.5469, "step": 5319 }, { "epoch": 3.52317880794702, "grad_norm": 0.8596008269240704, "learning_rate": 0.0001639966297994269, "loss": 1.5234, "step": 5320 }, { "epoch": 3.523841059602649, "grad_norm": 0.9400024357350738, "learning_rate": 0.00016393017191534024, "loss": 1.7109, "step": 5321 }, { "epoch": 3.524503311258278, "grad_norm": 0.9319391549211633, "learning_rate": 0.00016386371127291994, "loss": 1.5234, "step": 5322 }, { "epoch": 3.5251655629139074, "grad_norm": 0.9176192087423477, "learning_rate": 0.00016379724788532585, "loss": 1.6641, "step": 5323 }, { "epoch": 3.5258278145695363, "grad_norm": 0.9478734226947579, "learning_rate": 0.00016373078176571846, "loss": 1.6719, "step": 5324 }, { "epoch": 3.5264900662251657, "grad_norm": 0.9740121291970824, "learning_rate": 0.0001636643129272589, "loss": 1.8203, "step": 5325 }, { "epoch": 3.5271523178807946, "grad_norm": 0.9670634309004048, "learning_rate": 0.00016359784138310872, "loss": 1.8281, "step": 5326 }, { "epoch": 3.5278145695364236, "grad_norm": 1.1150831716696257, "learning_rate": 0.00016353136714643003, "loss": 1.8906, "step": 5327 }, { "epoch": 3.528476821192053, "grad_norm": 1.0700857593354138, "learning_rate": 0.0001634648902303855, "loss": 1.7578, "step": 5328 }, { "epoch": 3.5291390728476824, "grad_norm": 1.0058119883684582, "learning_rate": 0.0001633984106481384, "loss": 1.8516, "step": 5329 }, { "epoch": 3.5298013245033113, "grad_norm": 1.0434992788607969, "learning_rate": 0.00016333192841285233, "loss": 2.0625, "step": 5330 }, { "epoch": 3.5304635761589402, "grad_norm": 0.9012628605402628, "learning_rate": 0.00016326544353769158, "loss": 1.6562, "step": 5331 }, { "epoch": 3.5311258278145696, "grad_norm": 0.8322996338329111, "learning_rate": 0.00016319895603582096, "loss": 1.4688, "step": 5332 }, { "epoch": 3.5317880794701986, "grad_norm": 0.9008661495937892, "learning_rate": 0.00016313246592040571, "loss": 1.6797, "step": 5333 }, { "epoch": 3.532450331125828, "grad_norm": 0.9187321145513938, "learning_rate": 0.00016306597320461165, "loss": 1.75, "step": 5334 }, { "epoch": 3.533112582781457, "grad_norm": 0.9478436478403233, "learning_rate": 0.0001629994779016051, "loss": 1.8125, "step": 5335 }, { "epoch": 3.533774834437086, "grad_norm": 0.9718965375396881, "learning_rate": 0.00016293298002455287, "loss": 1.6406, "step": 5336 }, { "epoch": 3.5344370860927152, "grad_norm": 0.8292936538070134, "learning_rate": 0.00016286647958662235, "loss": 1.3906, "step": 5337 }, { "epoch": 3.5350993377483446, "grad_norm": 0.9983472822966541, "learning_rate": 0.00016279997660098138, "loss": 1.7422, "step": 5338 }, { "epoch": 3.5357615894039736, "grad_norm": 0.9525656553610059, "learning_rate": 0.00016273347108079823, "loss": 1.7656, "step": 5339 }, { "epoch": 3.5364238410596025, "grad_norm": 0.9207529051251476, "learning_rate": 0.0001626669630392419, "loss": 1.6641, "step": 5340 }, { "epoch": 3.537086092715232, "grad_norm": 0.8906035639535697, "learning_rate": 0.00016260045248948162, "loss": 1.6094, "step": 5341 }, { "epoch": 3.537748344370861, "grad_norm": 0.9652991120405814, "learning_rate": 0.0001625339394446873, "loss": 1.7422, "step": 5342 }, { "epoch": 3.53841059602649, "grad_norm": 0.9870217413823394, "learning_rate": 0.00016246742391802928, "loss": 1.7344, "step": 5343 }, { "epoch": 3.539072847682119, "grad_norm": 0.9189682503745589, "learning_rate": 0.0001624009059226784, "loss": 1.6094, "step": 5344 }, { "epoch": 3.539735099337748, "grad_norm": 1.0390735357272625, "learning_rate": 0.00016233438547180596, "loss": 1.7734, "step": 5345 }, { "epoch": 3.5403973509933775, "grad_norm": 1.0241092171949788, "learning_rate": 0.00016226786257858377, "loss": 1.7422, "step": 5346 }, { "epoch": 3.541059602649007, "grad_norm": 0.922730632725712, "learning_rate": 0.00016220133725618416, "loss": 1.5469, "step": 5347 }, { "epoch": 3.541721854304636, "grad_norm": 0.9616496064607529, "learning_rate": 0.00016213480951777988, "loss": 1.6641, "step": 5348 }, { "epoch": 3.5423841059602648, "grad_norm": 1.083854411177325, "learning_rate": 0.00016206827937654422, "loss": 2.1094, "step": 5349 }, { "epoch": 3.543046357615894, "grad_norm": 0.9185236640848786, "learning_rate": 0.00016200174684565086, "loss": 1.6406, "step": 5350 }, { "epoch": 3.543708609271523, "grad_norm": 0.9771546885209988, "learning_rate": 0.00016193521193827405, "loss": 1.75, "step": 5351 }, { "epoch": 3.5443708609271525, "grad_norm": 1.0834340016136035, "learning_rate": 0.00016186867466758841, "loss": 2.0469, "step": 5352 }, { "epoch": 3.5450331125827814, "grad_norm": 0.8663187373358745, "learning_rate": 0.00016180213504676913, "loss": 1.5, "step": 5353 }, { "epoch": 3.5456953642384104, "grad_norm": 0.927913074240894, "learning_rate": 0.0001617355930889918, "loss": 1.75, "step": 5354 }, { "epoch": 3.5463576158940397, "grad_norm": 0.9486252595529084, "learning_rate": 0.00016166904880743254, "loss": 1.6094, "step": 5355 }, { "epoch": 3.547019867549669, "grad_norm": 0.965354088632048, "learning_rate": 0.00016160250221526775, "loss": 1.875, "step": 5356 }, { "epoch": 3.547682119205298, "grad_norm": 0.8338428783457262, "learning_rate": 0.0001615359533256746, "loss": 1.5234, "step": 5357 }, { "epoch": 3.548344370860927, "grad_norm": 0.9130006045409188, "learning_rate": 0.00016146940215183036, "loss": 1.7812, "step": 5358 }, { "epoch": 3.5490066225165564, "grad_norm": 1.0145154158734289, "learning_rate": 0.00016140284870691301, "loss": 2.0312, "step": 5359 }, { "epoch": 3.5496688741721854, "grad_norm": 0.8876100411554546, "learning_rate": 0.0001613362930041009, "loss": 1.6719, "step": 5360 }, { "epoch": 3.5503311258278147, "grad_norm": 0.9418101170194497, "learning_rate": 0.00016126973505657284, "loss": 1.6797, "step": 5361 }, { "epoch": 3.5509933774834437, "grad_norm": 0.9490533025987453, "learning_rate": 0.00016120317487750802, "loss": 1.6641, "step": 5362 }, { "epoch": 3.5516556291390726, "grad_norm": 0.874417609386111, "learning_rate": 0.0001611366124800861, "loss": 1.5469, "step": 5363 }, { "epoch": 3.552317880794702, "grad_norm": 0.9573053560230277, "learning_rate": 0.00016107004787748727, "loss": 1.6797, "step": 5364 }, { "epoch": 3.5529801324503314, "grad_norm": 1.0297934741878627, "learning_rate": 0.00016100348108289206, "loss": 1.7812, "step": 5365 }, { "epoch": 3.5536423841059603, "grad_norm": 0.8841883924055438, "learning_rate": 0.00016093691210948137, "loss": 1.5938, "step": 5366 }, { "epoch": 3.5543046357615893, "grad_norm": 0.9894702599579677, "learning_rate": 0.00016087034097043678, "loss": 1.8438, "step": 5367 }, { "epoch": 3.5549668874172187, "grad_norm": 0.990306523487853, "learning_rate": 0.00016080376767893997, "loss": 1.7969, "step": 5368 }, { "epoch": 3.5556291390728476, "grad_norm": 0.9899658849023713, "learning_rate": 0.00016073719224817337, "loss": 1.7891, "step": 5369 }, { "epoch": 3.556291390728477, "grad_norm": 0.8311432536383253, "learning_rate": 0.00016067061469131956, "loss": 1.4141, "step": 5370 }, { "epoch": 3.556953642384106, "grad_norm": 0.7984629533567666, "learning_rate": 0.00016060403502156172, "loss": 1.4375, "step": 5371 }, { "epoch": 3.557615894039735, "grad_norm": 0.967955308474612, "learning_rate": 0.0001605374532520834, "loss": 1.7031, "step": 5372 }, { "epoch": 3.5582781456953643, "grad_norm": 0.9761527886101768, "learning_rate": 0.00016047086939606852, "loss": 1.7422, "step": 5373 }, { "epoch": 3.558940397350993, "grad_norm": 0.9340661407231386, "learning_rate": 0.00016040428346670145, "loss": 1.6094, "step": 5374 }, { "epoch": 3.5596026490066226, "grad_norm": 0.9783495099741326, "learning_rate": 0.000160337695477167, "loss": 1.7344, "step": 5375 }, { "epoch": 3.5602649006622515, "grad_norm": 0.9359563624768892, "learning_rate": 0.00016027110544065024, "loss": 1.5859, "step": 5376 }, { "epoch": 3.560927152317881, "grad_norm": 0.9450339624509777, "learning_rate": 0.0001602045133703369, "loss": 1.6016, "step": 5377 }, { "epoch": 3.56158940397351, "grad_norm": 0.9628700639638467, "learning_rate": 0.00016013791927941293, "loss": 1.6719, "step": 5378 }, { "epoch": 3.5622516556291393, "grad_norm": 1.0251631613773962, "learning_rate": 0.00016007132318106472, "loss": 1.8594, "step": 5379 }, { "epoch": 3.562913907284768, "grad_norm": 0.9996603208388756, "learning_rate": 0.00016000472508847903, "loss": 1.5703, "step": 5380 }, { "epoch": 3.563576158940397, "grad_norm": 0.8910190273271855, "learning_rate": 0.0001599381250148431, "loss": 1.5, "step": 5381 }, { "epoch": 3.5642384105960265, "grad_norm": 0.9299813235358839, "learning_rate": 0.00015987152297334447, "loss": 1.5391, "step": 5382 }, { "epoch": 3.5649006622516555, "grad_norm": 1.0491675738589996, "learning_rate": 0.0001598049189771711, "loss": 1.6797, "step": 5383 }, { "epoch": 3.565562913907285, "grad_norm": 0.975461702275182, "learning_rate": 0.00015973831303951142, "loss": 1.5938, "step": 5384 }, { "epoch": 3.566225165562914, "grad_norm": 1.0353038114152482, "learning_rate": 0.00015967170517355405, "loss": 1.7031, "step": 5385 }, { "epoch": 3.566887417218543, "grad_norm": 0.9648528974811715, "learning_rate": 0.0001596050953924882, "loss": 1.5625, "step": 5386 }, { "epoch": 3.567549668874172, "grad_norm": 0.9084991596426929, "learning_rate": 0.00015953848370950334, "loss": 1.5781, "step": 5387 }, { "epoch": 3.5682119205298015, "grad_norm": 0.9983643976619423, "learning_rate": 0.00015947187013778933, "loss": 1.6797, "step": 5388 }, { "epoch": 3.5688741721854305, "grad_norm": 1.0358335739726412, "learning_rate": 0.00015940525469053645, "loss": 1.7266, "step": 5389 }, { "epoch": 3.5695364238410594, "grad_norm": 0.8919323314061441, "learning_rate": 0.00015933863738093533, "loss": 1.4141, "step": 5390 }, { "epoch": 3.570198675496689, "grad_norm": 0.9381757175654588, "learning_rate": 0.00015927201822217696, "loss": 1.4922, "step": 5391 }, { "epoch": 3.5708609271523177, "grad_norm": 1.0589568980792632, "learning_rate": 0.00015920539722745264, "loss": 1.8125, "step": 5392 }, { "epoch": 3.571523178807947, "grad_norm": 1.0028699686031068, "learning_rate": 0.00015913877440995415, "loss": 1.7344, "step": 5393 }, { "epoch": 3.572185430463576, "grad_norm": 1.036546700263265, "learning_rate": 0.00015907214978287356, "loss": 1.7734, "step": 5394 }, { "epoch": 3.5728476821192054, "grad_norm": 0.9379696953110752, "learning_rate": 0.00015900552335940327, "loss": 1.5625, "step": 5395 }, { "epoch": 3.5735099337748344, "grad_norm": 0.9273931399306699, "learning_rate": 0.00015893889515273618, "loss": 1.5234, "step": 5396 }, { "epoch": 3.5741721854304638, "grad_norm": 0.9561556912096334, "learning_rate": 0.0001588722651760653, "loss": 1.625, "step": 5397 }, { "epoch": 3.5748344370860927, "grad_norm": 0.9592435514233916, "learning_rate": 0.00015880563344258423, "loss": 1.7109, "step": 5398 }, { "epoch": 3.5754966887417217, "grad_norm": 0.9792211152231995, "learning_rate": 0.00015873899996548678, "loss": 1.9297, "step": 5399 }, { "epoch": 3.576158940397351, "grad_norm": 0.9881906058986323, "learning_rate": 0.00015867236475796719, "loss": 1.7422, "step": 5400 }, { "epoch": 3.57682119205298, "grad_norm": 0.9391093415271614, "learning_rate": 0.00015860572783321989, "loss": 1.6641, "step": 5401 }, { "epoch": 3.5774834437086094, "grad_norm": 0.9246299320589654, "learning_rate": 0.00015853908920443984, "loss": 1.6562, "step": 5402 }, { "epoch": 3.5781456953642383, "grad_norm": 0.8889564698604083, "learning_rate": 0.0001584724488848222, "loss": 1.4766, "step": 5403 }, { "epoch": 3.5788079470198677, "grad_norm": 0.9876388835322506, "learning_rate": 0.00015840580688756262, "loss": 1.8516, "step": 5404 }, { "epoch": 3.5794701986754967, "grad_norm": 0.9859119781425864, "learning_rate": 0.0001583391632258568, "loss": 1.6719, "step": 5405 }, { "epoch": 3.580132450331126, "grad_norm": 1.0146871885059339, "learning_rate": 0.00015827251791290114, "loss": 1.8828, "step": 5406 }, { "epoch": 3.580794701986755, "grad_norm": 0.942691073926143, "learning_rate": 0.0001582058709618921, "loss": 1.6094, "step": 5407 }, { "epoch": 3.581456953642384, "grad_norm": 1.0201317386083246, "learning_rate": 0.00015813922238602652, "loss": 1.7812, "step": 5408 }, { "epoch": 3.5821192052980133, "grad_norm": 0.8919510088579388, "learning_rate": 0.00015807257219850157, "loss": 1.5703, "step": 5409 }, { "epoch": 3.5827814569536423, "grad_norm": 0.9469981899054225, "learning_rate": 0.00015800592041251478, "loss": 1.7109, "step": 5410 }, { "epoch": 3.5834437086092716, "grad_norm": 0.9967146262916864, "learning_rate": 0.00015793926704126397, "loss": 1.7656, "step": 5411 }, { "epoch": 3.5841059602649006, "grad_norm": 0.9841558606116027, "learning_rate": 0.00015787261209794723, "loss": 1.7578, "step": 5412 }, { "epoch": 3.58476821192053, "grad_norm": 1.042073372583229, "learning_rate": 0.00015780595559576312, "loss": 1.8828, "step": 5413 }, { "epoch": 3.585430463576159, "grad_norm": 1.0103138569457304, "learning_rate": 0.00015773929754791025, "loss": 1.7578, "step": 5414 }, { "epoch": 3.5860927152317883, "grad_norm": 1.0721599492092924, "learning_rate": 0.00015767263796758772, "loss": 1.9375, "step": 5415 }, { "epoch": 3.5867549668874172, "grad_norm": 0.8190898500881091, "learning_rate": 0.00015760597686799495, "loss": 1.4688, "step": 5416 }, { "epoch": 3.587417218543046, "grad_norm": 0.9644837922021989, "learning_rate": 0.00015753931426233154, "loss": 1.6484, "step": 5417 }, { "epoch": 3.5880794701986756, "grad_norm": 0.8153412020978149, "learning_rate": 0.00015747265016379745, "loss": 1.4766, "step": 5418 }, { "epoch": 3.5887417218543045, "grad_norm": 0.9515821289026316, "learning_rate": 0.00015740598458559302, "loss": 1.8516, "step": 5419 }, { "epoch": 3.589403973509934, "grad_norm": 0.8946083230115717, "learning_rate": 0.00015733931754091865, "loss": 1.5781, "step": 5420 }, { "epoch": 3.590066225165563, "grad_norm": 0.9707668224994146, "learning_rate": 0.00015727264904297534, "loss": 1.8125, "step": 5421 }, { "epoch": 3.590728476821192, "grad_norm": 0.8760502090432105, "learning_rate": 0.00015720597910496412, "loss": 1.7109, "step": 5422 }, { "epoch": 3.591390728476821, "grad_norm": 0.9384935661429146, "learning_rate": 0.0001571393077400864, "loss": 1.6406, "step": 5423 }, { "epoch": 3.5920529801324506, "grad_norm": 0.8972093627877783, "learning_rate": 0.0001570726349615439, "loss": 1.4766, "step": 5424 }, { "epoch": 3.5927152317880795, "grad_norm": 1.003480833521167, "learning_rate": 0.00015700596078253864, "loss": 1.7891, "step": 5425 }, { "epoch": 3.5933774834437084, "grad_norm": 1.0478319064574422, "learning_rate": 0.00015693928521627278, "loss": 1.7656, "step": 5426 }, { "epoch": 3.594039735099338, "grad_norm": 1.1018321155042545, "learning_rate": 0.0001568726082759489, "loss": 1.9219, "step": 5427 }, { "epoch": 3.5947019867549668, "grad_norm": 0.9772783293616389, "learning_rate": 0.0001568059299747698, "loss": 1.8203, "step": 5428 }, { "epoch": 3.595364238410596, "grad_norm": 0.9506712134957765, "learning_rate": 0.00015673925032593856, "loss": 1.7266, "step": 5429 }, { "epoch": 3.596026490066225, "grad_norm": 0.9121235943930099, "learning_rate": 0.00015667256934265842, "loss": 1.6328, "step": 5430 }, { "epoch": 3.596688741721854, "grad_norm": 1.0679961223352659, "learning_rate": 0.00015660588703813314, "loss": 1.8828, "step": 5431 }, { "epoch": 3.5973509933774834, "grad_norm": 0.9892472136282454, "learning_rate": 0.00015653920342556638, "loss": 1.7422, "step": 5432 }, { "epoch": 3.598013245033113, "grad_norm": 0.9590418855478537, "learning_rate": 0.0001564725185181625, "loss": 1.7812, "step": 5433 }, { "epoch": 3.5986754966887418, "grad_norm": 0.9473797454728158, "learning_rate": 0.00015640583232912566, "loss": 1.5938, "step": 5434 }, { "epoch": 3.5993377483443707, "grad_norm": 0.9859570800238129, "learning_rate": 0.00015633914487166062, "loss": 1.7734, "step": 5435 }, { "epoch": 3.6, "grad_norm": 1.0247291426891405, "learning_rate": 0.0001562724561589722, "loss": 1.8359, "step": 5436 }, { "epoch": 3.600662251655629, "grad_norm": 0.9919425626737989, "learning_rate": 0.00015620576620426556, "loss": 1.7109, "step": 5437 }, { "epoch": 3.6013245033112584, "grad_norm": 0.8605264856257943, "learning_rate": 0.00015613907502074606, "loss": 1.5, "step": 5438 }, { "epoch": 3.6019867549668874, "grad_norm": 0.9779958200009906, "learning_rate": 0.00015607238262161933, "loss": 1.7109, "step": 5439 }, { "epoch": 3.6026490066225163, "grad_norm": 1.0233186906882359, "learning_rate": 0.0001560056890200912, "loss": 1.7969, "step": 5440 }, { "epoch": 3.6033112582781457, "grad_norm": 0.9201585045169206, "learning_rate": 0.00015593899422936778, "loss": 1.5703, "step": 5441 }, { "epoch": 3.603973509933775, "grad_norm": 0.959662461867005, "learning_rate": 0.0001558722982626555, "loss": 1.6172, "step": 5442 }, { "epoch": 3.604635761589404, "grad_norm": 1.0150906420016441, "learning_rate": 0.00015580560113316076, "loss": 1.9688, "step": 5443 }, { "epoch": 3.605298013245033, "grad_norm": 0.8891405051120231, "learning_rate": 0.00015573890285409047, "loss": 1.5938, "step": 5444 }, { "epoch": 3.6059602649006623, "grad_norm": 0.9009181762928509, "learning_rate": 0.0001556722034386516, "loss": 1.5781, "step": 5445 }, { "epoch": 3.6066225165562913, "grad_norm": 1.1155995215838945, "learning_rate": 0.00015560550290005145, "loss": 1.9141, "step": 5446 }, { "epoch": 3.6072847682119207, "grad_norm": 1.0620802518976755, "learning_rate": 0.00015553880125149738, "loss": 1.9531, "step": 5447 }, { "epoch": 3.6079470198675496, "grad_norm": 0.8224461386855975, "learning_rate": 0.00015547209850619726, "loss": 1.3203, "step": 5448 }, { "epoch": 3.6086092715231786, "grad_norm": 0.9598294075484233, "learning_rate": 0.00015540539467735886, "loss": 1.7344, "step": 5449 }, { "epoch": 3.609271523178808, "grad_norm": 0.9778144971954751, "learning_rate": 0.0001553386897781903, "loss": 1.6484, "step": 5450 }, { "epoch": 3.6099337748344373, "grad_norm": 1.0048426328830289, "learning_rate": 0.00015527198382190002, "loss": 1.7969, "step": 5451 }, { "epoch": 3.6105960264900663, "grad_norm": 1.0005411540448157, "learning_rate": 0.0001552052768216965, "loss": 1.75, "step": 5452 }, { "epoch": 3.611258278145695, "grad_norm": 0.8560394317924507, "learning_rate": 0.00015513856879078845, "loss": 1.5234, "step": 5453 }, { "epoch": 3.6119205298013246, "grad_norm": 0.9774997545993043, "learning_rate": 0.00015507185974238493, "loss": 1.7734, "step": 5454 }, { "epoch": 3.6125827814569536, "grad_norm": 1.0343880114248794, "learning_rate": 0.00015500514968969496, "loss": 1.8906, "step": 5455 }, { "epoch": 3.613245033112583, "grad_norm": 0.9345352589876579, "learning_rate": 0.000154938438645928, "loss": 1.6016, "step": 5456 }, { "epoch": 3.613907284768212, "grad_norm": 0.93513857442134, "learning_rate": 0.0001548717266242936, "loss": 1.4766, "step": 5457 }, { "epoch": 3.614569536423841, "grad_norm": 1.0215768853711904, "learning_rate": 0.00015480501363800145, "loss": 1.6328, "step": 5458 }, { "epoch": 3.61523178807947, "grad_norm": 1.0812296879841494, "learning_rate": 0.00015473829970026154, "loss": 1.7734, "step": 5459 }, { "epoch": 3.6158940397350996, "grad_norm": 1.01091091373328, "learning_rate": 0.00015467158482428394, "loss": 1.6797, "step": 5460 }, { "epoch": 3.6165562913907285, "grad_norm": 1.0465564299690684, "learning_rate": 0.00015460486902327898, "loss": 1.8281, "step": 5461 }, { "epoch": 3.6172185430463575, "grad_norm": 0.982072133039844, "learning_rate": 0.00015453815231045718, "loss": 1.6094, "step": 5462 }, { "epoch": 3.617880794701987, "grad_norm": 0.9655544635893837, "learning_rate": 0.00015447143469902918, "loss": 1.5547, "step": 5463 }, { "epoch": 3.618543046357616, "grad_norm": 1.0164231452862569, "learning_rate": 0.00015440471620220583, "loss": 1.8047, "step": 5464 }, { "epoch": 3.619205298013245, "grad_norm": 0.9953279295614368, "learning_rate": 0.0001543379968331982, "loss": 1.6875, "step": 5465 }, { "epoch": 3.619867549668874, "grad_norm": 1.0639397585683934, "learning_rate": 0.00015427127660521748, "loss": 1.8828, "step": 5466 }, { "epoch": 3.620529801324503, "grad_norm": 0.9617729731907882, "learning_rate": 0.000154204555531475, "loss": 1.8047, "step": 5467 }, { "epoch": 3.6211920529801325, "grad_norm": 0.9457230297282906, "learning_rate": 0.00015413783362518233, "loss": 1.5703, "step": 5468 }, { "epoch": 3.621854304635762, "grad_norm": 0.8805514993806806, "learning_rate": 0.00015407111089955116, "loss": 1.4375, "step": 5469 }, { "epoch": 3.622516556291391, "grad_norm": 0.9964779430225779, "learning_rate": 0.00015400438736779332, "loss": 1.75, "step": 5470 }, { "epoch": 3.6231788079470197, "grad_norm": 0.8904535470639685, "learning_rate": 0.0001539376630431209, "loss": 1.4844, "step": 5471 }, { "epoch": 3.623841059602649, "grad_norm": 0.9349304044804878, "learning_rate": 0.00015387093793874606, "loss": 1.6797, "step": 5472 }, { "epoch": 3.624503311258278, "grad_norm": 1.0242792948477977, "learning_rate": 0.00015380421206788114, "loss": 1.7266, "step": 5473 }, { "epoch": 3.6251655629139075, "grad_norm": 0.9918116265299677, "learning_rate": 0.00015373748544373863, "loss": 1.7734, "step": 5474 }, { "epoch": 3.6258278145695364, "grad_norm": 0.9333770872602772, "learning_rate": 0.00015367075807953113, "loss": 1.7578, "step": 5475 }, { "epoch": 3.6264900662251653, "grad_norm": 0.8887581458336022, "learning_rate": 0.00015360402998847142, "loss": 1.5938, "step": 5476 }, { "epoch": 3.6271523178807947, "grad_norm": 0.9221880761289662, "learning_rate": 0.0001535373011837725, "loss": 1.6797, "step": 5477 }, { "epoch": 3.627814569536424, "grad_norm": 0.874205702348191, "learning_rate": 0.00015347057167864735, "loss": 1.4531, "step": 5478 }, { "epoch": 3.628476821192053, "grad_norm": 1.0475042398995513, "learning_rate": 0.00015340384148630924, "loss": 1.75, "step": 5479 }, { "epoch": 3.629139072847682, "grad_norm": 0.9773058547495003, "learning_rate": 0.00015333711061997152, "loss": 1.5156, "step": 5480 }, { "epoch": 3.6298013245033114, "grad_norm": 1.0526254354535474, "learning_rate": 0.0001532703790928476, "loss": 1.7344, "step": 5481 }, { "epoch": 3.6304635761589403, "grad_norm": 1.0073689996216701, "learning_rate": 0.0001532036469181511, "loss": 1.6875, "step": 5482 }, { "epoch": 3.6311258278145697, "grad_norm": 0.875935510454572, "learning_rate": 0.00015313691410909588, "loss": 1.3125, "step": 5483 }, { "epoch": 3.6317880794701987, "grad_norm": 0.9027009282184744, "learning_rate": 0.0001530701806788956, "loss": 1.5078, "step": 5484 }, { "epoch": 3.6324503311258276, "grad_norm": 0.8371768737462985, "learning_rate": 0.00015300344664076438, "loss": 1.3672, "step": 5485 }, { "epoch": 3.633112582781457, "grad_norm": 0.9766885151956994, "learning_rate": 0.0001529367120079163, "loss": 1.5938, "step": 5486 }, { "epoch": 3.633774834437086, "grad_norm": 1.0991650990996824, "learning_rate": 0.0001528699767935656, "loss": 2.0625, "step": 5487 }, { "epoch": 3.6344370860927153, "grad_norm": 1.115275419392536, "learning_rate": 0.00015280324101092654, "loss": 1.8438, "step": 5488 }, { "epoch": 3.6350993377483443, "grad_norm": 1.0121744638211336, "learning_rate": 0.00015273650467321367, "loss": 1.8984, "step": 5489 }, { "epoch": 3.6357615894039736, "grad_norm": 0.9421789207267149, "learning_rate": 0.00015266976779364146, "loss": 1.5547, "step": 5490 }, { "epoch": 3.6364238410596026, "grad_norm": 0.9668593254691284, "learning_rate": 0.00015260303038542466, "loss": 1.7188, "step": 5491 }, { "epoch": 3.637086092715232, "grad_norm": 0.9556438256318691, "learning_rate": 0.00015253629246177793, "loss": 1.7734, "step": 5492 }, { "epoch": 3.637748344370861, "grad_norm": 0.8742868386223109, "learning_rate": 0.00015246955403591629, "loss": 1.5234, "step": 5493 }, { "epoch": 3.63841059602649, "grad_norm": 0.9602324700577255, "learning_rate": 0.00015240281512105463, "loss": 1.7109, "step": 5494 }, { "epoch": 3.6390728476821192, "grad_norm": 0.9328884559263386, "learning_rate": 0.00015233607573040801, "loss": 1.6797, "step": 5495 }, { "epoch": 3.639735099337748, "grad_norm": 0.9405996314645788, "learning_rate": 0.00015226933587719158, "loss": 1.5938, "step": 5496 }, { "epoch": 3.6403973509933776, "grad_norm": 0.9576571673499773, "learning_rate": 0.00015220259557462067, "loss": 1.5547, "step": 5497 }, { "epoch": 3.6410596026490065, "grad_norm": 1.050272470898764, "learning_rate": 0.00015213585483591057, "loss": 1.9219, "step": 5498 }, { "epoch": 3.641721854304636, "grad_norm": 0.9987198495981005, "learning_rate": 0.00015206911367427674, "loss": 1.7578, "step": 5499 }, { "epoch": 3.642384105960265, "grad_norm": 0.996248277055577, "learning_rate": 0.00015200237210293469, "loss": 1.625, "step": 5500 }, { "epoch": 3.6430463576158942, "grad_norm": 0.9590741462116509, "learning_rate": 0.00015193563013509998, "loss": 1.6016, "step": 5501 }, { "epoch": 3.643708609271523, "grad_norm": 0.91603837073998, "learning_rate": 0.00015186888778398832, "loss": 1.5312, "step": 5502 }, { "epoch": 3.644370860927152, "grad_norm": 0.9534665359162454, "learning_rate": 0.00015180214506281547, "loss": 1.6094, "step": 5503 }, { "epoch": 3.6450331125827815, "grad_norm": 0.9672835885698239, "learning_rate": 0.0001517354019847972, "loss": 1.6875, "step": 5504 }, { "epoch": 3.6456953642384105, "grad_norm": 1.0360052841837173, "learning_rate": 0.00015166865856314947, "loss": 1.7656, "step": 5505 }, { "epoch": 3.64635761589404, "grad_norm": 0.9298236501823403, "learning_rate": 0.00015160191481108822, "loss": 1.5703, "step": 5506 }, { "epoch": 3.647019867549669, "grad_norm": 0.8476777562742751, "learning_rate": 0.00015153517074182945, "loss": 1.3984, "step": 5507 }, { "epoch": 3.647682119205298, "grad_norm": 0.8970675329374492, "learning_rate": 0.0001514684263685893, "loss": 1.4922, "step": 5508 }, { "epoch": 3.648344370860927, "grad_norm": 0.9466460120849547, "learning_rate": 0.00015140168170458385, "loss": 1.5781, "step": 5509 }, { "epoch": 3.6490066225165565, "grad_norm": 0.9440707442843377, "learning_rate": 0.0001513349367630294, "loss": 1.7188, "step": 5510 }, { "epoch": 3.6496688741721854, "grad_norm": 1.0377063444488313, "learning_rate": 0.0001512681915571421, "loss": 1.7422, "step": 5511 }, { "epoch": 3.6503311258278144, "grad_norm": 0.9822735634082073, "learning_rate": 0.00015120144610013841, "loss": 1.625, "step": 5512 }, { "epoch": 3.6509933774834438, "grad_norm": 0.8639155918115943, "learning_rate": 0.00015113470040523458, "loss": 1.4453, "step": 5513 }, { "epoch": 3.6516556291390727, "grad_norm": 0.9374240610196501, "learning_rate": 0.00015106795448564708, "loss": 1.5625, "step": 5514 }, { "epoch": 3.652317880794702, "grad_norm": 1.1014439805689693, "learning_rate": 0.00015100120835459234, "loss": 1.8281, "step": 5515 }, { "epoch": 3.652980132450331, "grad_norm": 0.9814054303046156, "learning_rate": 0.00015093446202528685, "loss": 1.7344, "step": 5516 }, { "epoch": 3.6536423841059604, "grad_norm": 0.8684138356787554, "learning_rate": 0.0001508677155109472, "loss": 1.3984, "step": 5517 }, { "epoch": 3.6543046357615894, "grad_norm": 1.0439118104088005, "learning_rate": 0.0001508009688247899, "loss": 1.7266, "step": 5518 }, { "epoch": 3.6549668874172188, "grad_norm": 0.923249160212107, "learning_rate": 0.00015073422198003155, "loss": 1.5703, "step": 5519 }, { "epoch": 3.6556291390728477, "grad_norm": 1.0011446626755878, "learning_rate": 0.00015066747498988895, "loss": 1.5781, "step": 5520 }, { "epoch": 3.6562913907284766, "grad_norm": 1.039299280898437, "learning_rate": 0.00015060072786757852, "loss": 1.8281, "step": 5521 }, { "epoch": 3.656953642384106, "grad_norm": 1.0096593010718178, "learning_rate": 0.00015053398062631716, "loss": 1.7734, "step": 5522 }, { "epoch": 3.657615894039735, "grad_norm": 0.9821805777952813, "learning_rate": 0.0001504672332793215, "loss": 1.6875, "step": 5523 }, { "epoch": 3.6582781456953644, "grad_norm": 0.9079183521622091, "learning_rate": 0.0001504004858398083, "loss": 1.5, "step": 5524 }, { "epoch": 3.6589403973509933, "grad_norm": 1.1327686814064506, "learning_rate": 0.00015033373832099427, "loss": 1.8203, "step": 5525 }, { "epoch": 3.6596026490066222, "grad_norm": 0.9042112388775178, "learning_rate": 0.0001502669907360963, "loss": 1.5391, "step": 5526 }, { "epoch": 3.6602649006622516, "grad_norm": 1.0148392219883262, "learning_rate": 0.00015020024309833103, "loss": 1.7344, "step": 5527 }, { "epoch": 3.660927152317881, "grad_norm": 0.9219452470146404, "learning_rate": 0.00015013349542091539, "loss": 1.5391, "step": 5528 }, { "epoch": 3.66158940397351, "grad_norm": 0.9425000423485886, "learning_rate": 0.00015006674771706607, "loss": 1.8281, "step": 5529 }, { "epoch": 3.662251655629139, "grad_norm": 0.7583464420091842, "learning_rate": 0.00015, "loss": 1.2891, "step": 5530 }, { "epoch": 3.6629139072847683, "grad_norm": 1.0030341484290775, "learning_rate": 0.0001499332522829339, "loss": 1.7656, "step": 5531 }, { "epoch": 3.6635761589403972, "grad_norm": 1.0134292386614951, "learning_rate": 0.00014986650457908456, "loss": 1.7656, "step": 5532 }, { "epoch": 3.6642384105960266, "grad_norm": 1.0119688181796418, "learning_rate": 0.00014979975690166897, "loss": 1.8281, "step": 5533 }, { "epoch": 3.6649006622516556, "grad_norm": 0.9146532036052958, "learning_rate": 0.0001497330092639037, "loss": 1.5391, "step": 5534 }, { "epoch": 3.6655629139072845, "grad_norm": 0.8710071585930678, "learning_rate": 0.00014966626167900568, "loss": 1.4219, "step": 5535 }, { "epoch": 3.666225165562914, "grad_norm": 0.9279411330783591, "learning_rate": 0.0001495995141601917, "loss": 1.5625, "step": 5536 }, { "epoch": 3.6668874172185433, "grad_norm": 1.0350663930933217, "learning_rate": 0.00014953276672067847, "loss": 1.7031, "step": 5537 }, { "epoch": 3.667549668874172, "grad_norm": 1.0128820889897836, "learning_rate": 0.0001494660193736828, "loss": 1.8594, "step": 5538 }, { "epoch": 3.668211920529801, "grad_norm": 1.0317139319114441, "learning_rate": 0.00014939927213242145, "loss": 1.8203, "step": 5539 }, { "epoch": 3.6688741721854305, "grad_norm": 0.9491985540004312, "learning_rate": 0.00014933252501011108, "loss": 1.75, "step": 5540 }, { "epoch": 3.6695364238410595, "grad_norm": 1.0361653749675888, "learning_rate": 0.0001492657780199684, "loss": 1.75, "step": 5541 }, { "epoch": 3.670198675496689, "grad_norm": 1.0256050113708572, "learning_rate": 0.0001491990311752101, "loss": 1.8203, "step": 5542 }, { "epoch": 3.670860927152318, "grad_norm": 0.9881460390296565, "learning_rate": 0.00014913228448905285, "loss": 1.6953, "step": 5543 }, { "epoch": 3.6715231788079468, "grad_norm": 1.0443177739119482, "learning_rate": 0.0001490655379747131, "loss": 1.7969, "step": 5544 }, { "epoch": 3.672185430463576, "grad_norm": 0.9222403091905449, "learning_rate": 0.00014899879164540766, "loss": 1.6484, "step": 5545 }, { "epoch": 3.6728476821192055, "grad_norm": 0.945148596621292, "learning_rate": 0.00014893204551435292, "loss": 1.5312, "step": 5546 }, { "epoch": 3.6735099337748345, "grad_norm": 0.9288247341791913, "learning_rate": 0.00014886529959476537, "loss": 1.5547, "step": 5547 }, { "epoch": 3.6741721854304634, "grad_norm": 0.9952181484905053, "learning_rate": 0.00014879855389986156, "loss": 1.75, "step": 5548 }, { "epoch": 3.674834437086093, "grad_norm": 0.8988603331179746, "learning_rate": 0.0001487318084428579, "loss": 1.5625, "step": 5549 }, { "epoch": 3.6754966887417218, "grad_norm": 0.8599023511872526, "learning_rate": 0.0001486650632369706, "loss": 1.3281, "step": 5550 }, { "epoch": 3.676158940397351, "grad_norm": 0.9919892401932802, "learning_rate": 0.00014859831829541612, "loss": 1.5703, "step": 5551 }, { "epoch": 3.67682119205298, "grad_norm": 0.897263953060312, "learning_rate": 0.00014853157363141071, "loss": 1.4375, "step": 5552 }, { "epoch": 3.677483443708609, "grad_norm": 1.032144369501003, "learning_rate": 0.00014846482925817052, "loss": 1.7734, "step": 5553 }, { "epoch": 3.6781456953642384, "grad_norm": 0.9562754487394266, "learning_rate": 0.00014839808518891175, "loss": 1.6406, "step": 5554 }, { "epoch": 3.678807947019868, "grad_norm": 0.985348141173966, "learning_rate": 0.00014833134143685053, "loss": 1.5469, "step": 5555 }, { "epoch": 3.6794701986754967, "grad_norm": 1.0444345008922429, "learning_rate": 0.00014826459801520276, "loss": 1.8047, "step": 5556 }, { "epoch": 3.6801324503311257, "grad_norm": 1.002745888366712, "learning_rate": 0.00014819785493718453, "loss": 1.6875, "step": 5557 }, { "epoch": 3.680794701986755, "grad_norm": 0.974481678075266, "learning_rate": 0.00014813111221601168, "loss": 1.6172, "step": 5558 }, { "epoch": 3.681456953642384, "grad_norm": 1.1126230037973754, "learning_rate": 0.00014806436986490004, "loss": 1.8203, "step": 5559 }, { "epoch": 3.6821192052980134, "grad_norm": 1.0757588345701155, "learning_rate": 0.00014799762789706531, "loss": 1.7422, "step": 5560 }, { "epoch": 3.6827814569536423, "grad_norm": 0.9551034195794778, "learning_rate": 0.00014793088632572324, "loss": 1.4609, "step": 5561 }, { "epoch": 3.6834437086092713, "grad_norm": 1.0102837414680566, "learning_rate": 0.00014786414516408943, "loss": 1.7266, "step": 5562 }, { "epoch": 3.6841059602649007, "grad_norm": 0.9832592519926433, "learning_rate": 0.0001477974044253793, "loss": 1.7031, "step": 5563 }, { "epoch": 3.68476821192053, "grad_norm": 0.9345885251456978, "learning_rate": 0.0001477306641228084, "loss": 1.5234, "step": 5564 }, { "epoch": 3.685430463576159, "grad_norm": 0.9824194908956344, "learning_rate": 0.000147663924269592, "loss": 1.8203, "step": 5565 }, { "epoch": 3.686092715231788, "grad_norm": 0.9264013293335893, "learning_rate": 0.0001475971848789454, "loss": 1.625, "step": 5566 }, { "epoch": 3.6867549668874173, "grad_norm": 0.9382544908818071, "learning_rate": 0.00014753044596408371, "loss": 1.5547, "step": 5567 }, { "epoch": 3.6874172185430463, "grad_norm": 0.9088663435231644, "learning_rate": 0.00014746370753822204, "loss": 1.6172, "step": 5568 }, { "epoch": 3.6880794701986757, "grad_norm": 0.9252757998979266, "learning_rate": 0.00014739696961457534, "loss": 1.6562, "step": 5569 }, { "epoch": 3.6887417218543046, "grad_norm": 1.0094862449849653, "learning_rate": 0.00014733023220635857, "loss": 1.8438, "step": 5570 }, { "epoch": 3.6894039735099335, "grad_norm": 0.970846480457611, "learning_rate": 0.00014726349532678636, "loss": 1.6875, "step": 5571 }, { "epoch": 3.690066225165563, "grad_norm": 0.9444455615418398, "learning_rate": 0.00014719675898907343, "loss": 1.7031, "step": 5572 }, { "epoch": 3.6907284768211923, "grad_norm": 0.9980956811982235, "learning_rate": 0.00014713002320643442, "loss": 1.625, "step": 5573 }, { "epoch": 3.6913907284768213, "grad_norm": 1.0025500010848614, "learning_rate": 0.00014706328799208372, "loss": 1.8984, "step": 5574 }, { "epoch": 3.69205298013245, "grad_norm": 0.9323150777124989, "learning_rate": 0.00014699655335923556, "loss": 1.6328, "step": 5575 }, { "epoch": 3.6927152317880796, "grad_norm": 0.8944490473037413, "learning_rate": 0.00014692981932110439, "loss": 1.4531, "step": 5576 }, { "epoch": 3.6933774834437085, "grad_norm": 0.869931796322871, "learning_rate": 0.00014686308589090418, "loss": 1.4531, "step": 5577 }, { "epoch": 3.694039735099338, "grad_norm": 0.9703009668262551, "learning_rate": 0.00014679635308184884, "loss": 1.7109, "step": 5578 }, { "epoch": 3.694701986754967, "grad_norm": 0.9644091753060987, "learning_rate": 0.0001467296209071524, "loss": 1.6641, "step": 5579 }, { "epoch": 3.695364238410596, "grad_norm": 0.9035572053381756, "learning_rate": 0.0001466628893800285, "loss": 1.5469, "step": 5580 }, { "epoch": 3.696026490066225, "grad_norm": 1.017841226749287, "learning_rate": 0.0001465961585136907, "loss": 1.8594, "step": 5581 }, { "epoch": 3.6966887417218546, "grad_norm": 0.9989350431442365, "learning_rate": 0.00014652942832135265, "loss": 1.7266, "step": 5582 }, { "epoch": 3.6973509933774835, "grad_norm": 0.9403895232462992, "learning_rate": 0.00014646269881622752, "loss": 1.7188, "step": 5583 }, { "epoch": 3.6980132450331125, "grad_norm": 0.9739936708752042, "learning_rate": 0.00014639597001152855, "loss": 1.6406, "step": 5584 }, { "epoch": 3.698675496688742, "grad_norm": 0.9491168587978074, "learning_rate": 0.00014632924192046885, "loss": 1.7578, "step": 5585 }, { "epoch": 3.699337748344371, "grad_norm": 0.9897125475624153, "learning_rate": 0.0001462625145562614, "loss": 1.6016, "step": 5586 }, { "epoch": 3.7, "grad_norm": 0.8939728112319097, "learning_rate": 0.00014619578793211883, "loss": 1.5859, "step": 5587 }, { "epoch": 3.700662251655629, "grad_norm": 0.9705381869647484, "learning_rate": 0.00014612906206125392, "loss": 1.6484, "step": 5588 }, { "epoch": 3.701324503311258, "grad_norm": 0.9405711655891572, "learning_rate": 0.0001460623369568791, "loss": 1.5859, "step": 5589 }, { "epoch": 3.7019867549668874, "grad_norm": 1.0313009082623266, "learning_rate": 0.00014599561263220663, "loss": 1.7266, "step": 5590 }, { "epoch": 3.7026490066225164, "grad_norm": 0.9005687496376479, "learning_rate": 0.00014592888910044884, "loss": 1.4922, "step": 5591 }, { "epoch": 3.703311258278146, "grad_norm": 0.97008795498272, "learning_rate": 0.0001458621663748177, "loss": 1.6328, "step": 5592 }, { "epoch": 3.7039735099337747, "grad_norm": 0.9953498354116098, "learning_rate": 0.000145795444468525, "loss": 1.7422, "step": 5593 }, { "epoch": 3.704635761589404, "grad_norm": 0.8908695882478231, "learning_rate": 0.00014572872339478252, "loss": 1.4297, "step": 5594 }, { "epoch": 3.705298013245033, "grad_norm": 0.9761189124975352, "learning_rate": 0.00014566200316680177, "loss": 1.625, "step": 5595 }, { "epoch": 3.7059602649006624, "grad_norm": 0.9398596380609003, "learning_rate": 0.00014559528379779412, "loss": 1.6406, "step": 5596 }, { "epoch": 3.7066225165562914, "grad_norm": 0.9761935981793552, "learning_rate": 0.00014552856530097082, "loss": 1.6328, "step": 5597 }, { "epoch": 3.7072847682119203, "grad_norm": 1.0888091177479453, "learning_rate": 0.00014546184768954285, "loss": 1.875, "step": 5598 }, { "epoch": 3.7079470198675497, "grad_norm": 0.9579716210560884, "learning_rate": 0.000145395130976721, "loss": 1.7266, "step": 5599 }, { "epoch": 3.7086092715231787, "grad_norm": 1.128682628999998, "learning_rate": 0.00014532841517571603, "loss": 1.9922, "step": 5600 }, { "epoch": 3.709271523178808, "grad_norm": 0.8242983068854691, "learning_rate": 0.0001452617002997385, "loss": 1.3672, "step": 5601 }, { "epoch": 3.709933774834437, "grad_norm": 0.9482687352475686, "learning_rate": 0.0001451949863619985, "loss": 1.7578, "step": 5602 }, { "epoch": 3.7105960264900664, "grad_norm": 0.9925017791024802, "learning_rate": 0.0001451282733757064, "loss": 1.7109, "step": 5603 }, { "epoch": 3.7112582781456953, "grad_norm": 0.9561807048584128, "learning_rate": 0.00014506156135407196, "loss": 1.6562, "step": 5604 }, { "epoch": 3.7119205298013247, "grad_norm": 0.9526472922476253, "learning_rate": 0.00014499485031030498, "loss": 1.5547, "step": 5605 }, { "epoch": 3.7125827814569536, "grad_norm": 0.9827693551193025, "learning_rate": 0.00014492814025761504, "loss": 1.6953, "step": 5606 }, { "epoch": 3.7132450331125826, "grad_norm": 1.012678017791769, "learning_rate": 0.00014486143120921155, "loss": 1.8672, "step": 5607 }, { "epoch": 3.713907284768212, "grad_norm": 0.9254644204380189, "learning_rate": 0.00014479472317830347, "loss": 1.6016, "step": 5608 }, { "epoch": 3.714569536423841, "grad_norm": 0.8970856822627125, "learning_rate": 0.00014472801617809996, "loss": 1.5469, "step": 5609 }, { "epoch": 3.7152317880794703, "grad_norm": 0.9475262876547241, "learning_rate": 0.00014466131022180967, "loss": 1.6797, "step": 5610 }, { "epoch": 3.7158940397350992, "grad_norm": 0.8499995844607321, "learning_rate": 0.0001445946053226411, "loss": 1.4922, "step": 5611 }, { "epoch": 3.7165562913907286, "grad_norm": 0.9982125249649828, "learning_rate": 0.00014452790149380274, "loss": 1.7266, "step": 5612 }, { "epoch": 3.7172185430463576, "grad_norm": 0.9367850805374772, "learning_rate": 0.0001444611987485026, "loss": 1.5625, "step": 5613 }, { "epoch": 3.717880794701987, "grad_norm": 1.0543349519410439, "learning_rate": 0.00014439449709994858, "loss": 1.8828, "step": 5614 }, { "epoch": 3.718543046357616, "grad_norm": 1.0254683118674408, "learning_rate": 0.00014432779656134838, "loss": 1.8984, "step": 5615 }, { "epoch": 3.719205298013245, "grad_norm": 0.9233628097734294, "learning_rate": 0.00014426109714590956, "loss": 1.5938, "step": 5616 }, { "epoch": 3.7198675496688742, "grad_norm": 1.0153939976167983, "learning_rate": 0.00014419439886683927, "loss": 1.7344, "step": 5617 }, { "epoch": 3.720529801324503, "grad_norm": 1.0405714573072997, "learning_rate": 0.0001441277017373445, "loss": 1.7656, "step": 5618 }, { "epoch": 3.7211920529801326, "grad_norm": 0.885350648078471, "learning_rate": 0.0001440610057706322, "loss": 1.5, "step": 5619 }, { "epoch": 3.7218543046357615, "grad_norm": 1.016074204471749, "learning_rate": 0.0001439943109799088, "loss": 1.7891, "step": 5620 }, { "epoch": 3.722516556291391, "grad_norm": 1.0248513332024847, "learning_rate": 0.00014392761737838067, "loss": 1.9219, "step": 5621 }, { "epoch": 3.72317880794702, "grad_norm": 0.9449960291192131, "learning_rate": 0.00014386092497925394, "loss": 1.6328, "step": 5622 }, { "epoch": 3.723841059602649, "grad_norm": 0.9471670889436311, "learning_rate": 0.00014379423379573447, "loss": 1.6406, "step": 5623 }, { "epoch": 3.724503311258278, "grad_norm": 0.9386835968082387, "learning_rate": 0.0001437275438410278, "loss": 1.6641, "step": 5624 }, { "epoch": 3.725165562913907, "grad_norm": 0.941368694439891, "learning_rate": 0.00014366085512833939, "loss": 1.6094, "step": 5625 }, { "epoch": 3.7258278145695365, "grad_norm": 0.9960179578596038, "learning_rate": 0.00014359416767087434, "loss": 1.7578, "step": 5626 }, { "epoch": 3.7264900662251654, "grad_norm": 0.9685617809806073, "learning_rate": 0.0001435274814818375, "loss": 1.5469, "step": 5627 }, { "epoch": 3.727152317880795, "grad_norm": 0.9677780454351947, "learning_rate": 0.0001434607965744336, "loss": 1.6406, "step": 5628 }, { "epoch": 3.7278145695364238, "grad_norm": 0.8760744613168246, "learning_rate": 0.00014339411296186691, "loss": 1.4609, "step": 5629 }, { "epoch": 3.7284768211920527, "grad_norm": 1.017822928026569, "learning_rate": 0.00014332743065734155, "loss": 1.7734, "step": 5630 }, { "epoch": 3.729139072847682, "grad_norm": 0.9272345174728477, "learning_rate": 0.00014326074967406147, "loss": 1.5703, "step": 5631 }, { "epoch": 3.7298013245033115, "grad_norm": 0.9894624685316453, "learning_rate": 0.0001431940700252302, "loss": 1.8438, "step": 5632 }, { "epoch": 3.7304635761589404, "grad_norm": 0.9783515355177508, "learning_rate": 0.00014312739172405106, "loss": 1.5625, "step": 5633 }, { "epoch": 3.7311258278145694, "grad_norm": 1.11083942076743, "learning_rate": 0.00014306071478372722, "loss": 2.0156, "step": 5634 }, { "epoch": 3.7317880794701987, "grad_norm": 0.9797835951686907, "learning_rate": 0.00014299403921746136, "loss": 1.6797, "step": 5635 }, { "epoch": 3.7324503311258277, "grad_norm": 0.9416095701021461, "learning_rate": 0.00014292736503845606, "loss": 1.6797, "step": 5636 }, { "epoch": 3.733112582781457, "grad_norm": 0.9940043599716795, "learning_rate": 0.00014286069225991356, "loss": 1.6641, "step": 5637 }, { "epoch": 3.733774834437086, "grad_norm": 0.9826100116067852, "learning_rate": 0.0001427940208950359, "loss": 1.7266, "step": 5638 }, { "epoch": 3.734437086092715, "grad_norm": 0.9451758549611984, "learning_rate": 0.0001427273509570246, "loss": 1.6719, "step": 5639 }, { "epoch": 3.7350993377483444, "grad_norm": 0.9703228818629791, "learning_rate": 0.00014266068245908132, "loss": 1.6875, "step": 5640 }, { "epoch": 3.7357615894039737, "grad_norm": 1.0106032018736029, "learning_rate": 0.00014259401541440698, "loss": 1.8438, "step": 5641 }, { "epoch": 3.7364238410596027, "grad_norm": 1.0404719289854627, "learning_rate": 0.0001425273498362025, "loss": 1.7578, "step": 5642 }, { "epoch": 3.7370860927152316, "grad_norm": 0.998132834745957, "learning_rate": 0.00014246068573766846, "loss": 1.7656, "step": 5643 }, { "epoch": 3.737748344370861, "grad_norm": 1.0336219051160884, "learning_rate": 0.00014239402313200508, "loss": 1.7344, "step": 5644 }, { "epoch": 3.73841059602649, "grad_norm": 0.9353352245417232, "learning_rate": 0.00014232736203241225, "loss": 1.7266, "step": 5645 }, { "epoch": 3.7390728476821193, "grad_norm": 0.9300243616148803, "learning_rate": 0.00014226070245208978, "loss": 1.5156, "step": 5646 }, { "epoch": 3.7397350993377483, "grad_norm": 0.9244521891266527, "learning_rate": 0.0001421940444042369, "loss": 1.6562, "step": 5647 }, { "epoch": 3.7403973509933772, "grad_norm": 0.7956324220321047, "learning_rate": 0.00014212738790205274, "loss": 1.3516, "step": 5648 }, { "epoch": 3.7410596026490066, "grad_norm": 0.9126502973640434, "learning_rate": 0.00014206073295873603, "loss": 1.5312, "step": 5649 }, { "epoch": 3.741721854304636, "grad_norm": 0.9586999478373044, "learning_rate": 0.00014199407958748525, "loss": 1.6016, "step": 5650 }, { "epoch": 3.742384105960265, "grad_norm": 0.9777055959622017, "learning_rate": 0.00014192742780149843, "loss": 1.6797, "step": 5651 }, { "epoch": 3.743046357615894, "grad_norm": 0.9853199011330255, "learning_rate": 0.00014186077761397348, "loss": 1.6719, "step": 5652 }, { "epoch": 3.7437086092715233, "grad_norm": 0.9662368713866214, "learning_rate": 0.0001417941290381079, "loss": 1.6484, "step": 5653 }, { "epoch": 3.744370860927152, "grad_norm": 1.0773504417427564, "learning_rate": 0.0001417274820870988, "loss": 1.9141, "step": 5654 }, { "epoch": 3.7450331125827816, "grad_norm": 0.9815264205569884, "learning_rate": 0.00014166083677414316, "loss": 1.6484, "step": 5655 }, { "epoch": 3.7456953642384105, "grad_norm": 0.9972954942664096, "learning_rate": 0.0001415941931124374, "loss": 1.7266, "step": 5656 }, { "epoch": 3.7463576158940395, "grad_norm": 0.9799990803632999, "learning_rate": 0.00014152755111517776, "loss": 1.6797, "step": 5657 }, { "epoch": 3.747019867549669, "grad_norm": 1.0469392724177857, "learning_rate": 0.00014146091079556014, "loss": 1.8672, "step": 5658 }, { "epoch": 3.7476821192052983, "grad_norm": 1.0194287665751198, "learning_rate": 0.00014139427216678012, "loss": 1.8203, "step": 5659 }, { "epoch": 3.748344370860927, "grad_norm": 0.9181467112141901, "learning_rate": 0.0001413276352420328, "loss": 1.6719, "step": 5660 }, { "epoch": 3.749006622516556, "grad_norm": 0.949606732470327, "learning_rate": 0.0001412610000345132, "loss": 1.6875, "step": 5661 }, { "epoch": 3.7496688741721855, "grad_norm": 0.9410252988020519, "learning_rate": 0.00014119436655741575, "loss": 1.6094, "step": 5662 }, { "epoch": 3.7503311258278145, "grad_norm": 0.9825457443446158, "learning_rate": 0.00014112773482393467, "loss": 1.75, "step": 5663 }, { "epoch": 3.750993377483444, "grad_norm": 0.9440412432588978, "learning_rate": 0.00014106110484726382, "loss": 1.8125, "step": 5664 }, { "epoch": 3.751655629139073, "grad_norm": 0.863725226358742, "learning_rate": 0.00014099447664059673, "loss": 1.4688, "step": 5665 }, { "epoch": 3.7523178807947017, "grad_norm": 1.0028840702324409, "learning_rate": 0.0001409278502171264, "loss": 1.8047, "step": 5666 }, { "epoch": 3.752980132450331, "grad_norm": 1.054399273564284, "learning_rate": 0.00014086122559004585, "loss": 1.7188, "step": 5667 }, { "epoch": 3.7536423841059605, "grad_norm": 0.8163596490350989, "learning_rate": 0.00014079460277254736, "loss": 1.3672, "step": 5668 }, { "epoch": 3.7543046357615895, "grad_norm": 1.0618980530904467, "learning_rate": 0.00014072798177782302, "loss": 1.9688, "step": 5669 }, { "epoch": 3.7549668874172184, "grad_norm": 0.8952504387012096, "learning_rate": 0.00014066136261906465, "loss": 1.6094, "step": 5670 }, { "epoch": 3.755629139072848, "grad_norm": 0.9223267227442431, "learning_rate": 0.00014059474530946355, "loss": 1.5703, "step": 5671 }, { "epoch": 3.7562913907284767, "grad_norm": 1.0095120967015605, "learning_rate": 0.00014052812986221065, "loss": 1.7734, "step": 5672 }, { "epoch": 3.756953642384106, "grad_norm": 1.0827830327076555, "learning_rate": 0.00014046151629049667, "loss": 2.0625, "step": 5673 }, { "epoch": 3.757615894039735, "grad_norm": 0.941133340300096, "learning_rate": 0.0001403949046075118, "loss": 1.5781, "step": 5674 }, { "epoch": 3.758278145695364, "grad_norm": 0.9630145732901069, "learning_rate": 0.00014032829482644597, "loss": 1.6562, "step": 5675 }, { "epoch": 3.7589403973509934, "grad_norm": 1.0327799604157473, "learning_rate": 0.00014026168696048858, "loss": 1.8047, "step": 5676 }, { "epoch": 3.7596026490066228, "grad_norm": 0.9055650407301284, "learning_rate": 0.0001401950810228289, "loss": 1.5156, "step": 5677 }, { "epoch": 3.7602649006622517, "grad_norm": 1.1233065021637858, "learning_rate": 0.00014012847702665553, "loss": 1.9531, "step": 5678 }, { "epoch": 3.7609271523178807, "grad_norm": 1.0491289542925442, "learning_rate": 0.0001400618749851569, "loss": 1.7891, "step": 5679 }, { "epoch": 3.76158940397351, "grad_norm": 0.961104917441985, "learning_rate": 0.00013999527491152095, "loss": 1.6875, "step": 5680 }, { "epoch": 3.762251655629139, "grad_norm": 1.0077368971253076, "learning_rate": 0.0001399286768189353, "loss": 1.7891, "step": 5681 }, { "epoch": 3.7629139072847684, "grad_norm": 1.0042633887930412, "learning_rate": 0.00013986208072058707, "loss": 1.7734, "step": 5682 }, { "epoch": 3.7635761589403973, "grad_norm": 0.8589847078877201, "learning_rate": 0.0001397954866296631, "loss": 1.4141, "step": 5683 }, { "epoch": 3.7642384105960263, "grad_norm": 1.0018422344688633, "learning_rate": 0.00013972889455934976, "loss": 1.5625, "step": 5684 }, { "epoch": 3.7649006622516556, "grad_norm": 0.939286224245937, "learning_rate": 0.00013966230452283304, "loss": 1.6094, "step": 5685 }, { "epoch": 3.765562913907285, "grad_norm": 0.892072243592512, "learning_rate": 0.00013959571653329855, "loss": 1.6328, "step": 5686 }, { "epoch": 3.766225165562914, "grad_norm": 0.9856769682634343, "learning_rate": 0.0001395291306039315, "loss": 1.6641, "step": 5687 }, { "epoch": 3.766887417218543, "grad_norm": 0.9568813037262116, "learning_rate": 0.00013946254674791658, "loss": 1.6094, "step": 5688 }, { "epoch": 3.7675496688741723, "grad_norm": 0.9896710300184778, "learning_rate": 0.00013939596497843825, "loss": 1.6406, "step": 5689 }, { "epoch": 3.7682119205298013, "grad_norm": 0.9445385746993286, "learning_rate": 0.0001393293853086804, "loss": 1.625, "step": 5690 }, { "epoch": 3.7688741721854306, "grad_norm": 1.0133760765212727, "learning_rate": 0.0001392628077518266, "loss": 1.8359, "step": 5691 }, { "epoch": 3.7695364238410596, "grad_norm": 1.0071765208390107, "learning_rate": 0.00013919623232106, "loss": 1.7344, "step": 5692 }, { "epoch": 3.7701986754966885, "grad_norm": 0.9703738590723116, "learning_rate": 0.00013912965902956325, "loss": 1.625, "step": 5693 }, { "epoch": 3.770860927152318, "grad_norm": 0.9923639914237589, "learning_rate": 0.0001390630878905186, "loss": 1.7344, "step": 5694 }, { "epoch": 3.7715231788079473, "grad_norm": 1.0083452971202145, "learning_rate": 0.00013899651891710797, "loss": 1.7578, "step": 5695 }, { "epoch": 3.7721854304635762, "grad_norm": 1.0150064655603623, "learning_rate": 0.00013892995212251276, "loss": 1.7266, "step": 5696 }, { "epoch": 3.772847682119205, "grad_norm": 0.9279819379467651, "learning_rate": 0.00013886338751991387, "loss": 1.6719, "step": 5697 }, { "epoch": 3.7735099337748346, "grad_norm": 1.056865820097976, "learning_rate": 0.00013879682512249198, "loss": 1.6797, "step": 5698 }, { "epoch": 3.7741721854304635, "grad_norm": 0.9444921512427924, "learning_rate": 0.00013873026494342716, "loss": 1.625, "step": 5699 }, { "epoch": 3.774834437086093, "grad_norm": 1.0709212671201553, "learning_rate": 0.00013866370699589905, "loss": 1.7656, "step": 5700 }, { "epoch": 3.775496688741722, "grad_norm": 0.9082300104270435, "learning_rate": 0.00013859715129308696, "loss": 1.4922, "step": 5701 }, { "epoch": 3.776158940397351, "grad_norm": 0.9542834236978023, "learning_rate": 0.00013853059784816967, "loss": 1.5859, "step": 5702 }, { "epoch": 3.77682119205298, "grad_norm": 1.0728101835737411, "learning_rate": 0.00013846404667432538, "loss": 1.8359, "step": 5703 }, { "epoch": 3.777483443708609, "grad_norm": 1.0591772128029, "learning_rate": 0.00013839749778473223, "loss": 1.6641, "step": 5704 }, { "epoch": 3.7781456953642385, "grad_norm": 1.0358660860675137, "learning_rate": 0.00013833095119256749, "loss": 1.75, "step": 5705 }, { "epoch": 3.7788079470198674, "grad_norm": 1.0325599126047047, "learning_rate": 0.00013826440691100817, "loss": 1.6797, "step": 5706 }, { "epoch": 3.779470198675497, "grad_norm": 1.0399512326645468, "learning_rate": 0.00013819786495323084, "loss": 1.6172, "step": 5707 }, { "epoch": 3.7801324503311258, "grad_norm": 0.9781114398453498, "learning_rate": 0.00013813132533241161, "loss": 1.6328, "step": 5708 }, { "epoch": 3.780794701986755, "grad_norm": 0.9855326889046522, "learning_rate": 0.00013806478806172595, "loss": 1.5938, "step": 5709 }, { "epoch": 3.781456953642384, "grad_norm": 1.0201773486335914, "learning_rate": 0.00013799825315434914, "loss": 1.7344, "step": 5710 }, { "epoch": 3.782119205298013, "grad_norm": 0.9371631054935312, "learning_rate": 0.00013793172062345578, "loss": 1.6406, "step": 5711 }, { "epoch": 3.7827814569536424, "grad_norm": 0.9428023034748756, "learning_rate": 0.0001378651904822201, "loss": 1.6484, "step": 5712 }, { "epoch": 3.7834437086092714, "grad_norm": 0.9562675729634152, "learning_rate": 0.0001377986627438158, "loss": 1.5547, "step": 5713 }, { "epoch": 3.7841059602649008, "grad_norm": 0.8808872507289454, "learning_rate": 0.00013773213742141624, "loss": 1.4297, "step": 5714 }, { "epoch": 3.7847682119205297, "grad_norm": 1.0909838555229883, "learning_rate": 0.00013766561452819404, "loss": 1.9531, "step": 5715 }, { "epoch": 3.785430463576159, "grad_norm": 1.1484771048173754, "learning_rate": 0.00013759909407732158, "loss": 2.2969, "step": 5716 }, { "epoch": 3.786092715231788, "grad_norm": 1.0203841321488, "learning_rate": 0.0001375325760819707, "loss": 1.6953, "step": 5717 }, { "epoch": 3.7867549668874174, "grad_norm": 0.9628254838927495, "learning_rate": 0.00013746606055531265, "loss": 1.6484, "step": 5718 }, { "epoch": 3.7874172185430464, "grad_norm": 1.0106842071481297, "learning_rate": 0.00013739954751051836, "loss": 1.7422, "step": 5719 }, { "epoch": 3.7880794701986753, "grad_norm": 0.9536315792364277, "learning_rate": 0.00013733303696075808, "loss": 1.6406, "step": 5720 }, { "epoch": 3.7887417218543047, "grad_norm": 0.9162744150712108, "learning_rate": 0.00013726652891920171, "loss": 1.6094, "step": 5721 }, { "epoch": 3.7894039735099336, "grad_norm": 0.9748402605860955, "learning_rate": 0.00013720002339901862, "loss": 1.8281, "step": 5722 }, { "epoch": 3.790066225165563, "grad_norm": 0.7916566815676708, "learning_rate": 0.00013713352041337765, "loss": 1.2656, "step": 5723 }, { "epoch": 3.790728476821192, "grad_norm": 0.9344195977762946, "learning_rate": 0.00013706701997544704, "loss": 1.7109, "step": 5724 }, { "epoch": 3.7913907284768213, "grad_norm": 0.8938308065855307, "learning_rate": 0.0001370005220983949, "loss": 1.6016, "step": 5725 }, { "epoch": 3.7920529801324503, "grad_norm": 0.881142606188734, "learning_rate": 0.00013693402679538835, "loss": 1.6172, "step": 5726 }, { "epoch": 3.7927152317880797, "grad_norm": 1.017744141979622, "learning_rate": 0.00013686753407959426, "loss": 1.7812, "step": 5727 }, { "epoch": 3.7933774834437086, "grad_norm": 0.9836010370691388, "learning_rate": 0.000136801043964179, "loss": 1.4844, "step": 5728 }, { "epoch": 3.7940397350993376, "grad_norm": 1.123892077746329, "learning_rate": 0.00013673455646230842, "loss": 2.0156, "step": 5729 }, { "epoch": 3.794701986754967, "grad_norm": 0.9526271989357165, "learning_rate": 0.0001366680715871477, "loss": 1.6094, "step": 5730 }, { "epoch": 3.795364238410596, "grad_norm": 0.8911064896091004, "learning_rate": 0.0001366015893518616, "loss": 1.4766, "step": 5731 }, { "epoch": 3.7960264900662253, "grad_norm": 1.023839038526304, "learning_rate": 0.00013653510976961448, "loss": 1.7109, "step": 5732 }, { "epoch": 3.796688741721854, "grad_norm": 1.0803501312572128, "learning_rate": 0.00013646863285357003, "loss": 1.7578, "step": 5733 }, { "epoch": 3.7973509933774836, "grad_norm": 0.9897947708568728, "learning_rate": 0.00013640215861689128, "loss": 1.8828, "step": 5734 }, { "epoch": 3.7980132450331126, "grad_norm": 1.0981155807569185, "learning_rate": 0.00013633568707274114, "loss": 1.9453, "step": 5735 }, { "epoch": 3.798675496688742, "grad_norm": 0.9383758702936661, "learning_rate": 0.00013626921823428156, "loss": 1.5625, "step": 5736 }, { "epoch": 3.799337748344371, "grad_norm": 0.95693712344548, "learning_rate": 0.00013620275211467415, "loss": 1.6875, "step": 5737 }, { "epoch": 3.8, "grad_norm": 0.971831467738029, "learning_rate": 0.00013613628872708006, "loss": 1.7656, "step": 5738 }, { "epoch": 3.800662251655629, "grad_norm": 0.9231508566210885, "learning_rate": 0.00013606982808465976, "loss": 1.6328, "step": 5739 }, { "epoch": 3.801324503311258, "grad_norm": 0.9649222469752964, "learning_rate": 0.00013600337020057312, "loss": 1.6094, "step": 5740 }, { "epoch": 3.8019867549668875, "grad_norm": 0.8832598447323994, "learning_rate": 0.00013593691508797972, "loss": 1.5, "step": 5741 }, { "epoch": 3.8026490066225165, "grad_norm": 1.0198826404788734, "learning_rate": 0.00013587046276003836, "loss": 1.8359, "step": 5742 }, { "epoch": 3.8033112582781454, "grad_norm": 0.9476413823835215, "learning_rate": 0.00013580401322990733, "loss": 1.5859, "step": 5743 }, { "epoch": 3.803973509933775, "grad_norm": 0.9483183820247099, "learning_rate": 0.00013573756651074443, "loss": 1.6484, "step": 5744 }, { "epoch": 3.804635761589404, "grad_norm": 0.8537848353306242, "learning_rate": 0.00013567112261570692, "loss": 1.4062, "step": 5745 }, { "epoch": 3.805298013245033, "grad_norm": 0.9334984580580629, "learning_rate": 0.00013560468155795135, "loss": 1.6328, "step": 5746 }, { "epoch": 3.805960264900662, "grad_norm": 0.9610968453009481, "learning_rate": 0.0001355382433506339, "loss": 1.5938, "step": 5747 }, { "epoch": 3.8066225165562915, "grad_norm": 0.91203956856607, "learning_rate": 0.00013547180800691007, "loss": 1.5312, "step": 5748 }, { "epoch": 3.8072847682119204, "grad_norm": 0.9830084941334887, "learning_rate": 0.00013540537553993477, "loss": 1.6484, "step": 5749 }, { "epoch": 3.80794701986755, "grad_norm": 1.0499276764915741, "learning_rate": 0.00013533894596286252, "loss": 1.8438, "step": 5750 }, { "epoch": 3.8086092715231787, "grad_norm": 0.8836277757580524, "learning_rate": 0.000135272519288847, "loss": 1.4531, "step": 5751 }, { "epoch": 3.8092715231788077, "grad_norm": 1.0752627978238134, "learning_rate": 0.0001352060955310415, "loss": 1.7969, "step": 5752 }, { "epoch": 3.809933774834437, "grad_norm": 1.0158898445699451, "learning_rate": 0.0001351396747025987, "loss": 1.9531, "step": 5753 }, { "epoch": 3.8105960264900665, "grad_norm": 1.0127802233900909, "learning_rate": 0.00013507325681667067, "loss": 1.7734, "step": 5754 }, { "epoch": 3.8112582781456954, "grad_norm": 1.0480319364823467, "learning_rate": 0.0001350068418864089, "loss": 1.8516, "step": 5755 }, { "epoch": 3.8119205298013243, "grad_norm": 0.9138218201489762, "learning_rate": 0.00013494042992496442, "loss": 1.5312, "step": 5756 }, { "epoch": 3.8125827814569537, "grad_norm": 0.9301062370901482, "learning_rate": 0.0001348740209454874, "loss": 1.5078, "step": 5757 }, { "epoch": 3.8132450331125827, "grad_norm": 1.0198606091774272, "learning_rate": 0.00013480761496112762, "loss": 1.7656, "step": 5758 }, { "epoch": 3.813907284768212, "grad_norm": 0.8557286299139155, "learning_rate": 0.00013474121198503428, "loss": 1.4219, "step": 5759 }, { "epoch": 3.814569536423841, "grad_norm": 0.9573858866747225, "learning_rate": 0.00013467481203035595, "loss": 1.6406, "step": 5760 }, { "epoch": 3.81523178807947, "grad_norm": 0.9718778552723135, "learning_rate": 0.0001346084151102404, "loss": 1.5781, "step": 5761 }, { "epoch": 3.8158940397350993, "grad_norm": 0.9862821023605535, "learning_rate": 0.00013454202123783528, "loss": 1.6875, "step": 5762 }, { "epoch": 3.8165562913907287, "grad_norm": 0.9334446754601656, "learning_rate": 0.00013447563042628708, "loss": 1.6172, "step": 5763 }, { "epoch": 3.8172185430463577, "grad_norm": 0.974817216879401, "learning_rate": 0.00013440924268874206, "loss": 1.6562, "step": 5764 }, { "epoch": 3.8178807947019866, "grad_norm": 1.0578977318239806, "learning_rate": 0.00013434285803834572, "loss": 1.7188, "step": 5765 }, { "epoch": 3.818543046357616, "grad_norm": 1.0485375514928774, "learning_rate": 0.00013427647648824303, "loss": 1.7812, "step": 5766 }, { "epoch": 3.819205298013245, "grad_norm": 0.9300662014167557, "learning_rate": 0.00013421009805157817, "loss": 1.5312, "step": 5767 }, { "epoch": 3.8198675496688743, "grad_norm": 1.0821623397165827, "learning_rate": 0.00013414372274149502, "loss": 1.8906, "step": 5768 }, { "epoch": 3.8205298013245033, "grad_norm": 1.0399652774971395, "learning_rate": 0.00013407735057113652, "loss": 1.8672, "step": 5769 }, { "epoch": 3.821192052980132, "grad_norm": 0.8122532344545084, "learning_rate": 0.00013401098155364514, "loss": 1.3672, "step": 5770 }, { "epoch": 3.8218543046357616, "grad_norm": 1.0526021554578184, "learning_rate": 0.00013394461570216272, "loss": 1.8516, "step": 5771 }, { "epoch": 3.822516556291391, "grad_norm": 0.9447389305926531, "learning_rate": 0.0001338782530298305, "loss": 1.7266, "step": 5772 }, { "epoch": 3.82317880794702, "grad_norm": 1.0594615717732863, "learning_rate": 0.00013381189354978897, "loss": 1.7969, "step": 5773 }, { "epoch": 3.823841059602649, "grad_norm": 1.00006804971187, "learning_rate": 0.0001337455372751781, "loss": 1.5859, "step": 5774 }, { "epoch": 3.8245033112582782, "grad_norm": 0.7803282827523832, "learning_rate": 0.00013367918421913722, "loss": 1.3359, "step": 5775 }, { "epoch": 3.825165562913907, "grad_norm": 0.9063099270267305, "learning_rate": 0.00013361283439480495, "loss": 1.6484, "step": 5776 }, { "epoch": 3.8258278145695366, "grad_norm": 0.8350609777442217, "learning_rate": 0.0001335464878153194, "loss": 1.4766, "step": 5777 }, { "epoch": 3.8264900662251655, "grad_norm": 1.0030996858016865, "learning_rate": 0.00013348014449381787, "loss": 1.8125, "step": 5778 }, { "epoch": 3.8271523178807945, "grad_norm": 0.9710877033752598, "learning_rate": 0.0001334138044434371, "loss": 1.7656, "step": 5779 }, { "epoch": 3.827814569536424, "grad_norm": 0.9509175288003212, "learning_rate": 0.00013334746767731324, "loss": 1.7266, "step": 5780 }, { "epoch": 3.8284768211920532, "grad_norm": 0.9559609777899778, "learning_rate": 0.00013328113420858168, "loss": 1.6172, "step": 5781 }, { "epoch": 3.829139072847682, "grad_norm": 0.9405955291328864, "learning_rate": 0.00013321480405037718, "loss": 1.5781, "step": 5782 }, { "epoch": 3.829801324503311, "grad_norm": 0.8743275146132014, "learning_rate": 0.000133148477215834, "loss": 1.6016, "step": 5783 }, { "epoch": 3.8304635761589405, "grad_norm": 0.8799514484085708, "learning_rate": 0.00013308215371808549, "loss": 1.6328, "step": 5784 }, { "epoch": 3.8311258278145695, "grad_norm": 1.016599624345667, "learning_rate": 0.00013301583357026444, "loss": 1.8828, "step": 5785 }, { "epoch": 3.831788079470199, "grad_norm": 1.048670063429407, "learning_rate": 0.0001329495167855031, "loss": 1.8047, "step": 5786 }, { "epoch": 3.832450331125828, "grad_norm": 0.9489994891076399, "learning_rate": 0.00013288320337693298, "loss": 1.5547, "step": 5787 }, { "epoch": 3.8331125827814567, "grad_norm": 1.100992065568837, "learning_rate": 0.0001328168933576848, "loss": 1.8672, "step": 5788 }, { "epoch": 3.833774834437086, "grad_norm": 0.9956615078108998, "learning_rate": 0.00013275058674088868, "loss": 1.75, "step": 5789 }, { "epoch": 3.8344370860927155, "grad_norm": 1.0008777241587552, "learning_rate": 0.00013268428353967415, "loss": 1.5625, "step": 5790 }, { "epoch": 3.8350993377483444, "grad_norm": 0.9444321188884052, "learning_rate": 0.00013261798376717007, "loss": 1.6641, "step": 5791 }, { "epoch": 3.8357615894039734, "grad_norm": 1.0468922444149253, "learning_rate": 0.00013255168743650438, "loss": 1.6953, "step": 5792 }, { "epoch": 3.8364238410596028, "grad_norm": 0.889149022846279, "learning_rate": 0.00013248539456080469, "loss": 1.4453, "step": 5793 }, { "epoch": 3.8370860927152317, "grad_norm": 1.0077299520477745, "learning_rate": 0.00013241910515319767, "loss": 1.7109, "step": 5794 }, { "epoch": 3.837748344370861, "grad_norm": 1.0191500892496874, "learning_rate": 0.00013235281922680932, "loss": 1.7344, "step": 5795 }, { "epoch": 3.83841059602649, "grad_norm": 1.0595608007313757, "learning_rate": 0.00013228653679476515, "loss": 1.8828, "step": 5796 }, { "epoch": 3.839072847682119, "grad_norm": 1.0129762218529146, "learning_rate": 0.00013222025787018978, "loss": 1.7031, "step": 5797 }, { "epoch": 3.8397350993377484, "grad_norm": 0.8719262290535541, "learning_rate": 0.0001321539824662071, "loss": 1.5156, "step": 5798 }, { "epoch": 3.8403973509933778, "grad_norm": 0.8307836141053977, "learning_rate": 0.0001320877105959406, "loss": 1.3203, "step": 5799 }, { "epoch": 3.8410596026490067, "grad_norm": 0.9715188005003972, "learning_rate": 0.00013202144227251274, "loss": 1.6797, "step": 5800 }, { "epoch": 3.8417218543046356, "grad_norm": 1.0725347373714311, "learning_rate": 0.00013195517750904536, "loss": 1.7656, "step": 5801 }, { "epoch": 3.842384105960265, "grad_norm": 1.0191276107829323, "learning_rate": 0.00013188891631865977, "loss": 1.7969, "step": 5802 }, { "epoch": 3.843046357615894, "grad_norm": 0.8977150611532076, "learning_rate": 0.00013182265871447638, "loss": 1.3672, "step": 5803 }, { "epoch": 3.8437086092715234, "grad_norm": 0.9420445304668542, "learning_rate": 0.00013175640470961493, "loss": 1.5625, "step": 5804 }, { "epoch": 3.8443708609271523, "grad_norm": 0.9906768205028295, "learning_rate": 0.00013169015431719453, "loss": 1.6641, "step": 5805 }, { "epoch": 3.8450331125827812, "grad_norm": 0.9318403938997516, "learning_rate": 0.00013162390755033348, "loss": 1.5312, "step": 5806 }, { "epoch": 3.8456953642384106, "grad_norm": 0.8727805657192615, "learning_rate": 0.00013155766442214938, "loss": 1.5156, "step": 5807 }, { "epoch": 3.8463576158940396, "grad_norm": 1.0982128282010468, "learning_rate": 0.0001314914249457592, "loss": 1.9297, "step": 5808 }, { "epoch": 3.847019867549669, "grad_norm": 1.0576031700647566, "learning_rate": 0.0001314251891342791, "loss": 1.8516, "step": 5809 }, { "epoch": 3.847682119205298, "grad_norm": 0.9628591444325123, "learning_rate": 0.00013135895700082444, "loss": 1.6641, "step": 5810 }, { "epoch": 3.8483443708609273, "grad_norm": 0.9244436574679447, "learning_rate": 0.00013129272855851005, "loss": 1.5703, "step": 5811 }, { "epoch": 3.8490066225165562, "grad_norm": 0.976728360030882, "learning_rate": 0.00013122650382044984, "loss": 1.5625, "step": 5812 }, { "epoch": 3.8496688741721856, "grad_norm": 0.9952981797860961, "learning_rate": 0.00013116028279975712, "loss": 1.7188, "step": 5813 }, { "epoch": 3.8503311258278146, "grad_norm": 0.9721000097141832, "learning_rate": 0.00013109406550954446, "loss": 1.4453, "step": 5814 }, { "epoch": 3.8509933774834435, "grad_norm": 1.014108730709295, "learning_rate": 0.00013102785196292353, "loss": 1.7031, "step": 5815 }, { "epoch": 3.851655629139073, "grad_norm": 0.9593041523752063, "learning_rate": 0.00013096164217300536, "loss": 1.5781, "step": 5816 }, { "epoch": 3.852317880794702, "grad_norm": 1.0542042708130113, "learning_rate": 0.00013089543615290038, "loss": 1.9141, "step": 5817 }, { "epoch": 3.852980132450331, "grad_norm": 0.9557246524231798, "learning_rate": 0.0001308292339157181, "loss": 1.5469, "step": 5818 }, { "epoch": 3.85364238410596, "grad_norm": 0.9282879936423754, "learning_rate": 0.00013076303547456715, "loss": 1.5391, "step": 5819 }, { "epoch": 3.8543046357615895, "grad_norm": 1.0935065347961586, "learning_rate": 0.00013069684084255584, "loss": 1.7578, "step": 5820 }, { "epoch": 3.8549668874172185, "grad_norm": 0.9754163101470362, "learning_rate": 0.00013063065003279136, "loss": 1.7031, "step": 5821 }, { "epoch": 3.855629139072848, "grad_norm": 0.9676640778149372, "learning_rate": 0.00013056446305838014, "loss": 1.5625, "step": 5822 }, { "epoch": 3.856291390728477, "grad_norm": 1.0008248604068417, "learning_rate": 0.00013049827993242812, "loss": 1.7578, "step": 5823 }, { "epoch": 3.8569536423841058, "grad_norm": 0.9306254484276619, "learning_rate": 0.00013043210066804027, "loss": 1.4688, "step": 5824 }, { "epoch": 3.857615894039735, "grad_norm": 0.8605743067730766, "learning_rate": 0.00013036592527832075, "loss": 1.4453, "step": 5825 }, { "epoch": 3.858278145695364, "grad_norm": 1.0383352838103712, "learning_rate": 0.0001302997537763732, "loss": 1.6172, "step": 5826 }, { "epoch": 3.8589403973509935, "grad_norm": 1.0587724770920803, "learning_rate": 0.00013023358617530025, "loss": 1.8359, "step": 5827 }, { "epoch": 3.8596026490066224, "grad_norm": 0.9754182362862439, "learning_rate": 0.00013016742248820378, "loss": 1.5625, "step": 5828 }, { "epoch": 3.860264900662252, "grad_norm": 0.9727320257296663, "learning_rate": 0.0001301012627281851, "loss": 1.6719, "step": 5829 }, { "epoch": 3.8609271523178808, "grad_norm": 1.0423407276771637, "learning_rate": 0.00013003510690834453, "loss": 1.7734, "step": 5830 }, { "epoch": 3.86158940397351, "grad_norm": 1.0002689746998945, "learning_rate": 0.00012996895504178163, "loss": 1.7266, "step": 5831 }, { "epoch": 3.862251655629139, "grad_norm": 0.947248356988845, "learning_rate": 0.00012990280714159533, "loss": 1.7109, "step": 5832 }, { "epoch": 3.862913907284768, "grad_norm": 0.9193020070269123, "learning_rate": 0.0001298366632208836, "loss": 1.6484, "step": 5833 }, { "epoch": 3.8635761589403974, "grad_norm": 0.929468572661472, "learning_rate": 0.00012977052329274372, "loss": 1.5469, "step": 5834 }, { "epoch": 3.8642384105960264, "grad_norm": 1.0230020514441112, "learning_rate": 0.00012970438737027216, "loss": 1.7656, "step": 5835 }, { "epoch": 3.8649006622516557, "grad_norm": 0.9662214349565599, "learning_rate": 0.00012963825546656463, "loss": 1.7656, "step": 5836 }, { "epoch": 3.8655629139072847, "grad_norm": 1.0294159562360312, "learning_rate": 0.00012957212759471585, "loss": 1.8203, "step": 5837 }, { "epoch": 3.866225165562914, "grad_norm": 0.9728227647816327, "learning_rate": 0.00012950600376782008, "loss": 1.6562, "step": 5838 }, { "epoch": 3.866887417218543, "grad_norm": 0.8496198782771706, "learning_rate": 0.0001294398839989705, "loss": 1.4922, "step": 5839 }, { "epoch": 3.8675496688741724, "grad_norm": 0.9990526014455295, "learning_rate": 0.00012937376830125956, "loss": 1.8594, "step": 5840 }, { "epoch": 3.8682119205298013, "grad_norm": 1.0005052144183477, "learning_rate": 0.00012930765668777905, "loss": 1.7656, "step": 5841 }, { "epoch": 3.8688741721854303, "grad_norm": 1.0134860465495785, "learning_rate": 0.00012924154917161974, "loss": 1.6328, "step": 5842 }, { "epoch": 3.8695364238410597, "grad_norm": 0.9501853581674249, "learning_rate": 0.0001291754457658716, "loss": 1.6562, "step": 5843 }, { "epoch": 3.8701986754966886, "grad_norm": 0.9392465949302188, "learning_rate": 0.000129109346483624, "loss": 1.5391, "step": 5844 }, { "epoch": 3.870860927152318, "grad_norm": 0.9766935264149513, "learning_rate": 0.00012904325133796532, "loss": 1.625, "step": 5845 }, { "epoch": 3.871523178807947, "grad_norm": 0.9997392811043313, "learning_rate": 0.0001289771603419831, "loss": 1.6875, "step": 5846 }, { "epoch": 3.872185430463576, "grad_norm": 0.9211031771490494, "learning_rate": 0.00012891107350876415, "loss": 1.5781, "step": 5847 }, { "epoch": 3.8728476821192053, "grad_norm": 1.0506552094682335, "learning_rate": 0.00012884499085139445, "loss": 1.7969, "step": 5848 }, { "epoch": 3.8735099337748347, "grad_norm": 1.0350422627669538, "learning_rate": 0.00012877891238295905, "loss": 1.8438, "step": 5849 }, { "epoch": 3.8741721854304636, "grad_norm": 1.0429563806091422, "learning_rate": 0.00012871283811654232, "loss": 1.7109, "step": 5850 }, { "epoch": 3.8748344370860925, "grad_norm": 1.1129951296119285, "learning_rate": 0.0001286467680652277, "loss": 1.9766, "step": 5851 }, { "epoch": 3.875496688741722, "grad_norm": 1.0690717537303676, "learning_rate": 0.00012858070224209782, "loss": 1.8516, "step": 5852 }, { "epoch": 3.876158940397351, "grad_norm": 0.9170043640436769, "learning_rate": 0.0001285146406602344, "loss": 1.6797, "step": 5853 }, { "epoch": 3.8768211920529803, "grad_norm": 1.09998301954349, "learning_rate": 0.0001284485833327185, "loss": 1.9922, "step": 5854 }, { "epoch": 3.877483443708609, "grad_norm": 0.9953467382613767, "learning_rate": 0.00012838253027263021, "loss": 1.9062, "step": 5855 }, { "epoch": 3.878145695364238, "grad_norm": 0.957293347289259, "learning_rate": 0.00012831648149304867, "loss": 1.7656, "step": 5856 }, { "epoch": 3.8788079470198675, "grad_norm": 0.9369865366557445, "learning_rate": 0.00012825043700705245, "loss": 1.8516, "step": 5857 }, { "epoch": 3.879470198675497, "grad_norm": 0.8740387815601298, "learning_rate": 0.00012818439682771906, "loss": 1.4375, "step": 5858 }, { "epoch": 3.880132450331126, "grad_norm": 0.9748721026645389, "learning_rate": 0.0001281183609681252, "loss": 1.6172, "step": 5859 }, { "epoch": 3.880794701986755, "grad_norm": 0.9648312107805943, "learning_rate": 0.00012805232944134668, "loss": 1.5781, "step": 5860 }, { "epoch": 3.881456953642384, "grad_norm": 1.0169843003840577, "learning_rate": 0.00012798630226045865, "loss": 1.8359, "step": 5861 }, { "epoch": 3.882119205298013, "grad_norm": 0.950052281545286, "learning_rate": 0.00012792027943853504, "loss": 1.7109, "step": 5862 }, { "epoch": 3.8827814569536425, "grad_norm": 0.9227243137423408, "learning_rate": 0.0001278542609886493, "loss": 1.5469, "step": 5863 }, { "epoch": 3.8834437086092715, "grad_norm": 1.036666229188495, "learning_rate": 0.0001277882469238738, "loss": 1.875, "step": 5864 }, { "epoch": 3.8841059602649004, "grad_norm": 0.96127561131324, "learning_rate": 0.00012772223725728, "loss": 1.6562, "step": 5865 }, { "epoch": 3.88476821192053, "grad_norm": 0.8542816196165587, "learning_rate": 0.00012765623200193866, "loss": 1.3906, "step": 5866 }, { "epoch": 3.885430463576159, "grad_norm": 0.920788879607266, "learning_rate": 0.00012759023117091958, "loss": 1.4922, "step": 5867 }, { "epoch": 3.886092715231788, "grad_norm": 1.0282958755440192, "learning_rate": 0.00012752423477729158, "loss": 1.7344, "step": 5868 }, { "epoch": 3.886754966887417, "grad_norm": 0.9867654313763583, "learning_rate": 0.0001274582428341228, "loss": 1.6953, "step": 5869 }, { "epoch": 3.8874172185430464, "grad_norm": 1.0467102657211078, "learning_rate": 0.0001273922553544804, "loss": 1.9844, "step": 5870 }, { "epoch": 3.8880794701986754, "grad_norm": 0.869763609556392, "learning_rate": 0.00012732627235143056, "loss": 1.4297, "step": 5871 }, { "epoch": 3.888741721854305, "grad_norm": 1.0545068625668788, "learning_rate": 0.00012726029383803883, "loss": 1.7578, "step": 5872 }, { "epoch": 3.8894039735099337, "grad_norm": 0.9842961714746176, "learning_rate": 0.00012719431982736957, "loss": 1.6406, "step": 5873 }, { "epoch": 3.8900662251655627, "grad_norm": 1.004152845974112, "learning_rate": 0.00012712835033248642, "loss": 1.6641, "step": 5874 }, { "epoch": 3.890728476821192, "grad_norm": 0.964149595314734, "learning_rate": 0.00012706238536645215, "loss": 1.7188, "step": 5875 }, { "epoch": 3.8913907284768214, "grad_norm": 0.9131544370962704, "learning_rate": 0.00012699642494232855, "loss": 1.5859, "step": 5876 }, { "epoch": 3.8920529801324504, "grad_norm": 1.0021870372446064, "learning_rate": 0.0001269304690731765, "loss": 1.8203, "step": 5877 }, { "epoch": 3.8927152317880793, "grad_norm": 0.9783535160857425, "learning_rate": 0.00012686451777205613, "loss": 1.6094, "step": 5878 }, { "epoch": 3.8933774834437087, "grad_norm": 0.8709895770505574, "learning_rate": 0.00012679857105202645, "loss": 1.3906, "step": 5879 }, { "epoch": 3.8940397350993377, "grad_norm": 1.0268328740158517, "learning_rate": 0.00012673262892614565, "loss": 1.9141, "step": 5880 }, { "epoch": 3.894701986754967, "grad_norm": 0.9372645488307642, "learning_rate": 0.00012666669140747115, "loss": 1.4688, "step": 5881 }, { "epoch": 3.895364238410596, "grad_norm": 0.8629146653294747, "learning_rate": 0.00012660075850905926, "loss": 1.3828, "step": 5882 }, { "epoch": 3.896026490066225, "grad_norm": 0.9777150611349158, "learning_rate": 0.00012653483024396533, "loss": 1.7344, "step": 5883 }, { "epoch": 3.8966887417218543, "grad_norm": 0.9929216255709661, "learning_rate": 0.00012646890662524414, "loss": 1.5938, "step": 5884 }, { "epoch": 3.8973509933774837, "grad_norm": 1.1211016784698822, "learning_rate": 0.0001264029876659492, "loss": 1.8359, "step": 5885 }, { "epoch": 3.8980132450331126, "grad_norm": 1.072560670779608, "learning_rate": 0.0001263370733791332, "loss": 1.8047, "step": 5886 }, { "epoch": 3.8986754966887416, "grad_norm": 1.0068384054783504, "learning_rate": 0.00012627116377784798, "loss": 1.6562, "step": 5887 }, { "epoch": 3.899337748344371, "grad_norm": 1.0653065836067976, "learning_rate": 0.00012620525887514439, "loss": 1.8125, "step": 5888 }, { "epoch": 3.9, "grad_norm": 1.0283730253106607, "learning_rate": 0.00012613935868407226, "loss": 1.6953, "step": 5889 }, { "epoch": 3.9006622516556293, "grad_norm": 1.0458955382640824, "learning_rate": 0.00012607346321768078, "loss": 1.8125, "step": 5890 }, { "epoch": 3.9013245033112582, "grad_norm": 0.9267144549066699, "learning_rate": 0.00012600757248901784, "loss": 1.6094, "step": 5891 }, { "epoch": 3.901986754966887, "grad_norm": 1.0351049928685543, "learning_rate": 0.0001259416865111306, "loss": 1.8281, "step": 5892 }, { "epoch": 3.9026490066225166, "grad_norm": 0.9135677889404842, "learning_rate": 0.00012587580529706523, "loss": 1.4297, "step": 5893 }, { "epoch": 3.903311258278146, "grad_norm": 0.9449771234070313, "learning_rate": 0.00012580992885986706, "loss": 1.6953, "step": 5894 }, { "epoch": 3.903973509933775, "grad_norm": 0.8346852287918528, "learning_rate": 0.00012574405721258025, "loss": 1.4062, "step": 5895 }, { "epoch": 3.904635761589404, "grad_norm": 0.9715417089160522, "learning_rate": 0.0001256781903682482, "loss": 1.7188, "step": 5896 }, { "epoch": 3.9052980132450332, "grad_norm": 0.9947939215971671, "learning_rate": 0.00012561232833991333, "loss": 1.5859, "step": 5897 }, { "epoch": 3.905960264900662, "grad_norm": 0.9793867250789362, "learning_rate": 0.00012554647114061698, "loss": 1.5859, "step": 5898 }, { "epoch": 3.9066225165562916, "grad_norm": 1.048347069842256, "learning_rate": 0.0001254806187833998, "loss": 1.7969, "step": 5899 }, { "epoch": 3.9072847682119205, "grad_norm": 1.0826426274022765, "learning_rate": 0.00012541477128130113, "loss": 1.9219, "step": 5900 }, { "epoch": 3.9079470198675494, "grad_norm": 0.9690639843832329, "learning_rate": 0.0001253489286473596, "loss": 1.5312, "step": 5901 }, { "epoch": 3.908609271523179, "grad_norm": 1.048630131969388, "learning_rate": 0.0001252830908946128, "loss": 1.7656, "step": 5902 }, { "epoch": 3.909271523178808, "grad_norm": 1.0125384441146907, "learning_rate": 0.00012521725803609743, "loss": 1.6484, "step": 5903 }, { "epoch": 3.909933774834437, "grad_norm": 1.0592439975757326, "learning_rate": 0.00012515143008484908, "loss": 1.8984, "step": 5904 }, { "epoch": 3.910596026490066, "grad_norm": 1.0347052524874842, "learning_rate": 0.0001250856070539024, "loss": 1.8594, "step": 5905 }, { "epoch": 3.9112582781456955, "grad_norm": 0.9763155137248515, "learning_rate": 0.00012501978895629122, "loss": 1.6953, "step": 5906 }, { "epoch": 3.9119205298013244, "grad_norm": 0.9430461396406326, "learning_rate": 0.0001249539758050482, "loss": 1.5859, "step": 5907 }, { "epoch": 3.912582781456954, "grad_norm": 0.9297064875882691, "learning_rate": 0.00012488816761320508, "loss": 1.4531, "step": 5908 }, { "epoch": 3.9132450331125828, "grad_norm": 1.0624510206137043, "learning_rate": 0.00012482236439379275, "loss": 1.8125, "step": 5909 }, { "epoch": 3.9139072847682117, "grad_norm": 0.9874905536719288, "learning_rate": 0.00012475656615984086, "loss": 1.6094, "step": 5910 }, { "epoch": 3.914569536423841, "grad_norm": 0.8719244310316486, "learning_rate": 0.00012469077292437826, "loss": 1.5625, "step": 5911 }, { "epoch": 3.91523178807947, "grad_norm": 0.9836125894637188, "learning_rate": 0.0001246249847004328, "loss": 1.5703, "step": 5912 }, { "epoch": 3.9158940397350994, "grad_norm": 0.8982377513872685, "learning_rate": 0.00012455920150103135, "loss": 1.4219, "step": 5913 }, { "epoch": 3.9165562913907284, "grad_norm": 0.8948373727141572, "learning_rate": 0.00012449342333919954, "loss": 1.4609, "step": 5914 }, { "epoch": 3.9172185430463577, "grad_norm": 1.0581692211663396, "learning_rate": 0.00012442765022796246, "loss": 1.8594, "step": 5915 }, { "epoch": 3.9178807947019867, "grad_norm": 0.9364056756042234, "learning_rate": 0.00012436188218034377, "loss": 1.5312, "step": 5916 }, { "epoch": 3.918543046357616, "grad_norm": 0.9663274615176431, "learning_rate": 0.00012429611920936629, "loss": 1.6172, "step": 5917 }, { "epoch": 3.919205298013245, "grad_norm": 1.0397212101636628, "learning_rate": 0.00012423036132805192, "loss": 1.6484, "step": 5918 }, { "epoch": 3.919867549668874, "grad_norm": 1.06424461569124, "learning_rate": 0.0001241646085494215, "loss": 1.8594, "step": 5919 }, { "epoch": 3.9205298013245033, "grad_norm": 0.9810239936307137, "learning_rate": 0.0001240988608864947, "loss": 1.5156, "step": 5920 }, { "epoch": 3.9211920529801323, "grad_norm": 1.1091338189020195, "learning_rate": 0.00012403311835229043, "loss": 1.8438, "step": 5921 }, { "epoch": 3.9218543046357617, "grad_norm": 0.9969446338102435, "learning_rate": 0.00012396738095982646, "loss": 1.6406, "step": 5922 }, { "epoch": 3.9225165562913906, "grad_norm": 0.9563424963387369, "learning_rate": 0.00012390164872211944, "loss": 1.6875, "step": 5923 }, { "epoch": 3.92317880794702, "grad_norm": 0.9985475950276158, "learning_rate": 0.00012383592165218522, "loss": 1.6719, "step": 5924 }, { "epoch": 3.923841059602649, "grad_norm": 1.0030571849737733, "learning_rate": 0.00012377019976303854, "loss": 1.8203, "step": 5925 }, { "epoch": 3.9245033112582783, "grad_norm": 1.0023968425257104, "learning_rate": 0.00012370448306769292, "loss": 1.8438, "step": 5926 }, { "epoch": 3.9251655629139073, "grad_norm": 0.9814640043562964, "learning_rate": 0.00012363877157916116, "loss": 1.5703, "step": 5927 }, { "epoch": 3.9258278145695362, "grad_norm": 0.9219190252015685, "learning_rate": 0.00012357306531045483, "loss": 1.4219, "step": 5928 }, { "epoch": 3.9264900662251656, "grad_norm": 1.0046704231576273, "learning_rate": 0.00012350736427458453, "loss": 1.6328, "step": 5929 }, { "epoch": 3.9271523178807946, "grad_norm": 1.1287223928500771, "learning_rate": 0.00012344166848455987, "loss": 1.9609, "step": 5930 }, { "epoch": 3.927814569536424, "grad_norm": 1.0293598361819116, "learning_rate": 0.00012337597795338936, "loss": 1.7578, "step": 5931 }, { "epoch": 3.928476821192053, "grad_norm": 0.9281624602392251, "learning_rate": 0.0001233102926940804, "loss": 1.625, "step": 5932 }, { "epoch": 3.9291390728476823, "grad_norm": 1.0896770117777286, "learning_rate": 0.00012324461271963949, "loss": 1.8828, "step": 5933 }, { "epoch": 3.929801324503311, "grad_norm": 0.9079346473691484, "learning_rate": 0.000123178938043072, "loss": 1.5234, "step": 5934 }, { "epoch": 3.9304635761589406, "grad_norm": 0.950803156088209, "learning_rate": 0.00012311326867738226, "loss": 1.5234, "step": 5935 }, { "epoch": 3.9311258278145695, "grad_norm": 0.9729264769664621, "learning_rate": 0.00012304760463557359, "loss": 1.7031, "step": 5936 }, { "epoch": 3.9317880794701985, "grad_norm": 1.044359922820051, "learning_rate": 0.0001229819459306482, "loss": 1.7656, "step": 5937 }, { "epoch": 3.932450331125828, "grad_norm": 1.0102225825056053, "learning_rate": 0.00012291629257560723, "loss": 1.6875, "step": 5938 }, { "epoch": 3.933112582781457, "grad_norm": 1.0713630773780578, "learning_rate": 0.00012285064458345086, "loss": 1.8516, "step": 5939 }, { "epoch": 3.933774834437086, "grad_norm": 1.0214244574864582, "learning_rate": 0.00012278500196717817, "loss": 1.7188, "step": 5940 }, { "epoch": 3.934437086092715, "grad_norm": 1.0500488854471932, "learning_rate": 0.00012271936473978699, "loss": 1.7891, "step": 5941 }, { "epoch": 3.9350993377483445, "grad_norm": 0.9670262089761591, "learning_rate": 0.00012265373291427446, "loss": 1.625, "step": 5942 }, { "epoch": 3.9357615894039735, "grad_norm": 0.8897064329153318, "learning_rate": 0.00012258810650363629, "loss": 1.5, "step": 5943 }, { "epoch": 3.936423841059603, "grad_norm": 0.8195610557444866, "learning_rate": 0.00012252248552086723, "loss": 1.3203, "step": 5944 }, { "epoch": 3.937086092715232, "grad_norm": 1.0036867643688288, "learning_rate": 0.00012245686997896114, "loss": 1.6719, "step": 5945 }, { "epoch": 3.9377483443708607, "grad_norm": 0.9095383813866591, "learning_rate": 0.0001223912598909106, "loss": 1.4297, "step": 5946 }, { "epoch": 3.93841059602649, "grad_norm": 0.9776995773326178, "learning_rate": 0.000122325655269707, "loss": 1.5156, "step": 5947 }, { "epoch": 3.939072847682119, "grad_norm": 0.9488376480693868, "learning_rate": 0.00012226005612834103, "loss": 1.6875, "step": 5948 }, { "epoch": 3.9397350993377485, "grad_norm": 1.0010495663157601, "learning_rate": 0.00012219446247980196, "loss": 1.625, "step": 5949 }, { "epoch": 3.9403973509933774, "grad_norm": 0.9643902103222894, "learning_rate": 0.00012212887433707805, "loss": 1.5078, "step": 5950 }, { "epoch": 3.941059602649007, "grad_norm": 1.0643656736341296, "learning_rate": 0.0001220632917131566, "loss": 1.9609, "step": 5951 }, { "epoch": 3.9417218543046357, "grad_norm": 0.9874863803986343, "learning_rate": 0.0001219977146210237, "loss": 1.5625, "step": 5952 }, { "epoch": 3.942384105960265, "grad_norm": 1.0440571283964017, "learning_rate": 0.00012193214307366428, "loss": 1.6016, "step": 5953 }, { "epoch": 3.943046357615894, "grad_norm": 1.0033139474758266, "learning_rate": 0.00012186657708406236, "loss": 1.6562, "step": 5954 }, { "epoch": 3.943708609271523, "grad_norm": 0.8221849043276196, "learning_rate": 0.0001218010166652007, "loss": 1.2969, "step": 5955 }, { "epoch": 3.9443708609271524, "grad_norm": 0.9716002401389392, "learning_rate": 0.000121735461830061, "loss": 1.5234, "step": 5956 }, { "epoch": 3.9450331125827813, "grad_norm": 0.9974501994455075, "learning_rate": 0.00012166991259162392, "loss": 1.6094, "step": 5957 }, { "epoch": 3.9456953642384107, "grad_norm": 1.0216206300416164, "learning_rate": 0.000121604368962869, "loss": 1.7266, "step": 5958 }, { "epoch": 3.9463576158940397, "grad_norm": 0.9967694006653924, "learning_rate": 0.00012153883095677454, "loss": 1.6719, "step": 5959 }, { "epoch": 3.9470198675496686, "grad_norm": 1.0359284825958364, "learning_rate": 0.00012147329858631779, "loss": 1.8359, "step": 5960 }, { "epoch": 3.947682119205298, "grad_norm": 0.9633156652934463, "learning_rate": 0.00012140777186447502, "loss": 1.6172, "step": 5961 }, { "epoch": 3.9483443708609274, "grad_norm": 0.9452832282136642, "learning_rate": 0.00012134225080422124, "loss": 1.6719, "step": 5962 }, { "epoch": 3.9490066225165563, "grad_norm": 1.0042114035486978, "learning_rate": 0.00012127673541853028, "loss": 1.7422, "step": 5963 }, { "epoch": 3.9496688741721853, "grad_norm": 0.9415386021728477, "learning_rate": 0.00012121122572037506, "loss": 1.5469, "step": 5964 }, { "epoch": 3.9503311258278146, "grad_norm": 1.01445552286751, "learning_rate": 0.00012114572172272719, "loss": 1.6719, "step": 5965 }, { "epoch": 3.9509933774834436, "grad_norm": 0.7838841489783515, "learning_rate": 0.0001210802234385572, "loss": 1.3125, "step": 5966 }, { "epoch": 3.951655629139073, "grad_norm": 0.880522595695106, "learning_rate": 0.00012101473088083458, "loss": 1.5156, "step": 5967 }, { "epoch": 3.952317880794702, "grad_norm": 0.9835173692418814, "learning_rate": 0.00012094924406252752, "loss": 1.6016, "step": 5968 }, { "epoch": 3.952980132450331, "grad_norm": 1.0265704125492976, "learning_rate": 0.00012088376299660316, "loss": 1.7812, "step": 5969 }, { "epoch": 3.9536423841059603, "grad_norm": 1.0227762749913194, "learning_rate": 0.00012081828769602756, "loss": 1.5703, "step": 5970 }, { "epoch": 3.9543046357615896, "grad_norm": 0.9037523186626133, "learning_rate": 0.00012075281817376557, "loss": 1.4375, "step": 5971 }, { "epoch": 3.9549668874172186, "grad_norm": 0.9868944250775389, "learning_rate": 0.00012068735444278082, "loss": 1.6562, "step": 5972 }, { "epoch": 3.9556291390728475, "grad_norm": 0.9209851956532732, "learning_rate": 0.00012062189651603605, "loss": 1.5078, "step": 5973 }, { "epoch": 3.956291390728477, "grad_norm": 1.0208474925142401, "learning_rate": 0.00012055644440649255, "loss": 1.8438, "step": 5974 }, { "epoch": 3.956953642384106, "grad_norm": 0.8344083178918509, "learning_rate": 0.00012049099812711055, "loss": 1.3203, "step": 5975 }, { "epoch": 3.9576158940397352, "grad_norm": 1.0165818378408986, "learning_rate": 0.00012042555769084928, "loss": 1.7188, "step": 5976 }, { "epoch": 3.958278145695364, "grad_norm": 0.8743450452572842, "learning_rate": 0.00012036012311066669, "loss": 1.5078, "step": 5977 }, { "epoch": 3.958940397350993, "grad_norm": 0.9494047538185969, "learning_rate": 0.00012029469439951942, "loss": 1.625, "step": 5978 }, { "epoch": 3.9596026490066225, "grad_norm": 1.0440633321122885, "learning_rate": 0.00012022927157036333, "loss": 2.0, "step": 5979 }, { "epoch": 3.960264900662252, "grad_norm": 0.9011934978391383, "learning_rate": 0.00012016385463615275, "loss": 1.3984, "step": 5980 }, { "epoch": 3.960927152317881, "grad_norm": 1.045971981148548, "learning_rate": 0.00012009844360984096, "loss": 1.8594, "step": 5981 }, { "epoch": 3.96158940397351, "grad_norm": 0.9554913497313078, "learning_rate": 0.00012003303850438019, "loss": 1.6484, "step": 5982 }, { "epoch": 3.962251655629139, "grad_norm": 0.9611305299763574, "learning_rate": 0.0001199676393327214, "loss": 1.6016, "step": 5983 }, { "epoch": 3.962913907284768, "grad_norm": 0.9438569027705873, "learning_rate": 0.00011990224610781421, "loss": 1.6016, "step": 5984 }, { "epoch": 3.9635761589403975, "grad_norm": 0.957260968360181, "learning_rate": 0.00011983685884260747, "loss": 1.6484, "step": 5985 }, { "epoch": 3.9642384105960264, "grad_norm": 0.8249719930959608, "learning_rate": 0.00011977147755004844, "loss": 1.3438, "step": 5986 }, { "epoch": 3.9649006622516554, "grad_norm": 1.030002225196568, "learning_rate": 0.00011970610224308341, "loss": 1.8281, "step": 5987 }, { "epoch": 3.9655629139072848, "grad_norm": 0.9614518389287605, "learning_rate": 0.00011964073293465744, "loss": 1.5312, "step": 5988 }, { "epoch": 3.966225165562914, "grad_norm": 1.0197959521456355, "learning_rate": 0.00011957536963771445, "loss": 1.8125, "step": 5989 }, { "epoch": 3.966887417218543, "grad_norm": 1.0295459023892108, "learning_rate": 0.00011951001236519701, "loss": 1.9375, "step": 5990 }, { "epoch": 3.967549668874172, "grad_norm": 1.0102801695364851, "learning_rate": 0.00011944466113004672, "loss": 1.75, "step": 5991 }, { "epoch": 3.9682119205298014, "grad_norm": 0.8783749337395543, "learning_rate": 0.00011937931594520385, "loss": 1.4609, "step": 5992 }, { "epoch": 3.9688741721854304, "grad_norm": 1.0270371887594265, "learning_rate": 0.00011931397682360741, "loss": 1.6797, "step": 5993 }, { "epoch": 3.9695364238410598, "grad_norm": 1.044666297418703, "learning_rate": 0.00011924864377819545, "loss": 1.8906, "step": 5994 }, { "epoch": 3.9701986754966887, "grad_norm": 0.9429889919620572, "learning_rate": 0.00011918331682190454, "loss": 1.625, "step": 5995 }, { "epoch": 3.9708609271523176, "grad_norm": 1.047992362691198, "learning_rate": 0.00011911799596767016, "loss": 1.7656, "step": 5996 }, { "epoch": 3.971523178807947, "grad_norm": 1.0892263762817587, "learning_rate": 0.00011905268122842665, "loss": 1.8281, "step": 5997 }, { "epoch": 3.9721854304635764, "grad_norm": 0.9938213694353012, "learning_rate": 0.00011898737261710707, "loss": 1.7266, "step": 5998 }, { "epoch": 3.9728476821192054, "grad_norm": 1.0173092095974985, "learning_rate": 0.00011892207014664324, "loss": 1.6562, "step": 5999 }, { "epoch": 3.9735099337748343, "grad_norm": 1.1267495337940885, "learning_rate": 0.00011885677382996587, "loss": 2.0156, "step": 6000 }, { "epoch": 3.9741721854304637, "grad_norm": 0.9297865360219322, "learning_rate": 0.00011879148368000429, "loss": 1.6328, "step": 6001 }, { "epoch": 3.9748344370860926, "grad_norm": 8.587726232289832, "learning_rate": 0.00011872619970968671, "loss": 1.6016, "step": 6002 }, { "epoch": 3.975496688741722, "grad_norm": 0.9185089325463119, "learning_rate": 0.00011866092193194017, "loss": 1.5391, "step": 6003 }, { "epoch": 3.976158940397351, "grad_norm": 0.9202026167686796, "learning_rate": 0.00011859565035969039, "loss": 1.4766, "step": 6004 }, { "epoch": 3.97682119205298, "grad_norm": 0.7841295988463723, "learning_rate": 0.0001185303850058618, "loss": 1.2266, "step": 6005 }, { "epoch": 3.9774834437086093, "grad_norm": 0.9823307399984005, "learning_rate": 0.00011846512588337786, "loss": 1.6094, "step": 6006 }, { "epoch": 3.9781456953642387, "grad_norm": 0.9848627286114742, "learning_rate": 0.0001183998730051605, "loss": 1.7266, "step": 6007 }, { "epoch": 3.9788079470198676, "grad_norm": 0.9895148946902289, "learning_rate": 0.00011833462638413053, "loss": 1.5391, "step": 6008 }, { "epoch": 3.9794701986754966, "grad_norm": 0.9650377811018416, "learning_rate": 0.00011826938603320759, "loss": 1.6328, "step": 6009 }, { "epoch": 3.980132450331126, "grad_norm": 1.0260845778965442, "learning_rate": 0.00011820415196531004, "loss": 1.6172, "step": 6010 }, { "epoch": 3.980794701986755, "grad_norm": 0.9528885037589999, "learning_rate": 0.00011813892419335483, "loss": 1.5391, "step": 6011 }, { "epoch": 3.9814569536423843, "grad_norm": 0.9856470385874971, "learning_rate": 0.000118073702730258, "loss": 1.6094, "step": 6012 }, { "epoch": 3.982119205298013, "grad_norm": 1.016281948867364, "learning_rate": 0.00011800848758893402, "loss": 1.7188, "step": 6013 }, { "epoch": 3.982781456953642, "grad_norm": 1.0071027348979533, "learning_rate": 0.00011794327878229629, "loss": 1.6328, "step": 6014 }, { "epoch": 3.9834437086092715, "grad_norm": 1.0196810093717645, "learning_rate": 0.0001178780763232568, "loss": 1.6719, "step": 6015 }, { "epoch": 3.984105960264901, "grad_norm": 0.9583299437773576, "learning_rate": 0.00011781288022472655, "loss": 1.5078, "step": 6016 }, { "epoch": 3.98476821192053, "grad_norm": 1.0359187393386555, "learning_rate": 0.00011774769049961501, "loss": 1.6562, "step": 6017 }, { "epoch": 3.985430463576159, "grad_norm": 1.1924712882228408, "learning_rate": 0.00011768250716083046, "loss": 2.0156, "step": 6018 }, { "epoch": 3.986092715231788, "grad_norm": 1.089328310253908, "learning_rate": 0.00011761733022128006, "loss": 1.7344, "step": 6019 }, { "epoch": 3.986754966887417, "grad_norm": 1.0246044973369064, "learning_rate": 0.00011755215969386954, "loss": 1.6797, "step": 6020 }, { "epoch": 3.9874172185430465, "grad_norm": 1.0244430954779407, "learning_rate": 0.00011748699559150336, "loss": 1.8828, "step": 6021 }, { "epoch": 3.9880794701986755, "grad_norm": 0.8980850522942763, "learning_rate": 0.0001174218379270848, "loss": 1.4922, "step": 6022 }, { "epoch": 3.9887417218543044, "grad_norm": 1.0316601983913096, "learning_rate": 0.00011735668671351585, "loss": 1.7656, "step": 6023 }, { "epoch": 3.989403973509934, "grad_norm": 1.0674478580151399, "learning_rate": 0.00011729154196369715, "loss": 1.8047, "step": 6024 }, { "epoch": 3.9900662251655628, "grad_norm": 0.979390877514744, "learning_rate": 0.00011722640369052814, "loss": 1.7422, "step": 6025 }, { "epoch": 3.990728476821192, "grad_norm": 1.049215990898217, "learning_rate": 0.00011716127190690697, "loss": 1.7578, "step": 6026 }, { "epoch": 3.991390728476821, "grad_norm": 1.0380192101443995, "learning_rate": 0.0001170961466257304, "loss": 1.8828, "step": 6027 }, { "epoch": 3.9920529801324505, "grad_norm": 0.9705093209116584, "learning_rate": 0.00011703102785989405, "loss": 1.6406, "step": 6028 }, { "epoch": 3.9927152317880794, "grad_norm": 1.0071879018076397, "learning_rate": 0.00011696591562229217, "loss": 1.6484, "step": 6029 }, { "epoch": 3.993377483443709, "grad_norm": 0.9147838453901272, "learning_rate": 0.00011690080992581768, "loss": 1.5781, "step": 6030 }, { "epoch": 3.9940397350993377, "grad_norm": 0.9532397247780353, "learning_rate": 0.00011683571078336236, "loss": 1.7734, "step": 6031 }, { "epoch": 3.9947019867549667, "grad_norm": 0.9470989205599568, "learning_rate": 0.00011677061820781649, "loss": 1.6172, "step": 6032 }, { "epoch": 3.995364238410596, "grad_norm": 0.9660650446767381, "learning_rate": 0.00011670553221206914, "loss": 1.7188, "step": 6033 }, { "epoch": 3.996026490066225, "grad_norm": 0.9931590095839206, "learning_rate": 0.00011664045280900815, "loss": 1.7656, "step": 6034 }, { "epoch": 3.9966887417218544, "grad_norm": 0.9464276998651157, "learning_rate": 0.00011657538001151997, "loss": 1.5469, "step": 6035 }, { "epoch": 3.9973509933774833, "grad_norm": 0.9267903575032951, "learning_rate": 0.00011651031383248971, "loss": 1.3906, "step": 6036 }, { "epoch": 3.9980132450331127, "grad_norm": 1.0232663393473593, "learning_rate": 0.00011644525428480133, "loss": 1.7031, "step": 6037 }, { "epoch": 3.9986754966887417, "grad_norm": 0.9589901748908097, "learning_rate": 0.00011638020138133727, "loss": 1.5469, "step": 6038 }, { "epoch": 3.999337748344371, "grad_norm": 1.000400829760944, "learning_rate": 0.00011631515513497872, "loss": 1.5312, "step": 6039 }, { "epoch": 4.0, "grad_norm": 0.9968704337632017, "learning_rate": 0.00011625011555860567, "loss": 1.7266, "step": 6040 }, { "epoch": 4.0, "eval_loss": 2.2305381298065186, "eval_runtime": 33.9662, "eval_samples_per_second": 9.951, "eval_steps_per_second": 9.951, "step": 6040 }, { "epoch": 4.000662251655629, "grad_norm": 0.8295225117868664, "learning_rate": 0.00011618508266509672, "loss": 1.1797, "step": 6041 }, { "epoch": 4.001324503311258, "grad_norm": 0.7266332386137614, "learning_rate": 0.00011612005646732897, "loss": 0.8516, "step": 6042 }, { "epoch": 4.001986754966888, "grad_norm": 0.7727855688630928, "learning_rate": 0.00011605503697817855, "loss": 1.0625, "step": 6043 }, { "epoch": 4.002649006622517, "grad_norm": 0.9028321973589735, "learning_rate": 0.00011599002421051996, "loss": 1.2344, "step": 6044 }, { "epoch": 4.003311258278146, "grad_norm": 0.765166278852301, "learning_rate": 0.00011592501817722645, "loss": 0.9688, "step": 6045 }, { "epoch": 4.0039735099337745, "grad_norm": 0.6569216129782834, "learning_rate": 0.00011586001889117002, "loss": 0.6719, "step": 6046 }, { "epoch": 4.004635761589404, "grad_norm": 0.7571735255158986, "learning_rate": 0.0001157950263652213, "loss": 0.8789, "step": 6047 }, { "epoch": 4.005298013245033, "grad_norm": 0.7659321357900977, "learning_rate": 0.00011573004061224944, "loss": 0.8477, "step": 6048 }, { "epoch": 4.005960264900662, "grad_norm": 0.9017468661723219, "learning_rate": 0.00011566506164512245, "loss": 0.9922, "step": 6049 }, { "epoch": 4.006622516556291, "grad_norm": 1.1376938621436214, "learning_rate": 0.00011560008947670689, "loss": 1.2266, "step": 6050 }, { "epoch": 4.00728476821192, "grad_norm": 1.0500217707359532, "learning_rate": 0.00011553512411986797, "loss": 1.0938, "step": 6051 }, { "epoch": 4.00794701986755, "grad_norm": 0.9951311941412714, "learning_rate": 0.00011547016558746963, "loss": 1.0312, "step": 6052 }, { "epoch": 4.008609271523179, "grad_norm": 0.9128312414997436, "learning_rate": 0.00011540521389237442, "loss": 0.9453, "step": 6053 }, { "epoch": 4.009271523178808, "grad_norm": 0.8697596319334847, "learning_rate": 0.0001153402690474434, "loss": 0.8789, "step": 6054 }, { "epoch": 4.009933774834437, "grad_norm": 0.9499494979222866, "learning_rate": 0.0001152753310655365, "loss": 0.8672, "step": 6055 }, { "epoch": 4.010596026490067, "grad_norm": 0.6655464246200212, "learning_rate": 0.00011521039995951217, "loss": 0.6172, "step": 6056 }, { "epoch": 4.011258278145696, "grad_norm": 1.0863903920417386, "learning_rate": 0.00011514547574222744, "loss": 0.9453, "step": 6057 }, { "epoch": 4.0119205298013245, "grad_norm": 0.9628375304753726, "learning_rate": 0.0001150805584265382, "loss": 0.8906, "step": 6058 }, { "epoch": 4.0125827814569535, "grad_norm": 1.147439567381915, "learning_rate": 0.00011501564802529869, "loss": 1.1016, "step": 6059 }, { "epoch": 4.013245033112582, "grad_norm": 0.7799343678360273, "learning_rate": 0.0001149507445513619, "loss": 0.6719, "step": 6060 }, { "epoch": 4.013907284768212, "grad_norm": 1.0997259834379338, "learning_rate": 0.00011488584801757957, "loss": 1.0547, "step": 6061 }, { "epoch": 4.014569536423841, "grad_norm": 0.867350769500672, "learning_rate": 0.00011482095843680189, "loss": 0.7578, "step": 6062 }, { "epoch": 4.01523178807947, "grad_norm": 0.7771363448981655, "learning_rate": 0.00011475607582187772, "loss": 0.6758, "step": 6063 }, { "epoch": 4.015894039735099, "grad_norm": 1.097822320698339, "learning_rate": 0.00011469120018565468, "loss": 1.0469, "step": 6064 }, { "epoch": 4.016556291390729, "grad_norm": 1.111989715189192, "learning_rate": 0.00011462633154097876, "loss": 1.1094, "step": 6065 }, { "epoch": 4.017218543046358, "grad_norm": 0.9528213136522604, "learning_rate": 0.0001145614699006947, "loss": 0.8906, "step": 6066 }, { "epoch": 4.017880794701987, "grad_norm": 0.933669287032352, "learning_rate": 0.00011449661527764594, "loss": 0.8281, "step": 6067 }, { "epoch": 4.018543046357616, "grad_norm": 0.9514160534723897, "learning_rate": 0.00011443176768467442, "loss": 0.8516, "step": 6068 }, { "epoch": 4.019205298013245, "grad_norm": 1.153412539569718, "learning_rate": 0.00011436692713462056, "loss": 1.1172, "step": 6069 }, { "epoch": 4.0198675496688745, "grad_norm": 1.1092156739458603, "learning_rate": 0.00011430209364032377, "loss": 1.0312, "step": 6070 }, { "epoch": 4.020529801324503, "grad_norm": 0.9980405443252298, "learning_rate": 0.00011423726721462164, "loss": 0.9453, "step": 6071 }, { "epoch": 4.021192052980132, "grad_norm": 1.050254890740168, "learning_rate": 0.00011417244787035066, "loss": 0.9688, "step": 6072 }, { "epoch": 4.021854304635761, "grad_norm": 0.9388678320829094, "learning_rate": 0.00011410763562034565, "loss": 0.9102, "step": 6073 }, { "epoch": 4.022516556291391, "grad_norm": 1.1209802759420444, "learning_rate": 0.00011404283047744036, "loss": 0.9297, "step": 6074 }, { "epoch": 4.02317880794702, "grad_norm": 0.9147887448046268, "learning_rate": 0.00011397803245446686, "loss": 0.8164, "step": 6075 }, { "epoch": 4.023841059602649, "grad_norm": 1.1873637000071138, "learning_rate": 0.00011391324156425588, "loss": 1.2266, "step": 6076 }, { "epoch": 4.024503311258278, "grad_norm": 1.1242035847406417, "learning_rate": 0.00011384845781963681, "loss": 1.0625, "step": 6077 }, { "epoch": 4.025165562913907, "grad_norm": 0.8701647628634304, "learning_rate": 0.0001137836812334376, "loss": 0.8086, "step": 6078 }, { "epoch": 4.025827814569537, "grad_norm": 1.0469717812158525, "learning_rate": 0.0001137189118184846, "loss": 0.9297, "step": 6079 }, { "epoch": 4.026490066225166, "grad_norm": 1.0041101064210378, "learning_rate": 0.00011365414958760312, "loss": 0.9258, "step": 6080 }, { "epoch": 4.027152317880795, "grad_norm": 0.8938651505809888, "learning_rate": 0.00011358939455361668, "loss": 0.7188, "step": 6081 }, { "epoch": 4.027814569536424, "grad_norm": 1.0322947369784174, "learning_rate": 0.00011352464672934752, "loss": 0.8984, "step": 6082 }, { "epoch": 4.028476821192053, "grad_norm": 1.1054143590771612, "learning_rate": 0.00011345990612761652, "loss": 0.9766, "step": 6083 }, { "epoch": 4.029139072847682, "grad_norm": 1.1125494691704148, "learning_rate": 0.00011339517276124305, "loss": 1.0391, "step": 6084 }, { "epoch": 4.029801324503311, "grad_norm": 1.0355392695858143, "learning_rate": 0.00011333044664304496, "loss": 0.9766, "step": 6085 }, { "epoch": 4.03046357615894, "grad_norm": 0.894740905730587, "learning_rate": 0.00011326572778583889, "loss": 0.8281, "step": 6086 }, { "epoch": 4.031125827814569, "grad_norm": 0.7829140849328088, "learning_rate": 0.00011320101620243984, "loss": 0.7031, "step": 6087 }, { "epoch": 4.031788079470199, "grad_norm": 1.0405278653112255, "learning_rate": 0.00011313631190566146, "loss": 1.0391, "step": 6088 }, { "epoch": 4.032450331125828, "grad_norm": 1.2253934130017077, "learning_rate": 0.000113071614908316, "loss": 1.1953, "step": 6089 }, { "epoch": 4.033112582781457, "grad_norm": 0.8015260583976082, "learning_rate": 0.00011300692522321413, "loss": 0.707, "step": 6090 }, { "epoch": 4.033774834437086, "grad_norm": 0.8143916875866967, "learning_rate": 0.00011294224286316516, "loss": 0.7031, "step": 6091 }, { "epoch": 4.034437086092716, "grad_norm": 1.273912609960614, "learning_rate": 0.00011287756784097697, "loss": 1.2266, "step": 6092 }, { "epoch": 4.035099337748345, "grad_norm": 0.6643608268915918, "learning_rate": 0.00011281290016945595, "loss": 0.5547, "step": 6093 }, { "epoch": 4.035761589403974, "grad_norm": 1.024292407413232, "learning_rate": 0.00011274823986140698, "loss": 0.875, "step": 6094 }, { "epoch": 4.0364238410596025, "grad_norm": 1.1113498589763122, "learning_rate": 0.00011268358692963368, "loss": 0.9609, "step": 6095 }, { "epoch": 4.0370860927152314, "grad_norm": 0.9856726075807472, "learning_rate": 0.00011261894138693796, "loss": 0.8789, "step": 6096 }, { "epoch": 4.037748344370861, "grad_norm": 0.9228205284624255, "learning_rate": 0.00011255430324612037, "loss": 0.7773, "step": 6097 }, { "epoch": 4.03841059602649, "grad_norm": 1.1404135386531435, "learning_rate": 0.00011248967251998005, "loss": 1.0469, "step": 6098 }, { "epoch": 4.039072847682119, "grad_norm": 0.9045183758269307, "learning_rate": 0.00011242504922131467, "loss": 0.6797, "step": 6099 }, { "epoch": 4.039735099337748, "grad_norm": 0.863250279444504, "learning_rate": 0.00011236043336292022, "loss": 0.6719, "step": 6100 }, { "epoch": 4.040397350993377, "grad_norm": 1.1707184880112114, "learning_rate": 0.00011229582495759159, "loss": 1.0625, "step": 6101 }, { "epoch": 4.041059602649007, "grad_norm": 0.8792400476696068, "learning_rate": 0.00011223122401812185, "loss": 0.7656, "step": 6102 }, { "epoch": 4.041721854304636, "grad_norm": 0.9834222264926916, "learning_rate": 0.00011216663055730273, "loss": 0.7812, "step": 6103 }, { "epoch": 4.042384105960265, "grad_norm": 1.0520949822226686, "learning_rate": 0.00011210204458792452, "loss": 0.8242, "step": 6104 }, { "epoch": 4.043046357615894, "grad_norm": 0.9018274966222466, "learning_rate": 0.000112037466122776, "loss": 0.8203, "step": 6105 }, { "epoch": 4.0437086092715235, "grad_norm": 1.1083785234738406, "learning_rate": 0.00011197289517464434, "loss": 0.9414, "step": 6106 }, { "epoch": 4.0443708609271525, "grad_norm": 1.1360670442136473, "learning_rate": 0.00011190833175631548, "loss": 1.0391, "step": 6107 }, { "epoch": 4.045033112582781, "grad_norm": 1.0727601198901249, "learning_rate": 0.00011184377588057361, "loss": 0.8867, "step": 6108 }, { "epoch": 4.04569536423841, "grad_norm": 1.0317899958147387, "learning_rate": 0.00011177922756020155, "loss": 0.8164, "step": 6109 }, { "epoch": 4.046357615894039, "grad_norm": 0.9532874345891553, "learning_rate": 0.00011171468680798063, "loss": 0.832, "step": 6110 }, { "epoch": 4.047019867549669, "grad_norm": 1.2175977993133529, "learning_rate": 0.00011165015363669068, "loss": 1.125, "step": 6111 }, { "epoch": 4.047682119205298, "grad_norm": 1.003430968101119, "learning_rate": 0.00011158562805910991, "loss": 0.8633, "step": 6112 }, { "epoch": 4.048344370860927, "grad_norm": 0.8657745475806908, "learning_rate": 0.00011152111008801525, "loss": 0.6719, "step": 6113 }, { "epoch": 4.049006622516556, "grad_norm": 1.098832170323678, "learning_rate": 0.0001114565997361819, "loss": 1.0, "step": 6114 }, { "epoch": 4.049668874172186, "grad_norm": 0.7862532444176566, "learning_rate": 0.00011139209701638369, "loss": 0.6094, "step": 6115 }, { "epoch": 4.050331125827815, "grad_norm": 1.002899318607169, "learning_rate": 0.00011132760194139293, "loss": 0.8281, "step": 6116 }, { "epoch": 4.050993377483444, "grad_norm": 1.2031492280036507, "learning_rate": 0.00011126311452398034, "loss": 0.9766, "step": 6117 }, { "epoch": 4.051655629139073, "grad_norm": 0.819001014965865, "learning_rate": 0.00011119863477691513, "loss": 0.6172, "step": 6118 }, { "epoch": 4.052317880794702, "grad_norm": 0.9490399510267773, "learning_rate": 0.00011113416271296514, "loss": 0.7461, "step": 6119 }, { "epoch": 4.052980132450331, "grad_norm": 1.1142188873033128, "learning_rate": 0.00011106969834489648, "loss": 0.918, "step": 6120 }, { "epoch": 4.05364238410596, "grad_norm": 1.0055561494429706, "learning_rate": 0.00011100524168547385, "loss": 0.7617, "step": 6121 }, { "epoch": 4.054304635761589, "grad_norm": 0.9863092902918323, "learning_rate": 0.00011094079274746048, "loss": 0.9023, "step": 6122 }, { "epoch": 4.054966887417218, "grad_norm": 0.8008173578446519, "learning_rate": 0.00011087635154361792, "loss": 0.625, "step": 6123 }, { "epoch": 4.055629139072848, "grad_norm": 1.1288406592370104, "learning_rate": 0.00011081191808670627, "loss": 0.9375, "step": 6124 }, { "epoch": 4.056291390728477, "grad_norm": 1.181138573846123, "learning_rate": 0.00011074749238948415, "loss": 0.9453, "step": 6125 }, { "epoch": 4.056953642384106, "grad_norm": 1.1128121527033252, "learning_rate": 0.00011068307446470858, "loss": 0.9492, "step": 6126 }, { "epoch": 4.057615894039735, "grad_norm": 1.2333781639211527, "learning_rate": 0.00011061866432513492, "loss": 1.0781, "step": 6127 }, { "epoch": 4.058278145695364, "grad_norm": 1.140504128823657, "learning_rate": 0.00011055426198351732, "loss": 0.9609, "step": 6128 }, { "epoch": 4.058940397350994, "grad_norm": 1.0578637746407857, "learning_rate": 0.00011048986745260803, "loss": 0.918, "step": 6129 }, { "epoch": 4.059602649006623, "grad_norm": 0.9837589691809756, "learning_rate": 0.00011042548074515797, "loss": 0.8359, "step": 6130 }, { "epoch": 4.0602649006622515, "grad_norm": 1.1316393457165368, "learning_rate": 0.00011036110187391639, "loss": 1.0469, "step": 6131 }, { "epoch": 4.0609271523178805, "grad_norm": 1.0999729518262902, "learning_rate": 0.00011029673085163114, "loss": 1.0, "step": 6132 }, { "epoch": 4.06158940397351, "grad_norm": 1.003516059397668, "learning_rate": 0.00011023236769104833, "loss": 0.8789, "step": 6133 }, { "epoch": 4.062251655629139, "grad_norm": 1.1241101033866776, "learning_rate": 0.00011016801240491263, "loss": 0.9297, "step": 6134 }, { "epoch": 4.062913907284768, "grad_norm": 0.862811325364301, "learning_rate": 0.00011010366500596715, "loss": 0.7578, "step": 6135 }, { "epoch": 4.063576158940397, "grad_norm": 1.0526687590971255, "learning_rate": 0.00011003932550695343, "loss": 0.918, "step": 6136 }, { "epoch": 4.064238410596026, "grad_norm": 1.040565041492152, "learning_rate": 0.00010997499392061129, "loss": 0.9453, "step": 6137 }, { "epoch": 4.064900662251656, "grad_norm": 1.1558093656867783, "learning_rate": 0.00010991067025967934, "loss": 0.9648, "step": 6138 }, { "epoch": 4.065562913907285, "grad_norm": 1.0914902805465116, "learning_rate": 0.00010984635453689423, "loss": 0.9102, "step": 6139 }, { "epoch": 4.066225165562914, "grad_norm": 0.9802797646791277, "learning_rate": 0.00010978204676499125, "loss": 0.8359, "step": 6140 }, { "epoch": 4.066887417218543, "grad_norm": 1.0352682665193418, "learning_rate": 0.00010971774695670414, "loss": 0.8398, "step": 6141 }, { "epoch": 4.067549668874173, "grad_norm": 1.0816479971775392, "learning_rate": 0.00010965345512476497, "loss": 0.8516, "step": 6142 }, { "epoch": 4.0682119205298015, "grad_norm": 1.1738892439614412, "learning_rate": 0.00010958917128190417, "loss": 0.9727, "step": 6143 }, { "epoch": 4.0688741721854305, "grad_norm": 1.190389227837336, "learning_rate": 0.0001095248954408508, "loss": 0.9492, "step": 6144 }, { "epoch": 4.069536423841059, "grad_norm": 1.164894174868382, "learning_rate": 0.00010946062761433217, "loss": 1.0703, "step": 6145 }, { "epoch": 4.070198675496688, "grad_norm": 0.8320490788695099, "learning_rate": 0.00010939636781507397, "loss": 0.6797, "step": 6146 }, { "epoch": 4.070860927152318, "grad_norm": 1.1075848503818417, "learning_rate": 0.00010933211605580052, "loss": 0.9219, "step": 6147 }, { "epoch": 4.071523178807947, "grad_norm": 1.0844108160278423, "learning_rate": 0.00010926787234923435, "loss": 1.0781, "step": 6148 }, { "epoch": 4.072185430463576, "grad_norm": 0.9548600743564585, "learning_rate": 0.00010920363670809637, "loss": 0.7695, "step": 6149 }, { "epoch": 4.072847682119205, "grad_norm": 1.0922439035925138, "learning_rate": 0.00010913940914510609, "loss": 0.9023, "step": 6150 }, { "epoch": 4.073509933774835, "grad_norm": 1.1207046484323397, "learning_rate": 0.00010907518967298123, "loss": 1.0, "step": 6151 }, { "epoch": 4.074172185430464, "grad_norm": 1.1158815742620745, "learning_rate": 0.00010901097830443796, "loss": 0.9297, "step": 6152 }, { "epoch": 4.074834437086093, "grad_norm": 0.9304070896365855, "learning_rate": 0.00010894677505219099, "loss": 0.7305, "step": 6153 }, { "epoch": 4.075496688741722, "grad_norm": 0.9282693787096283, "learning_rate": 0.00010888257992895317, "loss": 0.7734, "step": 6154 }, { "epoch": 4.076158940397351, "grad_norm": 1.1720616606525114, "learning_rate": 0.0001088183929474359, "loss": 1.0391, "step": 6155 }, { "epoch": 4.07682119205298, "grad_norm": 1.1520419968611935, "learning_rate": 0.000108754214120349, "loss": 0.9414, "step": 6156 }, { "epoch": 4.077483443708609, "grad_norm": 0.9666933805090103, "learning_rate": 0.00010869004346040055, "loss": 0.8008, "step": 6157 }, { "epoch": 4.078145695364238, "grad_norm": 1.0378605322634982, "learning_rate": 0.00010862588098029709, "loss": 0.8242, "step": 6158 }, { "epoch": 4.078807947019867, "grad_norm": 0.9499468543678622, "learning_rate": 0.00010856172669274358, "loss": 0.8438, "step": 6159 }, { "epoch": 4.079470198675497, "grad_norm": 1.2171483290736038, "learning_rate": 0.00010849758061044323, "loss": 1.0703, "step": 6160 }, { "epoch": 4.080132450331126, "grad_norm": 1.0604980876986472, "learning_rate": 0.00010843344274609772, "loss": 0.8555, "step": 6161 }, { "epoch": 4.080794701986755, "grad_norm": 0.9375595359931704, "learning_rate": 0.0001083693131124071, "loss": 0.7266, "step": 6162 }, { "epoch": 4.081456953642384, "grad_norm": 1.1731259180404756, "learning_rate": 0.00010830519172206981, "loss": 1.0234, "step": 6163 }, { "epoch": 4.082119205298013, "grad_norm": 1.052047635000336, "learning_rate": 0.00010824107858778245, "loss": 0.8438, "step": 6164 }, { "epoch": 4.082781456953643, "grad_norm": 1.0203960908284417, "learning_rate": 0.00010817697372224041, "loss": 0.7852, "step": 6165 }, { "epoch": 4.083443708609272, "grad_norm": 0.9726764675584749, "learning_rate": 0.00010811287713813701, "loss": 0.7266, "step": 6166 }, { "epoch": 4.084105960264901, "grad_norm": 0.9202087953719736, "learning_rate": 0.00010804878884816412, "loss": 0.7773, "step": 6167 }, { "epoch": 4.0847682119205295, "grad_norm": 1.003795981323917, "learning_rate": 0.00010798470886501203, "loss": 0.7461, "step": 6168 }, { "epoch": 4.085430463576159, "grad_norm": 0.994724144978067, "learning_rate": 0.0001079206372013693, "loss": 0.8125, "step": 6169 }, { "epoch": 4.086092715231788, "grad_norm": 1.1484884308538337, "learning_rate": 0.00010785657386992279, "loss": 0.9414, "step": 6170 }, { "epoch": 4.086754966887417, "grad_norm": 0.9676172352162731, "learning_rate": 0.00010779251888335782, "loss": 0.7852, "step": 6171 }, { "epoch": 4.087417218543046, "grad_norm": 0.9960351981894656, "learning_rate": 0.00010772847225435801, "loss": 0.7852, "step": 6172 }, { "epoch": 4.088079470198675, "grad_norm": 1.2089386000553126, "learning_rate": 0.00010766443399560529, "loss": 0.9961, "step": 6173 }, { "epoch": 4.088741721854305, "grad_norm": 0.9875402351823904, "learning_rate": 0.00010760040411978004, "loss": 0.8398, "step": 6174 }, { "epoch": 4.089403973509934, "grad_norm": 1.1503639680417312, "learning_rate": 0.00010753638263956089, "loss": 0.957, "step": 6175 }, { "epoch": 4.090066225165563, "grad_norm": 1.1444604033106045, "learning_rate": 0.00010747236956762473, "loss": 0.957, "step": 6176 }, { "epoch": 4.090728476821192, "grad_norm": 0.9620191937312393, "learning_rate": 0.000107408364916647, "loss": 0.8125, "step": 6177 }, { "epoch": 4.091390728476822, "grad_norm": 0.9050149486915279, "learning_rate": 0.00010734436869930134, "loss": 0.7188, "step": 6178 }, { "epoch": 4.092052980132451, "grad_norm": 1.013704251816913, "learning_rate": 0.00010728038092825967, "loss": 0.8398, "step": 6179 }, { "epoch": 4.0927152317880795, "grad_norm": 0.9317626812105108, "learning_rate": 0.00010721640161619238, "loss": 0.8398, "step": 6180 }, { "epoch": 4.093377483443708, "grad_norm": 1.1083403838074983, "learning_rate": 0.00010715243077576805, "loss": 0.9023, "step": 6181 }, { "epoch": 4.094039735099337, "grad_norm": 0.8991871627697058, "learning_rate": 0.00010708846841965363, "loss": 0.7422, "step": 6182 }, { "epoch": 4.094701986754967, "grad_norm": 1.0569232401961193, "learning_rate": 0.00010702451456051445, "loss": 0.8086, "step": 6183 }, { "epoch": 4.095364238410596, "grad_norm": 0.9293558481989543, "learning_rate": 0.00010696056921101409, "loss": 0.668, "step": 6184 }, { "epoch": 4.096026490066225, "grad_norm": 1.0967084173775539, "learning_rate": 0.00010689663238381448, "loss": 0.8945, "step": 6185 }, { "epoch": 4.096688741721854, "grad_norm": 1.2070857534071646, "learning_rate": 0.00010683270409157577, "loss": 0.8945, "step": 6186 }, { "epoch": 4.097350993377484, "grad_norm": 0.9638353882112738, "learning_rate": 0.00010676878434695656, "loss": 0.7266, "step": 6187 }, { "epoch": 4.098013245033113, "grad_norm": 1.13338973242338, "learning_rate": 0.0001067048731626137, "loss": 0.8242, "step": 6188 }, { "epoch": 4.098675496688742, "grad_norm": 1.0361474108741482, "learning_rate": 0.00010664097055120228, "loss": 0.7461, "step": 6189 }, { "epoch": 4.099337748344371, "grad_norm": 1.19580275207176, "learning_rate": 0.00010657707652537585, "loss": 0.9961, "step": 6190 }, { "epoch": 4.1, "grad_norm": 1.1300805453008844, "learning_rate": 0.00010651319109778606, "loss": 0.9258, "step": 6191 }, { "epoch": 4.1006622516556295, "grad_norm": 1.1577862467213746, "learning_rate": 0.00010644931428108295, "loss": 1.0234, "step": 6192 }, { "epoch": 4.101324503311258, "grad_norm": 1.121482014852132, "learning_rate": 0.00010638544608791496, "loss": 0.9102, "step": 6193 }, { "epoch": 4.101986754966887, "grad_norm": 1.11831185126957, "learning_rate": 0.00010632158653092868, "loss": 1.0, "step": 6194 }, { "epoch": 4.102649006622516, "grad_norm": 1.2356005011263262, "learning_rate": 0.00010625773562276894, "loss": 1.0547, "step": 6195 }, { "epoch": 4.103311258278145, "grad_norm": 0.9628279953061157, "learning_rate": 0.00010619389337607912, "loss": 0.7969, "step": 6196 }, { "epoch": 4.103973509933775, "grad_norm": 1.032529703126546, "learning_rate": 0.00010613005980350061, "loss": 0.7852, "step": 6197 }, { "epoch": 4.104635761589404, "grad_norm": 1.2501592861588524, "learning_rate": 0.00010606623491767319, "loss": 1.0469, "step": 6198 }, { "epoch": 4.105298013245033, "grad_norm": 0.8417599479593245, "learning_rate": 0.00010600241873123495, "loss": 0.6055, "step": 6199 }, { "epoch": 4.105960264900662, "grad_norm": 1.0220351338588083, "learning_rate": 0.00010593861125682226, "loss": 0.8008, "step": 6200 }, { "epoch": 4.106622516556292, "grad_norm": 1.2194440204982333, "learning_rate": 0.00010587481250706961, "loss": 1.1484, "step": 6201 }, { "epoch": 4.107284768211921, "grad_norm": 1.102084334508904, "learning_rate": 0.00010581102249461002, "loss": 0.9688, "step": 6202 }, { "epoch": 4.10794701986755, "grad_norm": 1.2041159758137538, "learning_rate": 0.00010574724123207457, "loss": 1.0234, "step": 6203 }, { "epoch": 4.108609271523179, "grad_norm": 0.9921530871313249, "learning_rate": 0.00010568346873209269, "loss": 0.8438, "step": 6204 }, { "epoch": 4.109271523178808, "grad_norm": 1.0154330240935314, "learning_rate": 0.0001056197050072921, "loss": 0.7852, "step": 6205 }, { "epoch": 4.109933774834437, "grad_norm": 1.1583271974980496, "learning_rate": 0.00010555595007029872, "loss": 0.9688, "step": 6206 }, { "epoch": 4.110596026490066, "grad_norm": 0.9789733912654255, "learning_rate": 0.0001054922039337367, "loss": 0.7578, "step": 6207 }, { "epoch": 4.111258278145695, "grad_norm": 1.1754808439068591, "learning_rate": 0.00010542846661022863, "loss": 1.0234, "step": 6208 }, { "epoch": 4.111920529801324, "grad_norm": 0.9889619008802039, "learning_rate": 0.00010536473811239514, "loss": 0.7812, "step": 6209 }, { "epoch": 4.112582781456954, "grad_norm": 0.9091287209670218, "learning_rate": 0.00010530101845285522, "loss": 0.7227, "step": 6210 }, { "epoch": 4.113245033112583, "grad_norm": 1.3688291205661167, "learning_rate": 0.00010523730764422613, "loss": 1.1719, "step": 6211 }, { "epoch": 4.113907284768212, "grad_norm": 0.9980378099575175, "learning_rate": 0.00010517360569912335, "loss": 0.7617, "step": 6212 }, { "epoch": 4.114569536423841, "grad_norm": 0.9175836566535053, "learning_rate": 0.00010510991263016048, "loss": 0.8125, "step": 6213 }, { "epoch": 4.11523178807947, "grad_norm": 1.0516569556209148, "learning_rate": 0.00010504622844994957, "loss": 0.8086, "step": 6214 }, { "epoch": 4.1158940397351, "grad_norm": 1.0365959501155582, "learning_rate": 0.00010498255317110086, "loss": 0.7891, "step": 6215 }, { "epoch": 4.1165562913907285, "grad_norm": 1.15720381907385, "learning_rate": 0.00010491888680622267, "loss": 1.0156, "step": 6216 }, { "epoch": 4.1172185430463575, "grad_norm": 1.199408812094867, "learning_rate": 0.00010485522936792179, "loss": 0.9609, "step": 6217 }, { "epoch": 4.117880794701986, "grad_norm": 1.1599399951107454, "learning_rate": 0.00010479158086880304, "loss": 0.9766, "step": 6218 }, { "epoch": 4.118543046357616, "grad_norm": 1.1622324165112725, "learning_rate": 0.00010472794132146953, "loss": 1.0, "step": 6219 }, { "epoch": 4.119205298013245, "grad_norm": 0.8778188781542734, "learning_rate": 0.0001046643107385227, "loss": 0.6719, "step": 6220 }, { "epoch": 4.119867549668874, "grad_norm": 1.216221303151822, "learning_rate": 0.00010460068913256214, "loss": 1.0, "step": 6221 }, { "epoch": 4.120529801324503, "grad_norm": 1.172145430794656, "learning_rate": 0.00010453707651618552, "loss": 0.9648, "step": 6222 }, { "epoch": 4.121192052980132, "grad_norm": 0.9187557537192379, "learning_rate": 0.00010447347290198907, "loss": 0.6875, "step": 6223 }, { "epoch": 4.121854304635762, "grad_norm": 0.9386624881605963, "learning_rate": 0.00010440987830256689, "loss": 0.7617, "step": 6224 }, { "epoch": 4.122516556291391, "grad_norm": 1.1798476886717546, "learning_rate": 0.00010434629273051143, "loss": 0.9766, "step": 6225 }, { "epoch": 4.12317880794702, "grad_norm": 0.8959804762812561, "learning_rate": 0.00010428271619841347, "loss": 0.6523, "step": 6226 }, { "epoch": 4.123841059602649, "grad_norm": 0.9649395788065253, "learning_rate": 0.00010421914871886185, "loss": 0.7969, "step": 6227 }, { "epoch": 4.1245033112582785, "grad_norm": 1.02846148805379, "learning_rate": 0.00010415559030444357, "loss": 0.7422, "step": 6228 }, { "epoch": 4.1251655629139075, "grad_norm": 0.9720999574885072, "learning_rate": 0.00010409204096774407, "loss": 0.7227, "step": 6229 }, { "epoch": 4.125827814569536, "grad_norm": 1.1522091483808263, "learning_rate": 0.00010402850072134676, "loss": 0.9219, "step": 6230 }, { "epoch": 4.126490066225165, "grad_norm": 1.1068860264224933, "learning_rate": 0.00010396496957783331, "loss": 0.8438, "step": 6231 }, { "epoch": 4.127152317880794, "grad_norm": 1.1027702806537238, "learning_rate": 0.0001039014475497837, "loss": 0.8984, "step": 6232 }, { "epoch": 4.127814569536424, "grad_norm": 1.149804371153029, "learning_rate": 0.000103837934649776, "loss": 0.8633, "step": 6233 }, { "epoch": 4.128476821192053, "grad_norm": 1.2400737285732375, "learning_rate": 0.00010377443089038641, "loss": 0.9648, "step": 6234 }, { "epoch": 4.129139072847682, "grad_norm": 1.2723883363486652, "learning_rate": 0.00010371093628418951, "loss": 1.1016, "step": 6235 }, { "epoch": 4.129801324503311, "grad_norm": 1.1050848093090886, "learning_rate": 0.0001036474508437579, "loss": 0.8086, "step": 6236 }, { "epoch": 4.130463576158941, "grad_norm": 1.1280700179925658, "learning_rate": 0.0001035839745816624, "loss": 0.9023, "step": 6237 }, { "epoch": 4.13112582781457, "grad_norm": 1.0943141206115414, "learning_rate": 0.0001035205075104721, "loss": 0.7891, "step": 6238 }, { "epoch": 4.131788079470199, "grad_norm": 0.9567971820617802, "learning_rate": 0.00010345704964275423, "loss": 0.7148, "step": 6239 }, { "epoch": 4.132450331125828, "grad_norm": 1.1528538727358535, "learning_rate": 0.00010339360099107405, "loss": 1.0078, "step": 6240 }, { "epoch": 4.1331125827814565, "grad_norm": 1.1189192638773982, "learning_rate": 0.00010333016156799524, "loss": 0.9375, "step": 6241 }, { "epoch": 4.133774834437086, "grad_norm": 1.1216563973328242, "learning_rate": 0.00010326673138607952, "loss": 0.8555, "step": 6242 }, { "epoch": 4.134437086092715, "grad_norm": 1.1434124835532453, "learning_rate": 0.00010320331045788676, "loss": 0.9727, "step": 6243 }, { "epoch": 4.135099337748344, "grad_norm": 1.0984156454885794, "learning_rate": 0.00010313989879597497, "loss": 0.9336, "step": 6244 }, { "epoch": 4.135761589403973, "grad_norm": 0.9002061013936019, "learning_rate": 0.00010307649641290049, "loss": 0.6992, "step": 6245 }, { "epoch": 4.136423841059603, "grad_norm": 0.9842071533064782, "learning_rate": 0.00010301310332121771, "loss": 0.7812, "step": 6246 }, { "epoch": 4.137086092715232, "grad_norm": 1.2086862486439607, "learning_rate": 0.0001029497195334791, "loss": 0.9648, "step": 6247 }, { "epoch": 4.137748344370861, "grad_norm": 0.8534806547668771, "learning_rate": 0.00010288634506223549, "loss": 0.6406, "step": 6248 }, { "epoch": 4.13841059602649, "grad_norm": 0.9911694586844878, "learning_rate": 0.00010282297992003569, "loss": 0.7734, "step": 6249 }, { "epoch": 4.139072847682119, "grad_norm": 1.1491288452083723, "learning_rate": 0.0001027596241194267, "loss": 0.9609, "step": 6250 }, { "epoch": 4.139735099337749, "grad_norm": 0.983134839466708, "learning_rate": 0.00010269627767295374, "loss": 0.7305, "step": 6251 }, { "epoch": 4.140397350993378, "grad_norm": 1.0738283286572465, "learning_rate": 0.00010263294059316012, "loss": 0.8398, "step": 6252 }, { "epoch": 4.1410596026490065, "grad_norm": 1.0088453490717972, "learning_rate": 0.0001025696128925873, "loss": 0.7969, "step": 6253 }, { "epoch": 4.1417218543046355, "grad_norm": 0.9287928479677533, "learning_rate": 0.00010250629458377493, "loss": 0.7383, "step": 6254 }, { "epoch": 4.142384105960265, "grad_norm": 1.1301020691322565, "learning_rate": 0.00010244298567926073, "loss": 0.8594, "step": 6255 }, { "epoch": 4.143046357615894, "grad_norm": 1.1209471438878709, "learning_rate": 0.00010237968619158056, "loss": 0.8594, "step": 6256 }, { "epoch": 4.143708609271523, "grad_norm": 1.1734671077347993, "learning_rate": 0.00010231639613326847, "loss": 0.9375, "step": 6257 }, { "epoch": 4.144370860927152, "grad_norm": 0.9648746114153576, "learning_rate": 0.00010225311551685669, "loss": 0.6836, "step": 6258 }, { "epoch": 4.145033112582781, "grad_norm": 1.0811551504621864, "learning_rate": 0.0001021898443548753, "loss": 0.8203, "step": 6259 }, { "epoch": 4.145695364238411, "grad_norm": 1.0829789277214377, "learning_rate": 0.00010212658265985299, "loss": 0.8711, "step": 6260 }, { "epoch": 4.14635761589404, "grad_norm": 1.1097007770091567, "learning_rate": 0.0001020633304443161, "loss": 0.9258, "step": 6261 }, { "epoch": 4.147019867549669, "grad_norm": 0.9239493341310807, "learning_rate": 0.00010200008772078934, "loss": 0.6953, "step": 6262 }, { "epoch": 4.147682119205298, "grad_norm": 1.031851746059729, "learning_rate": 0.00010193685450179556, "loss": 0.793, "step": 6263 }, { "epoch": 4.1483443708609276, "grad_norm": 1.174899506033891, "learning_rate": 0.00010187363079985563, "loss": 0.9648, "step": 6264 }, { "epoch": 4.1490066225165565, "grad_norm": 1.1427609768080544, "learning_rate": 0.00010181041662748847, "loss": 1.0, "step": 6265 }, { "epoch": 4.149668874172185, "grad_norm": 0.9268195929719053, "learning_rate": 0.00010174721199721137, "loss": 0.7578, "step": 6266 }, { "epoch": 4.150331125827814, "grad_norm": 1.249271326803708, "learning_rate": 0.00010168401692153948, "loss": 1.0391, "step": 6267 }, { "epoch": 4.150993377483443, "grad_norm": 1.2507333029392143, "learning_rate": 0.00010162083141298611, "loss": 1.0, "step": 6268 }, { "epoch": 4.151655629139073, "grad_norm": 1.1300147134367888, "learning_rate": 0.00010155765548406282, "loss": 0.8984, "step": 6269 }, { "epoch": 4.152317880794702, "grad_norm": 1.2434929176061937, "learning_rate": 0.00010149448914727915, "loss": 1.0156, "step": 6270 }, { "epoch": 4.152980132450331, "grad_norm": 0.9021352461958396, "learning_rate": 0.00010143133241514267, "loss": 0.7969, "step": 6271 }, { "epoch": 4.15364238410596, "grad_norm": 1.2346634641243102, "learning_rate": 0.00010136818530015919, "loss": 1.0547, "step": 6272 }, { "epoch": 4.15430463576159, "grad_norm": 1.079344373981255, "learning_rate": 0.00010130504781483259, "loss": 0.9023, "step": 6273 }, { "epoch": 4.154966887417219, "grad_norm": 0.8907119629602627, "learning_rate": 0.00010124191997166475, "loss": 0.6133, "step": 6274 }, { "epoch": 4.155629139072848, "grad_norm": 1.2151958161001144, "learning_rate": 0.00010117880178315583, "loss": 1.0703, "step": 6275 }, { "epoch": 4.156291390728477, "grad_norm": 0.9244592713523764, "learning_rate": 0.00010111569326180385, "loss": 0.7188, "step": 6276 }, { "epoch": 4.156953642384106, "grad_norm": 1.1667763990718285, "learning_rate": 0.00010105259442010497, "loss": 1.0547, "step": 6277 }, { "epoch": 4.157615894039735, "grad_norm": 0.9052862583549124, "learning_rate": 0.00010098950527055362, "loss": 0.6836, "step": 6278 }, { "epoch": 4.158278145695364, "grad_norm": 1.1897168742282023, "learning_rate": 0.00010092642582564212, "loss": 0.9336, "step": 6279 }, { "epoch": 4.158940397350993, "grad_norm": 1.1915588558432273, "learning_rate": 0.00010086335609786086, "loss": 0.957, "step": 6280 }, { "epoch": 4.159602649006622, "grad_norm": 1.2391416344488286, "learning_rate": 0.00010080029609969851, "loss": 1.1094, "step": 6281 }, { "epoch": 4.160264900662252, "grad_norm": 1.2770166828212963, "learning_rate": 0.00010073724584364154, "loss": 1.0312, "step": 6282 }, { "epoch": 4.160927152317881, "grad_norm": 1.158290914590156, "learning_rate": 0.00010067420534217465, "loss": 0.9219, "step": 6283 }, { "epoch": 4.16158940397351, "grad_norm": 1.387910325986089, "learning_rate": 0.00010061117460778064, "loss": 1.1797, "step": 6284 }, { "epoch": 4.162251655629139, "grad_norm": 1.1790062732380073, "learning_rate": 0.00010054815365294032, "loss": 0.9492, "step": 6285 }, { "epoch": 4.162913907284768, "grad_norm": 0.9696999785500117, "learning_rate": 0.0001004851424901324, "loss": 0.6641, "step": 6286 }, { "epoch": 4.163576158940398, "grad_norm": 1.1182208162656357, "learning_rate": 0.00010042214113183405, "loss": 0.9336, "step": 6287 }, { "epoch": 4.164238410596027, "grad_norm": 1.2817489650473708, "learning_rate": 0.00010035914959052013, "loss": 1.1328, "step": 6288 }, { "epoch": 4.164900662251656, "grad_norm": 1.2236976878516763, "learning_rate": 0.00010029616787866365, "loss": 1.0078, "step": 6289 }, { "epoch": 4.1655629139072845, "grad_norm": 0.9817315841573214, "learning_rate": 0.0001002331960087358, "loss": 0.7227, "step": 6290 }, { "epoch": 4.166225165562914, "grad_norm": 1.0963752299713079, "learning_rate": 0.00010017023399320575, "loss": 0.875, "step": 6291 }, { "epoch": 4.166887417218543, "grad_norm": 1.0281943631886468, "learning_rate": 0.00010010728184454058, "loss": 0.8203, "step": 6292 }, { "epoch": 4.167549668874172, "grad_norm": 1.1157241205484807, "learning_rate": 0.00010004433957520561, "loss": 0.8906, "step": 6293 }, { "epoch": 4.168211920529801, "grad_norm": 1.064009821886658, "learning_rate": 9.998140719766417e-05, "loss": 0.8906, "step": 6294 }, { "epoch": 4.16887417218543, "grad_norm": 1.0426246555872825, "learning_rate": 9.991848472437749e-05, "loss": 0.7891, "step": 6295 }, { "epoch": 4.16953642384106, "grad_norm": 1.0609059648050276, "learning_rate": 9.985557216780502e-05, "loss": 0.8047, "step": 6296 }, { "epoch": 4.170198675496689, "grad_norm": 1.2392519524864485, "learning_rate": 9.97926695404042e-05, "loss": 1.0078, "step": 6297 }, { "epoch": 4.170860927152318, "grad_norm": 1.1414525468647627, "learning_rate": 9.972977685463035e-05, "loss": 0.9258, "step": 6298 }, { "epoch": 4.171523178807947, "grad_norm": 0.9348721484046824, "learning_rate": 9.966689412293706e-05, "loss": 0.625, "step": 6299 }, { "epoch": 4.172185430463577, "grad_norm": 1.0877397281266818, "learning_rate": 9.960402135777575e-05, "loss": 0.8086, "step": 6300 }, { "epoch": 4.1728476821192055, "grad_norm": 1.0346749464108425, "learning_rate": 9.954115857159604e-05, "loss": 0.8008, "step": 6301 }, { "epoch": 4.1735099337748345, "grad_norm": 1.0076580589762194, "learning_rate": 9.947830577684536e-05, "loss": 0.8047, "step": 6302 }, { "epoch": 4.174172185430463, "grad_norm": 1.1231548582329725, "learning_rate": 9.941546298596937e-05, "loss": 0.8789, "step": 6303 }, { "epoch": 4.174834437086092, "grad_norm": 0.9683365099245336, "learning_rate": 9.935263021141164e-05, "loss": 0.7227, "step": 6304 }, { "epoch": 4.175496688741722, "grad_norm": 0.9726749254564172, "learning_rate": 9.928980746561376e-05, "loss": 0.668, "step": 6305 }, { "epoch": 4.176158940397351, "grad_norm": 1.2424340084579304, "learning_rate": 9.922699476101541e-05, "loss": 1.0469, "step": 6306 }, { "epoch": 4.17682119205298, "grad_norm": 0.9880361068133409, "learning_rate": 9.916419211005422e-05, "loss": 0.7656, "step": 6307 }, { "epoch": 4.177483443708609, "grad_norm": 0.9433731696873005, "learning_rate": 9.910139952516573e-05, "loss": 0.7031, "step": 6308 }, { "epoch": 4.178145695364238, "grad_norm": 0.9009293169216358, "learning_rate": 9.903861701878369e-05, "loss": 0.7031, "step": 6309 }, { "epoch": 4.178807947019868, "grad_norm": 1.1414129875819874, "learning_rate": 9.897584460333976e-05, "loss": 0.8828, "step": 6310 }, { "epoch": 4.179470198675497, "grad_norm": 1.0062374265018146, "learning_rate": 9.891308229126355e-05, "loss": 0.7109, "step": 6311 }, { "epoch": 4.180132450331126, "grad_norm": 1.1162047658478456, "learning_rate": 9.885033009498278e-05, "loss": 0.8984, "step": 6312 }, { "epoch": 4.180794701986755, "grad_norm": 1.0102415149265176, "learning_rate": 9.878758802692303e-05, "loss": 0.8203, "step": 6313 }, { "epoch": 4.1814569536423845, "grad_norm": 1.3311752437753013, "learning_rate": 9.872485609950796e-05, "loss": 1.1094, "step": 6314 }, { "epoch": 4.182119205298013, "grad_norm": 0.9435009278926006, "learning_rate": 9.866213432515924e-05, "loss": 0.668, "step": 6315 }, { "epoch": 4.182781456953642, "grad_norm": 1.113166157839667, "learning_rate": 9.859942271629655e-05, "loss": 0.9961, "step": 6316 }, { "epoch": 4.183443708609271, "grad_norm": 1.0895652724239613, "learning_rate": 9.853672128533734e-05, "loss": 0.8672, "step": 6317 }, { "epoch": 4.184105960264901, "grad_norm": 1.167106363083232, "learning_rate": 9.847403004469742e-05, "loss": 0.9883, "step": 6318 }, { "epoch": 4.18476821192053, "grad_norm": 1.0912021191359709, "learning_rate": 9.841134900679025e-05, "loss": 0.9062, "step": 6319 }, { "epoch": 4.185430463576159, "grad_norm": 1.049493579459763, "learning_rate": 9.834867818402741e-05, "loss": 0.8398, "step": 6320 }, { "epoch": 4.186092715231788, "grad_norm": 1.3235529551613285, "learning_rate": 9.828601758881846e-05, "loss": 1.0781, "step": 6321 }, { "epoch": 4.186754966887417, "grad_norm": 0.9063972330932647, "learning_rate": 9.822336723357094e-05, "loss": 0.6797, "step": 6322 }, { "epoch": 4.187417218543047, "grad_norm": 1.2421613198266561, "learning_rate": 9.816072713069024e-05, "loss": 1.0312, "step": 6323 }, { "epoch": 4.188079470198676, "grad_norm": 1.204355571800546, "learning_rate": 9.809809729257994e-05, "loss": 0.9727, "step": 6324 }, { "epoch": 4.188741721854305, "grad_norm": 0.9782765307011785, "learning_rate": 9.80354777316414e-05, "loss": 0.7812, "step": 6325 }, { "epoch": 4.1894039735099335, "grad_norm": 0.9739953174615141, "learning_rate": 9.7972868460274e-05, "loss": 0.7578, "step": 6326 }, { "epoch": 4.1900662251655625, "grad_norm": 1.264720229956241, "learning_rate": 9.791026949087512e-05, "loss": 0.9961, "step": 6327 }, { "epoch": 4.190728476821192, "grad_norm": 0.9283017228927886, "learning_rate": 9.784768083584013e-05, "loss": 0.7422, "step": 6328 }, { "epoch": 4.191390728476821, "grad_norm": 1.0598234533667843, "learning_rate": 9.778510250756217e-05, "loss": 0.8828, "step": 6329 }, { "epoch": 4.19205298013245, "grad_norm": 0.9213391711206443, "learning_rate": 9.772253451843256e-05, "loss": 0.6484, "step": 6330 }, { "epoch": 4.192715231788079, "grad_norm": 0.9523749004630827, "learning_rate": 9.765997688084047e-05, "loss": 0.6758, "step": 6331 }, { "epoch": 4.193377483443709, "grad_norm": 1.157662953902676, "learning_rate": 9.759742960717298e-05, "loss": 0.8945, "step": 6332 }, { "epoch": 4.194039735099338, "grad_norm": 1.1331376497619277, "learning_rate": 9.753489270981524e-05, "loss": 0.8867, "step": 6333 }, { "epoch": 4.194701986754967, "grad_norm": 1.1845278704355464, "learning_rate": 9.747236620115026e-05, "loss": 0.9492, "step": 6334 }, { "epoch": 4.195364238410596, "grad_norm": 1.1973833363191007, "learning_rate": 9.740985009355893e-05, "loss": 1.0469, "step": 6335 }, { "epoch": 4.196026490066225, "grad_norm": 1.0864693226545, "learning_rate": 9.734734439942023e-05, "loss": 0.7422, "step": 6336 }, { "epoch": 4.196688741721855, "grad_norm": 1.1687791886864916, "learning_rate": 9.728484913111096e-05, "loss": 0.875, "step": 6337 }, { "epoch": 4.1973509933774835, "grad_norm": 0.9474838355614406, "learning_rate": 9.722236430100592e-05, "loss": 0.7422, "step": 6338 }, { "epoch": 4.1980132450331125, "grad_norm": 1.042709317715414, "learning_rate": 9.715988992147788e-05, "loss": 0.8516, "step": 6339 }, { "epoch": 4.198675496688741, "grad_norm": 1.248408007180437, "learning_rate": 9.70974260048974e-05, "loss": 1.0156, "step": 6340 }, { "epoch": 4.199337748344371, "grad_norm": 1.3485938364841386, "learning_rate": 9.703497256363306e-05, "loss": 1.0625, "step": 6341 }, { "epoch": 4.2, "grad_norm": 1.1740129841952909, "learning_rate": 9.697252961005138e-05, "loss": 0.9844, "step": 6342 }, { "epoch": 4.200662251655629, "grad_norm": 1.1997132316208377, "learning_rate": 9.69100971565168e-05, "loss": 0.9609, "step": 6343 }, { "epoch": 4.201324503311258, "grad_norm": 1.2019450984595004, "learning_rate": 9.684767521539159e-05, "loss": 0.9648, "step": 6344 }, { "epoch": 4.201986754966887, "grad_norm": 1.1931857795889356, "learning_rate": 9.678526379903613e-05, "loss": 0.9805, "step": 6345 }, { "epoch": 4.202649006622517, "grad_norm": 1.1066204311843768, "learning_rate": 9.672286291980848e-05, "loss": 0.9023, "step": 6346 }, { "epoch": 4.203311258278146, "grad_norm": 1.0583767501303423, "learning_rate": 9.666047259006475e-05, "loss": 0.9414, "step": 6347 }, { "epoch": 4.203973509933775, "grad_norm": 1.1246865046668657, "learning_rate": 9.659809282215903e-05, "loss": 0.9102, "step": 6348 }, { "epoch": 4.204635761589404, "grad_norm": 1.2445124791927982, "learning_rate": 9.653572362844315e-05, "loss": 1.0781, "step": 6349 }, { "epoch": 4.2052980132450335, "grad_norm": 1.0025323216943478, "learning_rate": 9.647336502126686e-05, "loss": 0.8906, "step": 6350 }, { "epoch": 4.205960264900662, "grad_norm": 1.1377046472387322, "learning_rate": 9.641101701297806e-05, "loss": 0.9688, "step": 6351 }, { "epoch": 4.206622516556291, "grad_norm": 1.0892224327940327, "learning_rate": 9.634867961592223e-05, "loss": 1.0234, "step": 6352 }, { "epoch": 4.20728476821192, "grad_norm": 0.96781822687656, "learning_rate": 9.62863528424429e-05, "loss": 0.7539, "step": 6353 }, { "epoch": 4.207947019867549, "grad_norm": 1.2210425076185618, "learning_rate": 9.622403670488154e-05, "loss": 0.9727, "step": 6354 }, { "epoch": 4.208609271523179, "grad_norm": 1.0053235016991606, "learning_rate": 9.616173121557748e-05, "loss": 0.7812, "step": 6355 }, { "epoch": 4.209271523178808, "grad_norm": 1.115709294435586, "learning_rate": 9.609943638686783e-05, "loss": 0.9453, "step": 6356 }, { "epoch": 4.209933774834437, "grad_norm": 1.019291596128949, "learning_rate": 9.603715223108768e-05, "loss": 0.8359, "step": 6357 }, { "epoch": 4.210596026490066, "grad_norm": 1.0012472194257234, "learning_rate": 9.597487876057012e-05, "loss": 0.8281, "step": 6358 }, { "epoch": 4.211258278145696, "grad_norm": 1.181660273010959, "learning_rate": 9.591261598764595e-05, "loss": 0.9727, "step": 6359 }, { "epoch": 4.211920529801325, "grad_norm": 0.9677083153229692, "learning_rate": 9.585036392464381e-05, "loss": 0.793, "step": 6360 }, { "epoch": 4.212582781456954, "grad_norm": 1.182116519922221, "learning_rate": 9.578812258389052e-05, "loss": 1.0391, "step": 6361 }, { "epoch": 4.213245033112583, "grad_norm": 1.373992342997325, "learning_rate": 9.572589197771047e-05, "loss": 1.1406, "step": 6362 }, { "epoch": 4.2139072847682115, "grad_norm": 1.135145960929472, "learning_rate": 9.566367211842599e-05, "loss": 0.8867, "step": 6363 }, { "epoch": 4.214569536423841, "grad_norm": 0.9283921631574411, "learning_rate": 9.560146301835739e-05, "loss": 0.7227, "step": 6364 }, { "epoch": 4.21523178807947, "grad_norm": 1.1889076785462431, "learning_rate": 9.553926468982282e-05, "loss": 0.8555, "step": 6365 }, { "epoch": 4.215894039735099, "grad_norm": 0.9736363548200329, "learning_rate": 9.547707714513813e-05, "loss": 0.7578, "step": 6366 }, { "epoch": 4.216556291390728, "grad_norm": 1.0605341844736331, "learning_rate": 9.54149003966173e-05, "loss": 0.8398, "step": 6367 }, { "epoch": 4.217218543046358, "grad_norm": 1.1482125496212838, "learning_rate": 9.5352734456572e-05, "loss": 0.875, "step": 6368 }, { "epoch": 4.217880794701987, "grad_norm": 1.0687781528343903, "learning_rate": 9.529057933731174e-05, "loss": 0.9102, "step": 6369 }, { "epoch": 4.218543046357616, "grad_norm": 1.2079815294237133, "learning_rate": 9.522843505114405e-05, "loss": 0.9805, "step": 6370 }, { "epoch": 4.219205298013245, "grad_norm": 1.1816102529327877, "learning_rate": 9.516630161037416e-05, "loss": 0.8789, "step": 6371 }, { "epoch": 4.219867549668874, "grad_norm": 1.0295758200786869, "learning_rate": 9.510417902730512e-05, "loss": 0.8008, "step": 6372 }, { "epoch": 4.220529801324504, "grad_norm": 1.1124848887901126, "learning_rate": 9.504206731423805e-05, "loss": 0.8086, "step": 6373 }, { "epoch": 4.221192052980133, "grad_norm": 0.9688061778396168, "learning_rate": 9.497996648347172e-05, "loss": 0.8242, "step": 6374 }, { "epoch": 4.2218543046357615, "grad_norm": 0.9340412604967088, "learning_rate": 9.491787654730279e-05, "loss": 0.7148, "step": 6375 }, { "epoch": 4.22251655629139, "grad_norm": 1.0948110720077904, "learning_rate": 9.485579751802583e-05, "loss": 0.9102, "step": 6376 }, { "epoch": 4.22317880794702, "grad_norm": 1.0839621551664844, "learning_rate": 9.479372940793317e-05, "loss": 0.8984, "step": 6377 }, { "epoch": 4.223841059602649, "grad_norm": 1.0051276582715367, "learning_rate": 9.473167222931498e-05, "loss": 0.8203, "step": 6378 }, { "epoch": 4.224503311258278, "grad_norm": 1.3749651843688695, "learning_rate": 9.466962599445932e-05, "loss": 1.0703, "step": 6379 }, { "epoch": 4.225165562913907, "grad_norm": 1.045876160553046, "learning_rate": 9.46075907156521e-05, "loss": 0.793, "step": 6380 }, { "epoch": 4.225827814569536, "grad_norm": 1.0214340321984852, "learning_rate": 9.454556640517688e-05, "loss": 0.8047, "step": 6381 }, { "epoch": 4.226490066225166, "grad_norm": 1.0145596828749481, "learning_rate": 9.448355307531537e-05, "loss": 0.7617, "step": 6382 }, { "epoch": 4.227152317880795, "grad_norm": 1.184006144826606, "learning_rate": 9.442155073834678e-05, "loss": 0.8867, "step": 6383 }, { "epoch": 4.227814569536424, "grad_norm": 1.079674386578136, "learning_rate": 9.435955940654834e-05, "loss": 0.832, "step": 6384 }, { "epoch": 4.228476821192053, "grad_norm": 1.1813038490789038, "learning_rate": 9.429757909219502e-05, "loss": 0.9805, "step": 6385 }, { "epoch": 4.2291390728476825, "grad_norm": 0.8908454648317472, "learning_rate": 9.423560980755971e-05, "loss": 0.7031, "step": 6386 }, { "epoch": 4.2298013245033115, "grad_norm": 1.0673918406426417, "learning_rate": 9.417365156491286e-05, "loss": 0.8359, "step": 6387 }, { "epoch": 4.23046357615894, "grad_norm": 1.0422633164185293, "learning_rate": 9.411170437652314e-05, "loss": 0.8477, "step": 6388 }, { "epoch": 4.231125827814569, "grad_norm": 1.1336143596842143, "learning_rate": 9.404976825465665e-05, "loss": 0.8516, "step": 6389 }, { "epoch": 4.231788079470198, "grad_norm": 1.0407151053510952, "learning_rate": 9.398784321157746e-05, "loss": 0.7617, "step": 6390 }, { "epoch": 4.232450331125828, "grad_norm": 1.0587097752374712, "learning_rate": 9.392592925954752e-05, "loss": 0.7539, "step": 6391 }, { "epoch": 4.233112582781457, "grad_norm": 1.2943396970177838, "learning_rate": 9.386402641082647e-05, "loss": 0.9609, "step": 6392 }, { "epoch": 4.233774834437086, "grad_norm": 1.0051789268195652, "learning_rate": 9.380213467767172e-05, "loss": 0.8086, "step": 6393 }, { "epoch": 4.234437086092715, "grad_norm": 1.0391757209857218, "learning_rate": 9.374025407233859e-05, "loss": 0.7969, "step": 6394 }, { "epoch": 4.235099337748345, "grad_norm": 1.0042265652483715, "learning_rate": 9.367838460708017e-05, "loss": 0.8047, "step": 6395 }, { "epoch": 4.235761589403974, "grad_norm": 1.2312096007987243, "learning_rate": 9.361652629414728e-05, "loss": 0.9688, "step": 6396 }, { "epoch": 4.236423841059603, "grad_norm": 0.984468043810883, "learning_rate": 9.355467914578866e-05, "loss": 0.7539, "step": 6397 }, { "epoch": 4.237086092715232, "grad_norm": 1.1929119621431088, "learning_rate": 9.349284317425065e-05, "loss": 0.8789, "step": 6398 }, { "epoch": 4.237748344370861, "grad_norm": 1.2771271524430903, "learning_rate": 9.343101839177752e-05, "loss": 1.1094, "step": 6399 }, { "epoch": 4.23841059602649, "grad_norm": 1.2359803745335507, "learning_rate": 9.33692048106113e-05, "loss": 0.9648, "step": 6400 }, { "epoch": 4.239072847682119, "grad_norm": 1.0808978899817623, "learning_rate": 9.330740244299178e-05, "loss": 0.8164, "step": 6401 }, { "epoch": 4.239735099337748, "grad_norm": 1.0219057596122603, "learning_rate": 9.324561130115651e-05, "loss": 0.8086, "step": 6402 }, { "epoch": 4.240397350993377, "grad_norm": 1.1998339710722454, "learning_rate": 9.318383139734095e-05, "loss": 0.9492, "step": 6403 }, { "epoch": 4.241059602649006, "grad_norm": 1.189481935349903, "learning_rate": 9.312206274377808e-05, "loss": 0.957, "step": 6404 }, { "epoch": 4.241721854304636, "grad_norm": 1.0501328211492178, "learning_rate": 9.306030535269886e-05, "loss": 0.8086, "step": 6405 }, { "epoch": 4.242384105960265, "grad_norm": 1.2624321155956144, "learning_rate": 9.2998559236332e-05, "loss": 0.9727, "step": 6406 }, { "epoch": 4.243046357615894, "grad_norm": 1.2453895236937362, "learning_rate": 9.293682440690397e-05, "loss": 0.9375, "step": 6407 }, { "epoch": 4.243708609271523, "grad_norm": 1.2223612567697948, "learning_rate": 9.28751008766388e-05, "loss": 0.9062, "step": 6408 }, { "epoch": 4.244370860927153, "grad_norm": 1.2717282055522192, "learning_rate": 9.281338865775865e-05, "loss": 1.0469, "step": 6409 }, { "epoch": 4.245033112582782, "grad_norm": 1.0022742501782054, "learning_rate": 9.275168776248316e-05, "loss": 0.8008, "step": 6410 }, { "epoch": 4.2456953642384105, "grad_norm": 0.9582658288955709, "learning_rate": 9.268999820302978e-05, "loss": 0.7539, "step": 6411 }, { "epoch": 4.2463576158940395, "grad_norm": 1.0669570426076789, "learning_rate": 9.262831999161382e-05, "loss": 0.8828, "step": 6412 }, { "epoch": 4.247019867549669, "grad_norm": 1.1133437265346304, "learning_rate": 9.256665314044828e-05, "loss": 0.9453, "step": 6413 }, { "epoch": 4.247682119205298, "grad_norm": 1.1299656298220075, "learning_rate": 9.250499766174386e-05, "loss": 0.9141, "step": 6414 }, { "epoch": 4.248344370860927, "grad_norm": 1.1116719956950516, "learning_rate": 9.244335356770901e-05, "loss": 0.8672, "step": 6415 }, { "epoch": 4.249006622516556, "grad_norm": 1.292182090610611, "learning_rate": 9.238172087055007e-05, "loss": 1.125, "step": 6416 }, { "epoch": 4.249668874172185, "grad_norm": 1.0895784632044996, "learning_rate": 9.232009958247098e-05, "loss": 0.8125, "step": 6417 }, { "epoch": 4.250331125827815, "grad_norm": 1.0267292317527752, "learning_rate": 9.225848971567335e-05, "loss": 0.7109, "step": 6418 }, { "epoch": 4.250993377483444, "grad_norm": 0.9869951714916474, "learning_rate": 9.219689128235686e-05, "loss": 0.7773, "step": 6419 }, { "epoch": 4.251655629139073, "grad_norm": 1.0953755197497155, "learning_rate": 9.213530429471854e-05, "loss": 0.8672, "step": 6420 }, { "epoch": 4.252317880794702, "grad_norm": 1.1181636857780448, "learning_rate": 9.207372876495334e-05, "loss": 0.8984, "step": 6421 }, { "epoch": 4.252980132450331, "grad_norm": 1.2404276221615507, "learning_rate": 9.201216470525398e-05, "loss": 1.0, "step": 6422 }, { "epoch": 4.2536423841059605, "grad_norm": 1.1511897132471132, "learning_rate": 9.195061212781084e-05, "loss": 0.7969, "step": 6423 }, { "epoch": 4.2543046357615895, "grad_norm": 1.1372988302422178, "learning_rate": 9.188907104481196e-05, "loss": 0.7969, "step": 6424 }, { "epoch": 4.254966887417218, "grad_norm": 1.1535629779356065, "learning_rate": 9.182754146844327e-05, "loss": 0.9766, "step": 6425 }, { "epoch": 4.255629139072847, "grad_norm": 1.184966706426104, "learning_rate": 9.176602341088827e-05, "loss": 0.8359, "step": 6426 }, { "epoch": 4.256291390728477, "grad_norm": 1.2274812342797508, "learning_rate": 9.170451688432827e-05, "loss": 0.9414, "step": 6427 }, { "epoch": 4.256953642384106, "grad_norm": 0.9629787851634235, "learning_rate": 9.164302190094227e-05, "loss": 0.7344, "step": 6428 }, { "epoch": 4.257615894039735, "grad_norm": 1.1161199547174125, "learning_rate": 9.1581538472907e-05, "loss": 0.8672, "step": 6429 }, { "epoch": 4.258278145695364, "grad_norm": 1.2315211510513135, "learning_rate": 9.152006661239682e-05, "loss": 1.0156, "step": 6430 }, { "epoch": 4.258940397350994, "grad_norm": 1.0418513904872728, "learning_rate": 9.145860633158392e-05, "loss": 0.7344, "step": 6431 }, { "epoch": 4.259602649006623, "grad_norm": 1.2916533960519494, "learning_rate": 9.139715764263813e-05, "loss": 1.0703, "step": 6432 }, { "epoch": 4.260264900662252, "grad_norm": 0.9816934513317195, "learning_rate": 9.133572055772695e-05, "loss": 0.8203, "step": 6433 }, { "epoch": 4.260927152317881, "grad_norm": 1.0898418826367926, "learning_rate": 9.127429508901572e-05, "loss": 0.8828, "step": 6434 }, { "epoch": 4.26158940397351, "grad_norm": 1.233387710968643, "learning_rate": 9.12128812486673e-05, "loss": 1.0234, "step": 6435 }, { "epoch": 4.262251655629139, "grad_norm": 1.2284670848756418, "learning_rate": 9.115147904884232e-05, "loss": 1.0312, "step": 6436 }, { "epoch": 4.262913907284768, "grad_norm": 1.1599816403399947, "learning_rate": 9.109008850169923e-05, "loss": 0.9297, "step": 6437 }, { "epoch": 4.263576158940397, "grad_norm": 1.1273815978865702, "learning_rate": 9.102870961939399e-05, "loss": 0.8555, "step": 6438 }, { "epoch": 4.264238410596026, "grad_norm": 1.0727035563369833, "learning_rate": 9.096734241408027e-05, "loss": 0.832, "step": 6439 }, { "epoch": 4.264900662251655, "grad_norm": 1.2769597288504944, "learning_rate": 9.09059868979096e-05, "loss": 1.0391, "step": 6440 }, { "epoch": 4.265562913907285, "grad_norm": 1.0792071819946627, "learning_rate": 9.0844643083031e-05, "loss": 0.7852, "step": 6441 }, { "epoch": 4.266225165562914, "grad_norm": 0.9146475858802257, "learning_rate": 9.078331098159126e-05, "loss": 0.7578, "step": 6442 }, { "epoch": 4.266887417218543, "grad_norm": 1.044225761637071, "learning_rate": 9.072199060573484e-05, "loss": 0.8555, "step": 6443 }, { "epoch": 4.267549668874172, "grad_norm": 1.145641840910904, "learning_rate": 9.066068196760392e-05, "loss": 0.8516, "step": 6444 }, { "epoch": 4.268211920529802, "grad_norm": 1.046860453440571, "learning_rate": 9.059938507933819e-05, "loss": 0.793, "step": 6445 }, { "epoch": 4.268874172185431, "grad_norm": 1.1036731084195965, "learning_rate": 9.053809995307528e-05, "loss": 0.8672, "step": 6446 }, { "epoch": 4.26953642384106, "grad_norm": 1.0325140401122295, "learning_rate": 9.047682660095028e-05, "loss": 0.7969, "step": 6447 }, { "epoch": 4.2701986754966885, "grad_norm": 1.3903200297123874, "learning_rate": 9.0415565035096e-05, "loss": 1.1328, "step": 6448 }, { "epoch": 4.2708609271523175, "grad_norm": 1.1606864109989947, "learning_rate": 9.035431526764299e-05, "loss": 0.8516, "step": 6449 }, { "epoch": 4.271523178807947, "grad_norm": 1.0503341243233026, "learning_rate": 9.029307731071939e-05, "loss": 0.832, "step": 6450 }, { "epoch": 4.272185430463576, "grad_norm": 1.2073721314987047, "learning_rate": 9.023185117645095e-05, "loss": 0.9766, "step": 6451 }, { "epoch": 4.272847682119205, "grad_norm": 1.0775086388924306, "learning_rate": 9.017063687696119e-05, "loss": 0.7344, "step": 6452 }, { "epoch": 4.273509933774834, "grad_norm": 1.2922065922706014, "learning_rate": 9.01094344243713e-05, "loss": 1.0, "step": 6453 }, { "epoch": 4.274172185430464, "grad_norm": 1.0249221324117628, "learning_rate": 9.004824383079995e-05, "loss": 0.7812, "step": 6454 }, { "epoch": 4.274834437086093, "grad_norm": 1.1289590701775016, "learning_rate": 8.998706510836369e-05, "loss": 0.8633, "step": 6455 }, { "epoch": 4.275496688741722, "grad_norm": 1.1817607805243004, "learning_rate": 8.99258982691766e-05, "loss": 0.875, "step": 6456 }, { "epoch": 4.276158940397351, "grad_norm": 1.1859075635059466, "learning_rate": 8.986474332535032e-05, "loss": 0.918, "step": 6457 }, { "epoch": 4.27682119205298, "grad_norm": 1.2898863731799532, "learning_rate": 8.980360028899431e-05, "loss": 1.125, "step": 6458 }, { "epoch": 4.27748344370861, "grad_norm": 1.2921682210258052, "learning_rate": 8.974246917221557e-05, "loss": 1.0469, "step": 6459 }, { "epoch": 4.2781456953642385, "grad_norm": 1.0811499774537532, "learning_rate": 8.968134998711875e-05, "loss": 0.8281, "step": 6460 }, { "epoch": 4.278807947019867, "grad_norm": 1.1002991631122645, "learning_rate": 8.96202427458062e-05, "loss": 0.9062, "step": 6461 }, { "epoch": 4.279470198675496, "grad_norm": 1.0778643230130702, "learning_rate": 8.955914746037783e-05, "loss": 0.8516, "step": 6462 }, { "epoch": 4.280132450331126, "grad_norm": 1.0666939955318964, "learning_rate": 8.949806414293111e-05, "loss": 0.8633, "step": 6463 }, { "epoch": 4.280794701986755, "grad_norm": 1.2875522295064752, "learning_rate": 8.94369928055614e-05, "loss": 1.0391, "step": 6464 }, { "epoch": 4.281456953642384, "grad_norm": 1.2324830155065163, "learning_rate": 8.937593346036145e-05, "loss": 0.9688, "step": 6465 }, { "epoch": 4.282119205298013, "grad_norm": 1.209314565621922, "learning_rate": 8.931488611942162e-05, "loss": 0.9453, "step": 6466 }, { "epoch": 4.282781456953642, "grad_norm": 1.20535288296118, "learning_rate": 8.925385079483016e-05, "loss": 0.8906, "step": 6467 }, { "epoch": 4.283443708609272, "grad_norm": 1.247691862091088, "learning_rate": 8.919282749867266e-05, "loss": 1.0156, "step": 6468 }, { "epoch": 4.284105960264901, "grad_norm": 0.9585489735079032, "learning_rate": 8.91318162430324e-05, "loss": 0.7695, "step": 6469 }, { "epoch": 4.28476821192053, "grad_norm": 1.0582705729775692, "learning_rate": 8.907081703999038e-05, "loss": 0.9141, "step": 6470 }, { "epoch": 4.285430463576159, "grad_norm": 1.1233493330715014, "learning_rate": 8.900982990162516e-05, "loss": 0.8867, "step": 6471 }, { "epoch": 4.2860927152317885, "grad_norm": 1.0405564493758683, "learning_rate": 8.894885484001281e-05, "loss": 0.9102, "step": 6472 }, { "epoch": 4.286754966887417, "grad_norm": 1.0939263309416731, "learning_rate": 8.888789186722712e-05, "loss": 0.7891, "step": 6473 }, { "epoch": 4.287417218543046, "grad_norm": 1.03146645769966, "learning_rate": 8.882694099533948e-05, "loss": 0.8672, "step": 6474 }, { "epoch": 4.288079470198675, "grad_norm": 1.3800269250458674, "learning_rate": 8.876600223641886e-05, "loss": 1.1641, "step": 6475 }, { "epoch": 4.288741721854304, "grad_norm": 1.1548462236047603, "learning_rate": 8.870507560253176e-05, "loss": 0.9492, "step": 6476 }, { "epoch": 4.289403973509934, "grad_norm": 1.098766439727207, "learning_rate": 8.864416110574249e-05, "loss": 0.8789, "step": 6477 }, { "epoch": 4.290066225165563, "grad_norm": 1.138459459204018, "learning_rate": 8.858325875811273e-05, "loss": 0.9688, "step": 6478 }, { "epoch": 4.290728476821192, "grad_norm": 1.213706060706707, "learning_rate": 8.852236857170181e-05, "loss": 0.9883, "step": 6479 }, { "epoch": 4.291390728476821, "grad_norm": 1.0271159636446885, "learning_rate": 8.846149055856678e-05, "loss": 0.7305, "step": 6480 }, { "epoch": 4.292052980132451, "grad_norm": 1.2930202569586278, "learning_rate": 8.840062473076214e-05, "loss": 0.9258, "step": 6481 }, { "epoch": 4.29271523178808, "grad_norm": 1.1778441782102695, "learning_rate": 8.833977110033995e-05, "loss": 0.8867, "step": 6482 }, { "epoch": 4.293377483443709, "grad_norm": 1.1210598243631176, "learning_rate": 8.827892967935008e-05, "loss": 0.8984, "step": 6483 }, { "epoch": 4.294039735099338, "grad_norm": 1.073120318874248, "learning_rate": 8.82181004798397e-05, "loss": 0.832, "step": 6484 }, { "epoch": 4.2947019867549665, "grad_norm": 1.0846231265241009, "learning_rate": 8.815728351385371e-05, "loss": 0.7383, "step": 6485 }, { "epoch": 4.295364238410596, "grad_norm": 1.1419120784735433, "learning_rate": 8.809647879343462e-05, "loss": 0.9297, "step": 6486 }, { "epoch": 4.296026490066225, "grad_norm": 1.275142629972156, "learning_rate": 8.803568633062244e-05, "loss": 0.9688, "step": 6487 }, { "epoch": 4.296688741721854, "grad_norm": 1.2662426107608238, "learning_rate": 8.797490613745472e-05, "loss": 1.0078, "step": 6488 }, { "epoch": 4.297350993377483, "grad_norm": 1.2950733483329975, "learning_rate": 8.79141382259667e-05, "loss": 1.0391, "step": 6489 }, { "epoch": 4.298013245033113, "grad_norm": 1.2061348918891535, "learning_rate": 8.785338260819113e-05, "loss": 0.9922, "step": 6490 }, { "epoch": 4.298675496688742, "grad_norm": 1.1532494720991027, "learning_rate": 8.779263929615824e-05, "loss": 0.8594, "step": 6491 }, { "epoch": 4.299337748344371, "grad_norm": 1.1363772298905315, "learning_rate": 8.773190830189601e-05, "loss": 0.8633, "step": 6492 }, { "epoch": 4.3, "grad_norm": 1.1327265795281738, "learning_rate": 8.767118963742979e-05, "loss": 0.9023, "step": 6493 }, { "epoch": 4.300662251655629, "grad_norm": 1.3511613611938742, "learning_rate": 8.761048331478256e-05, "loss": 1.1094, "step": 6494 }, { "epoch": 4.301324503311259, "grad_norm": 1.0499832504684283, "learning_rate": 8.754978934597499e-05, "loss": 0.8594, "step": 6495 }, { "epoch": 4.3019867549668875, "grad_norm": 1.0500849469597457, "learning_rate": 8.748910774302501e-05, "loss": 0.8047, "step": 6496 }, { "epoch": 4.3026490066225165, "grad_norm": 1.07718259033963, "learning_rate": 8.74284385179484e-05, "loss": 0.8242, "step": 6497 }, { "epoch": 4.303311258278145, "grad_norm": 1.1184508549964862, "learning_rate": 8.736778168275835e-05, "loss": 0.8984, "step": 6498 }, { "epoch": 4.303973509933774, "grad_norm": 1.0543322122960102, "learning_rate": 8.730713724946557e-05, "loss": 0.8359, "step": 6499 }, { "epoch": 4.304635761589404, "grad_norm": 0.9773925031059764, "learning_rate": 8.724650523007828e-05, "loss": 0.7617, "step": 6500 }, { "epoch": 4.305298013245033, "grad_norm": 1.116408911714799, "learning_rate": 8.71858856366025e-05, "loss": 0.9805, "step": 6501 }, { "epoch": 4.305960264900662, "grad_norm": 1.1578753148364553, "learning_rate": 8.712527848104152e-05, "loss": 0.8789, "step": 6502 }, { "epoch": 4.306622516556291, "grad_norm": 1.404442418044603, "learning_rate": 8.706468377539616e-05, "loss": 1.2109, "step": 6503 }, { "epoch": 4.307284768211921, "grad_norm": 1.2105688608967666, "learning_rate": 8.700410153166493e-05, "loss": 0.9961, "step": 6504 }, { "epoch": 4.30794701986755, "grad_norm": 1.0742676311385257, "learning_rate": 8.694353176184392e-05, "loss": 0.8008, "step": 6505 }, { "epoch": 4.308609271523179, "grad_norm": 1.308510772030092, "learning_rate": 8.688297447792645e-05, "loss": 0.9688, "step": 6506 }, { "epoch": 4.309271523178808, "grad_norm": 1.2319254428356263, "learning_rate": 8.682242969190371e-05, "loss": 0.9258, "step": 6507 }, { "epoch": 4.3099337748344375, "grad_norm": 1.048518455518634, "learning_rate": 8.676189741576414e-05, "loss": 0.8984, "step": 6508 }, { "epoch": 4.3105960264900665, "grad_norm": 0.9786386280947232, "learning_rate": 8.670137766149388e-05, "loss": 0.7617, "step": 6509 }, { "epoch": 4.311258278145695, "grad_norm": 0.9867022842855568, "learning_rate": 8.664087044107662e-05, "loss": 0.8047, "step": 6510 }, { "epoch": 4.311920529801324, "grad_norm": 1.204806630029523, "learning_rate": 8.658037576649338e-05, "loss": 1.0, "step": 6511 }, { "epoch": 4.312582781456953, "grad_norm": 1.2314258858619214, "learning_rate": 8.651989364972277e-05, "loss": 1.0859, "step": 6512 }, { "epoch": 4.313245033112583, "grad_norm": 0.931008347347989, "learning_rate": 8.645942410274098e-05, "loss": 0.6562, "step": 6513 }, { "epoch": 4.313907284768212, "grad_norm": 1.105687984199342, "learning_rate": 8.639896713752176e-05, "loss": 0.793, "step": 6514 }, { "epoch": 4.314569536423841, "grad_norm": 1.0126082460091115, "learning_rate": 8.633852276603616e-05, "loss": 0.7734, "step": 6515 }, { "epoch": 4.31523178807947, "grad_norm": 1.0765367173194051, "learning_rate": 8.627809100025287e-05, "loss": 0.8633, "step": 6516 }, { "epoch": 4.315894039735099, "grad_norm": 1.0457040590024647, "learning_rate": 8.621767185213819e-05, "loss": 0.7852, "step": 6517 }, { "epoch": 4.316556291390729, "grad_norm": 1.2584060044538594, "learning_rate": 8.615726533365568e-05, "loss": 1.0234, "step": 6518 }, { "epoch": 4.317218543046358, "grad_norm": 1.1690578053749487, "learning_rate": 8.60968714567666e-05, "loss": 0.9844, "step": 6519 }, { "epoch": 4.317880794701987, "grad_norm": 1.0993124487006185, "learning_rate": 8.603649023342955e-05, "loss": 0.9102, "step": 6520 }, { "epoch": 4.3185430463576155, "grad_norm": 1.114645141275922, "learning_rate": 8.597612167560075e-05, "loss": 0.8164, "step": 6521 }, { "epoch": 4.319205298013245, "grad_norm": 1.1224813662990656, "learning_rate": 8.591576579523393e-05, "loss": 0.875, "step": 6522 }, { "epoch": 4.319867549668874, "grad_norm": 1.0577479989547875, "learning_rate": 8.585542260428018e-05, "loss": 0.7344, "step": 6523 }, { "epoch": 4.320529801324503, "grad_norm": 1.1457339265308462, "learning_rate": 8.57950921146881e-05, "loss": 0.8906, "step": 6524 }, { "epoch": 4.321192052980132, "grad_norm": 1.070397577534459, "learning_rate": 8.573477433840385e-05, "loss": 0.8438, "step": 6525 }, { "epoch": 4.321854304635762, "grad_norm": 1.2811075748783753, "learning_rate": 8.567446928737114e-05, "loss": 0.9414, "step": 6526 }, { "epoch": 4.322516556291391, "grad_norm": 1.1951335390804283, "learning_rate": 8.561417697353095e-05, "loss": 1.0, "step": 6527 }, { "epoch": 4.32317880794702, "grad_norm": 1.2127491848071905, "learning_rate": 8.555389740882184e-05, "loss": 0.957, "step": 6528 }, { "epoch": 4.323841059602649, "grad_norm": 1.0165547434444686, "learning_rate": 8.549363060517988e-05, "loss": 0.7617, "step": 6529 }, { "epoch": 4.324503311258278, "grad_norm": 1.0386171334140835, "learning_rate": 8.543337657453868e-05, "loss": 0.7734, "step": 6530 }, { "epoch": 4.325165562913908, "grad_norm": 1.3543796781944457, "learning_rate": 8.537313532882907e-05, "loss": 1.0, "step": 6531 }, { "epoch": 4.325827814569537, "grad_norm": 1.1205183554875442, "learning_rate": 8.531290687997965e-05, "loss": 0.8945, "step": 6532 }, { "epoch": 4.3264900662251655, "grad_norm": 1.314765940935484, "learning_rate": 8.525269123991623e-05, "loss": 1.0234, "step": 6533 }, { "epoch": 4.3271523178807945, "grad_norm": 1.0472262039832598, "learning_rate": 8.519248842056225e-05, "loss": 0.7734, "step": 6534 }, { "epoch": 4.327814569536423, "grad_norm": 1.0417705748543051, "learning_rate": 8.513229843383859e-05, "loss": 0.8203, "step": 6535 }, { "epoch": 4.328476821192053, "grad_norm": 1.0109130075484016, "learning_rate": 8.507212129166358e-05, "loss": 0.75, "step": 6536 }, { "epoch": 4.329139072847682, "grad_norm": 1.1641529932921637, "learning_rate": 8.501195700595275e-05, "loss": 0.875, "step": 6537 }, { "epoch": 4.329801324503311, "grad_norm": 1.1050832243499442, "learning_rate": 8.495180558861965e-05, "loss": 0.8164, "step": 6538 }, { "epoch": 4.33046357615894, "grad_norm": 1.118964944831242, "learning_rate": 8.489166705157478e-05, "loss": 0.875, "step": 6539 }, { "epoch": 4.33112582781457, "grad_norm": 1.0871735755745358, "learning_rate": 8.483154140672623e-05, "loss": 0.8281, "step": 6540 }, { "epoch": 4.331788079470199, "grad_norm": 1.0563780685586572, "learning_rate": 8.47714286659796e-05, "loss": 0.8398, "step": 6541 }, { "epoch": 4.332450331125828, "grad_norm": 1.1522550878216666, "learning_rate": 8.471132884123801e-05, "loss": 0.9102, "step": 6542 }, { "epoch": 4.333112582781457, "grad_norm": 1.1986314259350248, "learning_rate": 8.465124194440171e-05, "loss": 0.9102, "step": 6543 }, { "epoch": 4.3337748344370866, "grad_norm": 1.1556693760458472, "learning_rate": 8.45911679873688e-05, "loss": 0.8594, "step": 6544 }, { "epoch": 4.3344370860927155, "grad_norm": 1.238035709832722, "learning_rate": 8.453110698203443e-05, "loss": 0.8984, "step": 6545 }, { "epoch": 4.335099337748344, "grad_norm": 1.171110689410266, "learning_rate": 8.447105894029147e-05, "loss": 0.875, "step": 6546 }, { "epoch": 4.335761589403973, "grad_norm": 1.0659584483463174, "learning_rate": 8.441102387403014e-05, "loss": 0.8672, "step": 6547 }, { "epoch": 4.336423841059602, "grad_norm": 1.2909584772547398, "learning_rate": 8.435100179513807e-05, "loss": 1.0078, "step": 6548 }, { "epoch": 4.337086092715232, "grad_norm": 1.1195846118465116, "learning_rate": 8.429099271550022e-05, "loss": 0.7383, "step": 6549 }, { "epoch": 4.337748344370861, "grad_norm": 1.1132429841498623, "learning_rate": 8.423099664699914e-05, "loss": 0.8047, "step": 6550 }, { "epoch": 4.33841059602649, "grad_norm": 1.3415162180677649, "learning_rate": 8.417101360151479e-05, "loss": 1.0547, "step": 6551 }, { "epoch": 4.339072847682119, "grad_norm": 1.0785021012284788, "learning_rate": 8.41110435909244e-05, "loss": 0.8789, "step": 6552 }, { "epoch": 4.339735099337748, "grad_norm": 1.2478490187056797, "learning_rate": 8.405108662710283e-05, "loss": 0.9492, "step": 6553 }, { "epoch": 4.340397350993378, "grad_norm": 1.1605860610427157, "learning_rate": 8.399114272192212e-05, "loss": 0.9219, "step": 6554 }, { "epoch": 4.341059602649007, "grad_norm": 1.073646000099421, "learning_rate": 8.393121188725193e-05, "loss": 0.8164, "step": 6555 }, { "epoch": 4.341721854304636, "grad_norm": 1.1154438850111101, "learning_rate": 8.387129413495928e-05, "loss": 0.8125, "step": 6556 }, { "epoch": 4.342384105960265, "grad_norm": 1.102894165221164, "learning_rate": 8.381138947690849e-05, "loss": 0.8242, "step": 6557 }, { "epoch": 4.343046357615894, "grad_norm": 1.4744681389366043, "learning_rate": 8.375149792496141e-05, "loss": 1.2734, "step": 6558 }, { "epoch": 4.343708609271523, "grad_norm": 1.0504640473517697, "learning_rate": 8.369161949097732e-05, "loss": 0.7578, "step": 6559 }, { "epoch": 4.344370860927152, "grad_norm": 1.2884883224488397, "learning_rate": 8.363175418681275e-05, "loss": 1.0469, "step": 6560 }, { "epoch": 4.345033112582781, "grad_norm": 1.214502760014441, "learning_rate": 8.35719020243217e-05, "loss": 0.9062, "step": 6561 }, { "epoch": 4.34569536423841, "grad_norm": 1.1592231811682447, "learning_rate": 8.351206301535561e-05, "loss": 0.9688, "step": 6562 }, { "epoch": 4.34635761589404, "grad_norm": 1.104092873247346, "learning_rate": 8.345223717176338e-05, "loss": 0.8828, "step": 6563 }, { "epoch": 4.347019867549669, "grad_norm": 1.0051248145490712, "learning_rate": 8.339242450539105e-05, "loss": 0.7852, "step": 6564 }, { "epoch": 4.347682119205298, "grad_norm": 1.192770158583818, "learning_rate": 8.333262502808237e-05, "loss": 0.9688, "step": 6565 }, { "epoch": 4.348344370860927, "grad_norm": 1.071704901045227, "learning_rate": 8.32728387516782e-05, "loss": 0.793, "step": 6566 }, { "epoch": 4.349006622516557, "grad_norm": 0.9664527996154553, "learning_rate": 8.321306568801695e-05, "loss": 0.7695, "step": 6567 }, { "epoch": 4.349668874172186, "grad_norm": 0.9561335542099324, "learning_rate": 8.315330584893444e-05, "loss": 0.7188, "step": 6568 }, { "epoch": 4.350331125827815, "grad_norm": 1.0957160678359565, "learning_rate": 8.309355924626376e-05, "loss": 0.8008, "step": 6569 }, { "epoch": 4.3509933774834435, "grad_norm": 1.1025295533706028, "learning_rate": 8.303382589183524e-05, "loss": 0.8398, "step": 6570 }, { "epoch": 4.3516556291390724, "grad_norm": 1.1950828869136805, "learning_rate": 8.29741057974771e-05, "loss": 0.9102, "step": 6571 }, { "epoch": 4.352317880794702, "grad_norm": 1.1006090408976663, "learning_rate": 8.291439897501444e-05, "loss": 0.7656, "step": 6572 }, { "epoch": 4.352980132450331, "grad_norm": 1.2121962503462504, "learning_rate": 8.285470543626981e-05, "loss": 0.8789, "step": 6573 }, { "epoch": 4.35364238410596, "grad_norm": 1.3151973402304271, "learning_rate": 8.279502519306332e-05, "loss": 1.0, "step": 6574 }, { "epoch": 4.354304635761589, "grad_norm": 1.3618572858536624, "learning_rate": 8.273535825721238e-05, "loss": 1.1016, "step": 6575 }, { "epoch": 4.354966887417219, "grad_norm": 1.300058585797083, "learning_rate": 8.267570464053158e-05, "loss": 0.9805, "step": 6576 }, { "epoch": 4.355629139072848, "grad_norm": 1.184031077250656, "learning_rate": 8.261606435483319e-05, "loss": 0.8633, "step": 6577 }, { "epoch": 4.356291390728477, "grad_norm": 1.1300417310460407, "learning_rate": 8.255643741192654e-05, "loss": 0.832, "step": 6578 }, { "epoch": 4.356953642384106, "grad_norm": 1.114157950226741, "learning_rate": 8.24968238236185e-05, "loss": 0.8164, "step": 6579 }, { "epoch": 4.357615894039735, "grad_norm": 1.0445024191681573, "learning_rate": 8.243722360171329e-05, "loss": 0.8398, "step": 6580 }, { "epoch": 4.3582781456953645, "grad_norm": 1.1685642525673738, "learning_rate": 8.237763675801238e-05, "loss": 0.9453, "step": 6581 }, { "epoch": 4.3589403973509935, "grad_norm": 1.25653984426189, "learning_rate": 8.23180633043146e-05, "loss": 0.9219, "step": 6582 }, { "epoch": 4.359602649006622, "grad_norm": 1.0367693161281861, "learning_rate": 8.225850325241626e-05, "loss": 0.7812, "step": 6583 }, { "epoch": 4.360264900662251, "grad_norm": 1.0947242453991388, "learning_rate": 8.219895661411092e-05, "loss": 0.8047, "step": 6584 }, { "epoch": 4.360927152317881, "grad_norm": 1.2326569821429494, "learning_rate": 8.213942340118951e-05, "loss": 0.9375, "step": 6585 }, { "epoch": 4.36158940397351, "grad_norm": 1.1785066419064292, "learning_rate": 8.207990362544022e-05, "loss": 0.8398, "step": 6586 }, { "epoch": 4.362251655629139, "grad_norm": 1.1873254867853171, "learning_rate": 8.202039729864867e-05, "loss": 0.9219, "step": 6587 }, { "epoch": 4.362913907284768, "grad_norm": 0.9762211203946645, "learning_rate": 8.196090443259787e-05, "loss": 0.6875, "step": 6588 }, { "epoch": 4.363576158940397, "grad_norm": 0.9711837611368349, "learning_rate": 8.190142503906798e-05, "loss": 0.7461, "step": 6589 }, { "epoch": 4.364238410596027, "grad_norm": 1.226985246997386, "learning_rate": 8.184195912983673e-05, "loss": 0.9141, "step": 6590 }, { "epoch": 4.364900662251656, "grad_norm": 1.0652454489846093, "learning_rate": 8.178250671667891e-05, "loss": 0.7734, "step": 6591 }, { "epoch": 4.365562913907285, "grad_norm": 1.1953601354619727, "learning_rate": 8.172306781136685e-05, "loss": 0.9414, "step": 6592 }, { "epoch": 4.366225165562914, "grad_norm": 1.2674960195595504, "learning_rate": 8.166364242567018e-05, "loss": 0.957, "step": 6593 }, { "epoch": 4.3668874172185435, "grad_norm": 1.1190671121053586, "learning_rate": 8.160423057135572e-05, "loss": 0.8945, "step": 6594 }, { "epoch": 4.367549668874172, "grad_norm": 1.193792833073534, "learning_rate": 8.154483226018773e-05, "loss": 0.9102, "step": 6595 }, { "epoch": 4.368211920529801, "grad_norm": 1.235329512097102, "learning_rate": 8.148544750392783e-05, "loss": 0.9375, "step": 6596 }, { "epoch": 4.36887417218543, "grad_norm": 1.1442611056660759, "learning_rate": 8.14260763143348e-05, "loss": 0.8867, "step": 6597 }, { "epoch": 4.369536423841059, "grad_norm": 1.0793076426892187, "learning_rate": 8.136671870316478e-05, "loss": 0.7891, "step": 6598 }, { "epoch": 4.370198675496689, "grad_norm": 1.124819227265678, "learning_rate": 8.130737468217133e-05, "loss": 0.8945, "step": 6599 }, { "epoch": 4.370860927152318, "grad_norm": 1.1622233077799642, "learning_rate": 8.124804426310527e-05, "loss": 0.9648, "step": 6600 }, { "epoch": 4.371523178807947, "grad_norm": 0.9545244146821957, "learning_rate": 8.118872745771461e-05, "loss": 0.8359, "step": 6601 }, { "epoch": 4.372185430463576, "grad_norm": 1.1032754763789947, "learning_rate": 8.112942427774488e-05, "loss": 0.8125, "step": 6602 }, { "epoch": 4.372847682119206, "grad_norm": 1.2388393400328686, "learning_rate": 8.107013473493863e-05, "loss": 0.957, "step": 6603 }, { "epoch": 4.373509933774835, "grad_norm": 1.183275925049816, "learning_rate": 8.101085884103598e-05, "loss": 0.8789, "step": 6604 }, { "epoch": 4.374172185430464, "grad_norm": 1.1528850570288558, "learning_rate": 8.095159660777426e-05, "loss": 0.9336, "step": 6605 }, { "epoch": 4.3748344370860925, "grad_norm": 1.090090580160501, "learning_rate": 8.089234804688803e-05, "loss": 0.8242, "step": 6606 }, { "epoch": 4.3754966887417215, "grad_norm": 1.12574167469852, "learning_rate": 8.083311317010914e-05, "loss": 0.8477, "step": 6607 }, { "epoch": 4.376158940397351, "grad_norm": 1.1478687611613405, "learning_rate": 8.077389198916681e-05, "loss": 0.8789, "step": 6608 }, { "epoch": 4.37682119205298, "grad_norm": 1.0679615791990253, "learning_rate": 8.071468451578754e-05, "loss": 0.7969, "step": 6609 }, { "epoch": 4.377483443708609, "grad_norm": 1.0942488433565987, "learning_rate": 8.065549076169506e-05, "loss": 0.7695, "step": 6610 }, { "epoch": 4.378145695364238, "grad_norm": 0.9454352174887917, "learning_rate": 8.059631073861038e-05, "loss": 0.6641, "step": 6611 }, { "epoch": 4.378807947019867, "grad_norm": 1.074625700476439, "learning_rate": 8.053714445825192e-05, "loss": 0.8125, "step": 6612 }, { "epoch": 4.379470198675497, "grad_norm": 1.0294798648306254, "learning_rate": 8.047799193233518e-05, "loss": 0.7383, "step": 6613 }, { "epoch": 4.380132450331126, "grad_norm": 1.1746390864502145, "learning_rate": 8.041885317257313e-05, "loss": 0.9258, "step": 6614 }, { "epoch": 4.380794701986755, "grad_norm": 1.0888892849645295, "learning_rate": 8.035972819067582e-05, "loss": 0.8555, "step": 6615 }, { "epoch": 4.381456953642384, "grad_norm": 1.2222679589653076, "learning_rate": 8.030061699835071e-05, "loss": 0.9062, "step": 6616 }, { "epoch": 4.382119205298014, "grad_norm": 1.2511019060538426, "learning_rate": 8.024151960730261e-05, "loss": 0.9727, "step": 6617 }, { "epoch": 4.3827814569536425, "grad_norm": 1.0117680564891882, "learning_rate": 8.018243602923335e-05, "loss": 0.7617, "step": 6618 }, { "epoch": 4.3834437086092715, "grad_norm": 1.312960781814946, "learning_rate": 8.012336627584214e-05, "loss": 0.9805, "step": 6619 }, { "epoch": 4.3841059602649, "grad_norm": 1.338241079961762, "learning_rate": 8.006431035882554e-05, "loss": 1.0156, "step": 6620 }, { "epoch": 4.38476821192053, "grad_norm": 1.0272388734369555, "learning_rate": 8.000526828987728e-05, "loss": 0.7344, "step": 6621 }, { "epoch": 4.385430463576159, "grad_norm": 1.0868562584749215, "learning_rate": 7.994624008068834e-05, "loss": 0.8086, "step": 6622 }, { "epoch": 4.386092715231788, "grad_norm": 1.0940004621320567, "learning_rate": 7.988722574294707e-05, "loss": 0.7344, "step": 6623 }, { "epoch": 4.386754966887417, "grad_norm": 1.3052027824056271, "learning_rate": 7.982822528833884e-05, "loss": 1.0078, "step": 6624 }, { "epoch": 4.387417218543046, "grad_norm": 1.0992004292268522, "learning_rate": 7.976923872854651e-05, "loss": 0.875, "step": 6625 }, { "epoch": 4.388079470198676, "grad_norm": 1.1505698007601282, "learning_rate": 7.971026607525014e-05, "loss": 0.8516, "step": 6626 }, { "epoch": 4.388741721854305, "grad_norm": 1.313791013621543, "learning_rate": 7.965130734012694e-05, "loss": 0.9922, "step": 6627 }, { "epoch": 4.389403973509934, "grad_norm": 1.3352336041295405, "learning_rate": 7.959236253485127e-05, "loss": 1.0312, "step": 6628 }, { "epoch": 4.390066225165563, "grad_norm": 1.2759967159200156, "learning_rate": 7.953343167109514e-05, "loss": 0.9805, "step": 6629 }, { "epoch": 4.390728476821192, "grad_norm": 1.0865507842986137, "learning_rate": 7.94745147605274e-05, "loss": 0.7344, "step": 6630 }, { "epoch": 4.391390728476821, "grad_norm": 1.0999034418079447, "learning_rate": 7.941561181481425e-05, "loss": 0.8125, "step": 6631 }, { "epoch": 4.39205298013245, "grad_norm": 1.1413390732396653, "learning_rate": 7.935672284561917e-05, "loss": 0.8438, "step": 6632 }, { "epoch": 4.392715231788079, "grad_norm": 1.068223005112426, "learning_rate": 7.929784786460293e-05, "loss": 0.793, "step": 6633 }, { "epoch": 4.393377483443708, "grad_norm": 1.2246871465100333, "learning_rate": 7.923898688342333e-05, "loss": 0.9688, "step": 6634 }, { "epoch": 4.394039735099338, "grad_norm": 1.052563400200038, "learning_rate": 7.918013991373562e-05, "loss": 0.7734, "step": 6635 }, { "epoch": 4.394701986754967, "grad_norm": 1.1681981956150516, "learning_rate": 7.912130696719208e-05, "loss": 0.8398, "step": 6636 }, { "epoch": 4.395364238410596, "grad_norm": 1.0091363390676518, "learning_rate": 7.906248805544236e-05, "loss": 0.707, "step": 6637 }, { "epoch": 4.396026490066225, "grad_norm": 1.0485955575230692, "learning_rate": 7.900368319013333e-05, "loss": 0.8359, "step": 6638 }, { "epoch": 4.396688741721855, "grad_norm": 1.1305418995374719, "learning_rate": 7.894489238290899e-05, "loss": 0.7969, "step": 6639 }, { "epoch": 4.397350993377484, "grad_norm": 1.2176688685773693, "learning_rate": 7.88861156454105e-05, "loss": 0.9766, "step": 6640 }, { "epoch": 4.398013245033113, "grad_norm": 1.0787966029316884, "learning_rate": 7.882735298927642e-05, "loss": 0.8086, "step": 6641 }, { "epoch": 4.398675496688742, "grad_norm": 1.0972246243590105, "learning_rate": 7.876860442614247e-05, "loss": 0.75, "step": 6642 }, { "epoch": 4.3993377483443705, "grad_norm": 1.138106355840279, "learning_rate": 7.870986996764149e-05, "loss": 0.8047, "step": 6643 }, { "epoch": 4.4, "grad_norm": 1.2784869159372187, "learning_rate": 7.86511496254035e-05, "loss": 1.0078, "step": 6644 }, { "epoch": 4.400662251655629, "grad_norm": 1.1484106856217897, "learning_rate": 7.859244341105589e-05, "loss": 0.7969, "step": 6645 }, { "epoch": 4.401324503311258, "grad_norm": 1.2472132481840061, "learning_rate": 7.853375133622319e-05, "loss": 0.8477, "step": 6646 }, { "epoch": 4.401986754966887, "grad_norm": 1.1231491118048726, "learning_rate": 7.8475073412527e-05, "loss": 0.793, "step": 6647 }, { "epoch": 4.402649006622516, "grad_norm": 1.3065627736935417, "learning_rate": 7.841640965158634e-05, "loss": 0.9648, "step": 6648 }, { "epoch": 4.403311258278146, "grad_norm": 1.2019108809751762, "learning_rate": 7.83577600650172e-05, "loss": 0.8359, "step": 6649 }, { "epoch": 4.403973509933775, "grad_norm": 1.2282193266351717, "learning_rate": 7.829912466443291e-05, "loss": 0.8516, "step": 6650 }, { "epoch": 4.404635761589404, "grad_norm": 1.412753338831306, "learning_rate": 7.824050346144404e-05, "loss": 1.1719, "step": 6651 }, { "epoch": 4.405298013245033, "grad_norm": 1.1599696847232894, "learning_rate": 7.818189646765808e-05, "loss": 0.9492, "step": 6652 }, { "epoch": 4.405960264900663, "grad_norm": 1.187468974939121, "learning_rate": 7.812330369468002e-05, "loss": 0.9297, "step": 6653 }, { "epoch": 4.406622516556292, "grad_norm": 1.3430597505103727, "learning_rate": 7.80647251541119e-05, "loss": 1.0234, "step": 6654 }, { "epoch": 4.4072847682119205, "grad_norm": 1.0254189971110994, "learning_rate": 7.800616085755294e-05, "loss": 0.7695, "step": 6655 }, { "epoch": 4.407947019867549, "grad_norm": 1.0755704577343002, "learning_rate": 7.794761081659945e-05, "loss": 0.8633, "step": 6656 }, { "epoch": 4.408609271523179, "grad_norm": 1.206075096432033, "learning_rate": 7.788907504284505e-05, "loss": 0.9297, "step": 6657 }, { "epoch": 4.409271523178808, "grad_norm": 1.052875857059609, "learning_rate": 7.78305535478806e-05, "loss": 0.8242, "step": 6658 }, { "epoch": 4.409933774834437, "grad_norm": 1.0022569107733932, "learning_rate": 7.777204634329387e-05, "loss": 0.7422, "step": 6659 }, { "epoch": 4.410596026490066, "grad_norm": 1.0823396140997483, "learning_rate": 7.771355344067009e-05, "loss": 0.8594, "step": 6660 }, { "epoch": 4.411258278145695, "grad_norm": 1.0039750163776349, "learning_rate": 7.765507485159141e-05, "loss": 0.8242, "step": 6661 }, { "epoch": 4.411920529801325, "grad_norm": 1.2133633019107544, "learning_rate": 7.759661058763732e-05, "loss": 0.9258, "step": 6662 }, { "epoch": 4.412582781456954, "grad_norm": 1.2276589996590845, "learning_rate": 7.753816066038447e-05, "loss": 0.9219, "step": 6663 }, { "epoch": 4.413245033112583, "grad_norm": 1.1427179977175665, "learning_rate": 7.747972508140653e-05, "loss": 0.8281, "step": 6664 }, { "epoch": 4.413907284768212, "grad_norm": 1.014641238352878, "learning_rate": 7.742130386227436e-05, "loss": 0.7578, "step": 6665 }, { "epoch": 4.414569536423841, "grad_norm": 1.2518525749027354, "learning_rate": 7.736289701455622e-05, "loss": 0.8555, "step": 6666 }, { "epoch": 4.4152317880794705, "grad_norm": 1.00221930214085, "learning_rate": 7.730450454981723e-05, "loss": 0.75, "step": 6667 }, { "epoch": 4.415894039735099, "grad_norm": 1.1249399553512136, "learning_rate": 7.724612647961969e-05, "loss": 0.8945, "step": 6668 }, { "epoch": 4.416556291390728, "grad_norm": 1.22816578943164, "learning_rate": 7.718776281552321e-05, "loss": 0.8789, "step": 6669 }, { "epoch": 4.417218543046357, "grad_norm": 1.1337125458059392, "learning_rate": 7.712941356908454e-05, "loss": 0.7578, "step": 6670 }, { "epoch": 4.417880794701987, "grad_norm": 1.3149060329253341, "learning_rate": 7.707107875185734e-05, "loss": 0.9375, "step": 6671 }, { "epoch": 4.418543046357616, "grad_norm": 1.091746881321148, "learning_rate": 7.70127583753927e-05, "loss": 0.7852, "step": 6672 }, { "epoch": 4.419205298013245, "grad_norm": 1.3320524986648246, "learning_rate": 7.695445245123863e-05, "loss": 1.0391, "step": 6673 }, { "epoch": 4.419867549668874, "grad_norm": 1.1141833598775863, "learning_rate": 7.68961609909404e-05, "loss": 0.8242, "step": 6674 }, { "epoch": 4.420529801324503, "grad_norm": 1.1511271788463762, "learning_rate": 7.683788400604047e-05, "loss": 0.875, "step": 6675 }, { "epoch": 4.421192052980133, "grad_norm": 1.0550247354866535, "learning_rate": 7.67796215080783e-05, "loss": 0.8125, "step": 6676 }, { "epoch": 4.421854304635762, "grad_norm": 1.0532269711721365, "learning_rate": 7.672137350859043e-05, "loss": 0.8164, "step": 6677 }, { "epoch": 4.422516556291391, "grad_norm": 1.2058639555401087, "learning_rate": 7.666314001911073e-05, "loss": 0.8828, "step": 6678 }, { "epoch": 4.42317880794702, "grad_norm": 1.3803461597785633, "learning_rate": 7.660492105117015e-05, "loss": 1.1562, "step": 6679 }, { "epoch": 4.423841059602649, "grad_norm": 1.1376579035350038, "learning_rate": 7.654671661629657e-05, "loss": 0.9883, "step": 6680 }, { "epoch": 4.424503311258278, "grad_norm": 1.1803130353260232, "learning_rate": 7.648852672601529e-05, "loss": 0.8789, "step": 6681 }, { "epoch": 4.425165562913907, "grad_norm": 1.224438180548455, "learning_rate": 7.643035139184842e-05, "loss": 0.9883, "step": 6682 }, { "epoch": 4.425827814569536, "grad_norm": 1.1047768340042923, "learning_rate": 7.637219062531545e-05, "loss": 0.8008, "step": 6683 }, { "epoch": 4.426490066225165, "grad_norm": 0.9131842276168701, "learning_rate": 7.63140444379329e-05, "loss": 0.75, "step": 6684 }, { "epoch": 4.427152317880795, "grad_norm": 1.0965744953084597, "learning_rate": 7.625591284121433e-05, "loss": 0.8828, "step": 6685 }, { "epoch": 4.427814569536424, "grad_norm": 1.1492240174604782, "learning_rate": 7.619779584667035e-05, "loss": 0.918, "step": 6686 }, { "epoch": 4.428476821192053, "grad_norm": 1.2232356648972063, "learning_rate": 7.613969346580902e-05, "loss": 0.9922, "step": 6687 }, { "epoch": 4.429139072847682, "grad_norm": 1.1978655299990777, "learning_rate": 7.608160571013516e-05, "loss": 0.8711, "step": 6688 }, { "epoch": 4.429801324503312, "grad_norm": 0.9964243345615611, "learning_rate": 7.602353259115079e-05, "loss": 0.7852, "step": 6689 }, { "epoch": 4.430463576158941, "grad_norm": 1.0671894519135996, "learning_rate": 7.596547412035505e-05, "loss": 0.8242, "step": 6690 }, { "epoch": 4.4311258278145695, "grad_norm": 1.084647888872076, "learning_rate": 7.590743030924428e-05, "loss": 0.793, "step": 6691 }, { "epoch": 4.4317880794701985, "grad_norm": 1.0882801746530415, "learning_rate": 7.584940116931168e-05, "loss": 0.8047, "step": 6692 }, { "epoch": 4.432450331125827, "grad_norm": 1.1735188989127194, "learning_rate": 7.579138671204783e-05, "loss": 0.9141, "step": 6693 }, { "epoch": 4.433112582781457, "grad_norm": 1.0749665880380326, "learning_rate": 7.573338694894012e-05, "loss": 0.8438, "step": 6694 }, { "epoch": 4.433774834437086, "grad_norm": 1.0690543798102934, "learning_rate": 7.567540189147324e-05, "loss": 0.7656, "step": 6695 }, { "epoch": 4.434437086092715, "grad_norm": 1.1253094998014919, "learning_rate": 7.561743155112894e-05, "loss": 0.9102, "step": 6696 }, { "epoch": 4.435099337748344, "grad_norm": 1.127828035078651, "learning_rate": 7.555947593938596e-05, "loss": 0.7188, "step": 6697 }, { "epoch": 4.435761589403974, "grad_norm": 1.0916928110573747, "learning_rate": 7.550153506772015e-05, "loss": 0.8242, "step": 6698 }, { "epoch": 4.436423841059603, "grad_norm": 1.0912448228023155, "learning_rate": 7.544360894760443e-05, "loss": 0.8945, "step": 6699 }, { "epoch": 4.437086092715232, "grad_norm": 1.2017666477349516, "learning_rate": 7.538569759050901e-05, "loss": 0.8711, "step": 6700 }, { "epoch": 4.437748344370861, "grad_norm": 1.1768409939976037, "learning_rate": 7.532780100790088e-05, "loss": 0.8711, "step": 6701 }, { "epoch": 4.43841059602649, "grad_norm": 1.0554871421196965, "learning_rate": 7.526991921124418e-05, "loss": 0.8086, "step": 6702 }, { "epoch": 4.4390728476821195, "grad_norm": 1.0427813721943868, "learning_rate": 7.521205221200022e-05, "loss": 0.7461, "step": 6703 }, { "epoch": 4.4397350993377485, "grad_norm": 1.272818121417988, "learning_rate": 7.51542000216274e-05, "loss": 0.9023, "step": 6704 }, { "epoch": 4.440397350993377, "grad_norm": 1.1329384483991136, "learning_rate": 7.509636265158102e-05, "loss": 0.8867, "step": 6705 }, { "epoch": 4.441059602649006, "grad_norm": 1.0613040880330014, "learning_rate": 7.503854011331355e-05, "loss": 0.6953, "step": 6706 }, { "epoch": 4.441721854304636, "grad_norm": 1.114863945118355, "learning_rate": 7.49807324182746e-05, "loss": 0.7969, "step": 6707 }, { "epoch": 4.442384105960265, "grad_norm": 1.1381658912110388, "learning_rate": 7.492293957791066e-05, "loss": 0.8203, "step": 6708 }, { "epoch": 4.443046357615894, "grad_norm": 1.235176444381562, "learning_rate": 7.486516160366546e-05, "loss": 0.957, "step": 6709 }, { "epoch": 4.443708609271523, "grad_norm": 1.1615944496132282, "learning_rate": 7.480739850697959e-05, "loss": 0.8672, "step": 6710 }, { "epoch": 4.444370860927152, "grad_norm": 1.1004126624846313, "learning_rate": 7.474965029929088e-05, "loss": 0.8398, "step": 6711 }, { "epoch": 4.445033112582782, "grad_norm": 1.1926545309136802, "learning_rate": 7.469191699203419e-05, "loss": 0.8477, "step": 6712 }, { "epoch": 4.445695364238411, "grad_norm": 1.0778019110793824, "learning_rate": 7.46341985966413e-05, "loss": 0.7383, "step": 6713 }, { "epoch": 4.44635761589404, "grad_norm": 0.9834189626443061, "learning_rate": 7.457649512454108e-05, "loss": 0.6797, "step": 6714 }, { "epoch": 4.447019867549669, "grad_norm": 1.1704619249978323, "learning_rate": 7.451880658715952e-05, "loss": 0.8398, "step": 6715 }, { "epoch": 4.447682119205298, "grad_norm": 1.2497441038757344, "learning_rate": 7.446113299591969e-05, "loss": 0.9844, "step": 6716 }, { "epoch": 4.448344370860927, "grad_norm": 1.074660948360414, "learning_rate": 7.44034743622415e-05, "loss": 0.8281, "step": 6717 }, { "epoch": 4.449006622516556, "grad_norm": 1.106230103840178, "learning_rate": 7.43458306975421e-05, "loss": 0.7344, "step": 6718 }, { "epoch": 4.449668874172185, "grad_norm": 1.1566431650891433, "learning_rate": 7.428820201323553e-05, "loss": 0.9141, "step": 6719 }, { "epoch": 4.450331125827814, "grad_norm": 1.2154256666393897, "learning_rate": 7.423058832073298e-05, "loss": 0.8828, "step": 6720 }, { "epoch": 4.450993377483444, "grad_norm": 1.077345601887816, "learning_rate": 7.417298963144267e-05, "loss": 0.7969, "step": 6721 }, { "epoch": 4.451655629139073, "grad_norm": 1.1735585177017243, "learning_rate": 7.411540595676971e-05, "loss": 0.9062, "step": 6722 }, { "epoch": 4.452317880794702, "grad_norm": 1.0859617178511285, "learning_rate": 7.405783730811627e-05, "loss": 0.7266, "step": 6723 }, { "epoch": 4.452980132450331, "grad_norm": 1.1476941794566047, "learning_rate": 7.400028369688179e-05, "loss": 0.8359, "step": 6724 }, { "epoch": 4.45364238410596, "grad_norm": 1.1583232283756104, "learning_rate": 7.394274513446245e-05, "loss": 0.8438, "step": 6725 }, { "epoch": 4.45430463576159, "grad_norm": 1.197941225233906, "learning_rate": 7.388522163225148e-05, "loss": 0.8672, "step": 6726 }, { "epoch": 4.454966887417219, "grad_norm": 1.1234075819594622, "learning_rate": 7.382771320163928e-05, "loss": 0.832, "step": 6727 }, { "epoch": 4.4556291390728475, "grad_norm": 1.027335912731799, "learning_rate": 7.377021985401319e-05, "loss": 0.6875, "step": 6728 }, { "epoch": 4.4562913907284765, "grad_norm": 1.2154460488969059, "learning_rate": 7.371274160075745e-05, "loss": 0.9062, "step": 6729 }, { "epoch": 4.456953642384106, "grad_norm": 1.2811001253837753, "learning_rate": 7.365527845325355e-05, "loss": 0.9375, "step": 6730 }, { "epoch": 4.457615894039735, "grad_norm": 1.1846120451804887, "learning_rate": 7.359783042287972e-05, "loss": 0.793, "step": 6731 }, { "epoch": 4.458278145695364, "grad_norm": 1.3222825296284855, "learning_rate": 7.354039752101138e-05, "loss": 1.0547, "step": 6732 }, { "epoch": 4.458940397350993, "grad_norm": 1.2269381128740722, "learning_rate": 7.348297975902098e-05, "loss": 0.9062, "step": 6733 }, { "epoch": 4.459602649006623, "grad_norm": 1.1585013450316355, "learning_rate": 7.342557714827781e-05, "loss": 0.8555, "step": 6734 }, { "epoch": 4.460264900662252, "grad_norm": 1.1983171116815126, "learning_rate": 7.336818970014823e-05, "loss": 0.8633, "step": 6735 }, { "epoch": 4.460927152317881, "grad_norm": 1.0940528473858318, "learning_rate": 7.331081742599564e-05, "loss": 0.8008, "step": 6736 }, { "epoch": 4.46158940397351, "grad_norm": 1.028406584522927, "learning_rate": 7.325346033718044e-05, "loss": 0.668, "step": 6737 }, { "epoch": 4.462251655629139, "grad_norm": 1.1110168917534264, "learning_rate": 7.319611844505995e-05, "loss": 0.7852, "step": 6738 }, { "epoch": 4.4629139072847686, "grad_norm": 1.179180020674184, "learning_rate": 7.313879176098856e-05, "loss": 0.9531, "step": 6739 }, { "epoch": 4.4635761589403975, "grad_norm": 1.200395621021466, "learning_rate": 7.308148029631754e-05, "loss": 0.7773, "step": 6740 }, { "epoch": 4.464238410596026, "grad_norm": 1.0868604121971879, "learning_rate": 7.302418406239526e-05, "loss": 0.7344, "step": 6741 }, { "epoch": 4.464900662251655, "grad_norm": 1.262812488573143, "learning_rate": 7.29669030705671e-05, "loss": 0.9805, "step": 6742 }, { "epoch": 4.465562913907284, "grad_norm": 1.0886762113492363, "learning_rate": 7.290963733217522e-05, "loss": 0.7812, "step": 6743 }, { "epoch": 4.466225165562914, "grad_norm": 1.0936529973684774, "learning_rate": 7.285238685855897e-05, "loss": 0.8281, "step": 6744 }, { "epoch": 4.466887417218543, "grad_norm": 1.0590468483077213, "learning_rate": 7.279515166105463e-05, "loss": 0.8438, "step": 6745 }, { "epoch": 4.467549668874172, "grad_norm": 1.0984270583103386, "learning_rate": 7.273793175099541e-05, "loss": 0.8516, "step": 6746 }, { "epoch": 4.468211920529801, "grad_norm": 1.0644782168455313, "learning_rate": 7.268072713971141e-05, "loss": 0.7539, "step": 6747 }, { "epoch": 4.468874172185431, "grad_norm": 1.1834004448101916, "learning_rate": 7.262353783852988e-05, "loss": 0.9336, "step": 6748 }, { "epoch": 4.46953642384106, "grad_norm": 1.0951116809295849, "learning_rate": 7.256636385877502e-05, "loss": 0.8477, "step": 6749 }, { "epoch": 4.470198675496689, "grad_norm": 1.1975272090347655, "learning_rate": 7.250920521176778e-05, "loss": 0.8125, "step": 6750 }, { "epoch": 4.470860927152318, "grad_norm": 1.1658426534888067, "learning_rate": 7.245206190882639e-05, "loss": 0.8477, "step": 6751 }, { "epoch": 4.4715231788079475, "grad_norm": 1.137610707085754, "learning_rate": 7.239493396126572e-05, "loss": 0.7617, "step": 6752 }, { "epoch": 4.472185430463576, "grad_norm": 1.0951370152577016, "learning_rate": 7.233782138039784e-05, "loss": 0.7656, "step": 6753 }, { "epoch": 4.472847682119205, "grad_norm": 1.0504830303291108, "learning_rate": 7.228072417753179e-05, "loss": 0.7734, "step": 6754 }, { "epoch": 4.473509933774834, "grad_norm": 1.1675586275576926, "learning_rate": 7.222364236397334e-05, "loss": 0.8398, "step": 6755 }, { "epoch": 4.474172185430463, "grad_norm": 1.3056239946337038, "learning_rate": 7.216657595102533e-05, "loss": 0.9141, "step": 6756 }, { "epoch": 4.474834437086093, "grad_norm": 1.1389062157913172, "learning_rate": 7.210952494998762e-05, "loss": 0.8633, "step": 6757 }, { "epoch": 4.475496688741722, "grad_norm": 1.0760396513719077, "learning_rate": 7.2052489372157e-05, "loss": 0.6875, "step": 6758 }, { "epoch": 4.476158940397351, "grad_norm": 1.2624964724145045, "learning_rate": 7.199546922882713e-05, "loss": 0.9336, "step": 6759 }, { "epoch": 4.47682119205298, "grad_norm": 1.2817732428302149, "learning_rate": 7.193846453128852e-05, "loss": 0.9141, "step": 6760 }, { "epoch": 4.477483443708609, "grad_norm": 1.1028212360103709, "learning_rate": 7.1881475290829e-05, "loss": 0.7461, "step": 6761 }, { "epoch": 4.478145695364239, "grad_norm": 1.050629603175401, "learning_rate": 7.182450151873298e-05, "loss": 0.7812, "step": 6762 }, { "epoch": 4.478807947019868, "grad_norm": 1.285334177468639, "learning_rate": 7.176754322628185e-05, "loss": 0.9531, "step": 6763 }, { "epoch": 4.479470198675497, "grad_norm": 1.421649903462375, "learning_rate": 7.171060042475407e-05, "loss": 1.0469, "step": 6764 }, { "epoch": 4.4801324503311255, "grad_norm": 1.1753486887616844, "learning_rate": 7.165367312542503e-05, "loss": 0.8555, "step": 6765 }, { "epoch": 4.480794701986755, "grad_norm": 1.0959612777445915, "learning_rate": 7.159676133956685e-05, "loss": 0.7695, "step": 6766 }, { "epoch": 4.481456953642384, "grad_norm": 1.1437403953705947, "learning_rate": 7.153986507844884e-05, "loss": 0.832, "step": 6767 }, { "epoch": 4.482119205298013, "grad_norm": 1.0495238418550779, "learning_rate": 7.148298435333702e-05, "loss": 0.8125, "step": 6768 }, { "epoch": 4.482781456953642, "grad_norm": 0.9966713538813938, "learning_rate": 7.142611917549447e-05, "loss": 0.7617, "step": 6769 }, { "epoch": 4.483443708609272, "grad_norm": 1.2295140365073625, "learning_rate": 7.136926955618118e-05, "loss": 0.8672, "step": 6770 }, { "epoch": 4.484105960264901, "grad_norm": 1.1404065341567566, "learning_rate": 7.1312435506654e-05, "loss": 0.9219, "step": 6771 }, { "epoch": 4.48476821192053, "grad_norm": 1.0344974967528306, "learning_rate": 7.125561703816666e-05, "loss": 0.75, "step": 6772 }, { "epoch": 4.485430463576159, "grad_norm": 1.1612619473265504, "learning_rate": 7.119881416196993e-05, "loss": 0.7812, "step": 6773 }, { "epoch": 4.486092715231788, "grad_norm": 1.2940545465555338, "learning_rate": 7.114202688931146e-05, "loss": 0.9766, "step": 6774 }, { "epoch": 4.486754966887418, "grad_norm": 1.2044969788127284, "learning_rate": 7.108525523143569e-05, "loss": 0.8281, "step": 6775 }, { "epoch": 4.4874172185430465, "grad_norm": 1.1729881396806463, "learning_rate": 7.102849919958417e-05, "loss": 0.8047, "step": 6776 }, { "epoch": 4.4880794701986755, "grad_norm": 1.1130272635739702, "learning_rate": 7.097175880499514e-05, "loss": 0.7344, "step": 6777 }, { "epoch": 4.488741721854304, "grad_norm": 1.1647908140578158, "learning_rate": 7.091503405890388e-05, "loss": 0.8828, "step": 6778 }, { "epoch": 4.489403973509933, "grad_norm": 1.1348591213611854, "learning_rate": 7.085832497254263e-05, "loss": 0.7812, "step": 6779 }, { "epoch": 4.490066225165563, "grad_norm": 1.2101827530364666, "learning_rate": 7.080163155714035e-05, "loss": 0.8633, "step": 6780 }, { "epoch": 4.490728476821192, "grad_norm": 1.3384052932108452, "learning_rate": 7.07449538239229e-05, "loss": 1.1484, "step": 6781 }, { "epoch": 4.491390728476821, "grad_norm": 1.1710742975056254, "learning_rate": 7.068829178411332e-05, "loss": 0.8672, "step": 6782 }, { "epoch": 4.49205298013245, "grad_norm": 1.1323765258272818, "learning_rate": 7.063164544893126e-05, "loss": 0.8398, "step": 6783 }, { "epoch": 4.49271523178808, "grad_norm": 1.2199841579769128, "learning_rate": 7.057501482959326e-05, "loss": 0.8633, "step": 6784 }, { "epoch": 4.493377483443709, "grad_norm": 1.0787597980090142, "learning_rate": 7.051839993731293e-05, "loss": 0.7539, "step": 6785 }, { "epoch": 4.494039735099338, "grad_norm": 1.2222719846194399, "learning_rate": 7.046180078330067e-05, "loss": 0.875, "step": 6786 }, { "epoch": 4.494701986754967, "grad_norm": 1.0968657304756657, "learning_rate": 7.040521737876367e-05, "loss": 0.8555, "step": 6787 }, { "epoch": 4.495364238410596, "grad_norm": 1.1333335591216855, "learning_rate": 7.034864973490622e-05, "loss": 0.8594, "step": 6788 }, { "epoch": 4.4960264900662255, "grad_norm": 1.1599165635864697, "learning_rate": 7.029209786292925e-05, "loss": 0.9219, "step": 6789 }, { "epoch": 4.496688741721854, "grad_norm": 0.9955952001499914, "learning_rate": 7.023556177403072e-05, "loss": 0.7656, "step": 6790 }, { "epoch": 4.497350993377483, "grad_norm": 1.1663865905621547, "learning_rate": 7.017904147940546e-05, "loss": 0.8789, "step": 6791 }, { "epoch": 4.498013245033112, "grad_norm": 1.083419428931796, "learning_rate": 7.012253699024513e-05, "loss": 0.8359, "step": 6792 }, { "epoch": 4.498675496688742, "grad_norm": 1.0119618537706674, "learning_rate": 7.006604831773817e-05, "loss": 0.8008, "step": 6793 }, { "epoch": 4.499337748344371, "grad_norm": 1.2306832357193778, "learning_rate": 7.000957547307006e-05, "loss": 0.9297, "step": 6794 }, { "epoch": 4.5, "grad_norm": 1.2989987952271396, "learning_rate": 6.995311846742314e-05, "loss": 0.9297, "step": 6795 }, { "epoch": 4.500662251655629, "grad_norm": 1.0654184224292123, "learning_rate": 6.98966773119764e-05, "loss": 0.8008, "step": 6796 }, { "epoch": 4.501324503311258, "grad_norm": 1.085661653977342, "learning_rate": 6.98402520179059e-05, "loss": 0.7695, "step": 6797 }, { "epoch": 4.501986754966888, "grad_norm": 1.1960083274566207, "learning_rate": 6.978384259638457e-05, "loss": 0.9102, "step": 6798 }, { "epoch": 4.502649006622517, "grad_norm": 1.2433217542594028, "learning_rate": 6.972744905858199e-05, "loss": 0.9023, "step": 6799 }, { "epoch": 4.503311258278146, "grad_norm": 1.448583137717848, "learning_rate": 6.967107141566485e-05, "loss": 1.1641, "step": 6800 }, { "epoch": 4.5039735099337745, "grad_norm": 1.3465719533460638, "learning_rate": 6.961470967879646e-05, "loss": 1.0078, "step": 6801 }, { "epoch": 4.5046357615894035, "grad_norm": 1.4410126817808542, "learning_rate": 6.955836385913713e-05, "loss": 1.1406, "step": 6802 }, { "epoch": 4.505298013245033, "grad_norm": 1.3144020289736424, "learning_rate": 6.950203396784407e-05, "loss": 1.0156, "step": 6803 }, { "epoch": 4.505960264900662, "grad_norm": 1.1479416533864808, "learning_rate": 6.944572001607114e-05, "loss": 0.8359, "step": 6804 }, { "epoch": 4.506622516556291, "grad_norm": 1.0720422372856222, "learning_rate": 6.938942201496912e-05, "loss": 0.7539, "step": 6805 }, { "epoch": 4.50728476821192, "grad_norm": 1.085355694569158, "learning_rate": 6.933313997568571e-05, "loss": 0.793, "step": 6806 }, { "epoch": 4.50794701986755, "grad_norm": 1.0212772133518924, "learning_rate": 6.927687390936545e-05, "loss": 0.7578, "step": 6807 }, { "epoch": 4.508609271523179, "grad_norm": 1.1719389709700936, "learning_rate": 6.922062382714958e-05, "loss": 0.9883, "step": 6808 }, { "epoch": 4.509271523178808, "grad_norm": 1.1502693021802814, "learning_rate": 6.916438974017635e-05, "loss": 0.8672, "step": 6809 }, { "epoch": 4.509933774834437, "grad_norm": 1.1651513095123094, "learning_rate": 6.910817165958064e-05, "loss": 0.9336, "step": 6810 }, { "epoch": 4.510596026490067, "grad_norm": 1.2441480749041118, "learning_rate": 6.905196959649432e-05, "loss": 0.918, "step": 6811 }, { "epoch": 4.511258278145696, "grad_norm": 1.2381093188067378, "learning_rate": 6.899578356204614e-05, "loss": 0.8633, "step": 6812 }, { "epoch": 4.5119205298013245, "grad_norm": 1.0841895781012965, "learning_rate": 6.893961356736147e-05, "loss": 0.7891, "step": 6813 }, { "epoch": 4.5125827814569535, "grad_norm": 1.1547671189767337, "learning_rate": 6.888345962356261e-05, "loss": 0.875, "step": 6814 }, { "epoch": 4.513245033112582, "grad_norm": 1.1335647726174272, "learning_rate": 6.88273217417687e-05, "loss": 0.7891, "step": 6815 }, { "epoch": 4.513907284768212, "grad_norm": 1.0964364839001548, "learning_rate": 6.877119993309576e-05, "loss": 0.8047, "step": 6816 }, { "epoch": 4.514569536423841, "grad_norm": 1.1540367303330639, "learning_rate": 6.87150942086565e-05, "loss": 0.9297, "step": 6817 }, { "epoch": 4.51523178807947, "grad_norm": 1.166514057517789, "learning_rate": 6.865900457956037e-05, "loss": 0.8359, "step": 6818 }, { "epoch": 4.515894039735099, "grad_norm": 1.0669191495638843, "learning_rate": 6.8602931056914e-05, "loss": 0.8125, "step": 6819 }, { "epoch": 4.516556291390728, "grad_norm": 1.1939004968943268, "learning_rate": 6.854687365182045e-05, "loss": 0.793, "step": 6820 }, { "epoch": 4.517218543046358, "grad_norm": 0.9413070467062662, "learning_rate": 6.849083237537969e-05, "loss": 0.6172, "step": 6821 }, { "epoch": 4.517880794701987, "grad_norm": 1.1258481418357573, "learning_rate": 6.84348072386886e-05, "loss": 0.8398, "step": 6822 }, { "epoch": 4.518543046357616, "grad_norm": 1.109209439715087, "learning_rate": 6.837879825284084e-05, "loss": 0.8438, "step": 6823 }, { "epoch": 4.519205298013245, "grad_norm": 1.1066448070357282, "learning_rate": 6.832280542892673e-05, "loss": 0.793, "step": 6824 }, { "epoch": 4.5198675496688745, "grad_norm": 1.1019115341386592, "learning_rate": 6.826682877803361e-05, "loss": 0.7812, "step": 6825 }, { "epoch": 4.520529801324503, "grad_norm": 1.1156086113145143, "learning_rate": 6.821086831124536e-05, "loss": 0.832, "step": 6826 }, { "epoch": 4.521192052980132, "grad_norm": 1.0841584365570298, "learning_rate": 6.815492403964288e-05, "loss": 0.7578, "step": 6827 }, { "epoch": 4.521854304635761, "grad_norm": 1.1674269118156773, "learning_rate": 6.80989959743038e-05, "loss": 0.8828, "step": 6828 }, { "epoch": 4.522516556291391, "grad_norm": 1.1424777461113194, "learning_rate": 6.804308412630251e-05, "loss": 0.7617, "step": 6829 }, { "epoch": 4.52317880794702, "grad_norm": 1.0897139457485012, "learning_rate": 6.798718850671009e-05, "loss": 0.7461, "step": 6830 }, { "epoch": 4.523841059602649, "grad_norm": 1.231808967696738, "learning_rate": 6.79313091265946e-05, "loss": 0.9141, "step": 6831 }, { "epoch": 4.524503311258278, "grad_norm": 1.1684173144862366, "learning_rate": 6.787544599702085e-05, "loss": 0.8398, "step": 6832 }, { "epoch": 4.525165562913907, "grad_norm": 1.0647251146661088, "learning_rate": 6.781959912905025e-05, "loss": 0.7109, "step": 6833 }, { "epoch": 4.525827814569537, "grad_norm": 1.0726829984318629, "learning_rate": 6.776376853374125e-05, "loss": 0.7656, "step": 6834 }, { "epoch": 4.526490066225166, "grad_norm": 1.3724215844872567, "learning_rate": 6.770795422214884e-05, "loss": 0.9648, "step": 6835 }, { "epoch": 4.527152317880795, "grad_norm": 1.1579244860238105, "learning_rate": 6.765215620532494e-05, "loss": 0.9414, "step": 6836 }, { "epoch": 4.527814569536424, "grad_norm": 1.0666909039984787, "learning_rate": 6.759637449431825e-05, "loss": 0.7109, "step": 6837 }, { "epoch": 4.5284768211920525, "grad_norm": 1.074793224029001, "learning_rate": 6.754060910017409e-05, "loss": 0.6797, "step": 6838 }, { "epoch": 4.529139072847682, "grad_norm": 1.0457583706817897, "learning_rate": 6.748486003393469e-05, "loss": 0.7734, "step": 6839 }, { "epoch": 4.529801324503311, "grad_norm": 1.1347520835838734, "learning_rate": 6.742912730663905e-05, "loss": 0.9141, "step": 6840 }, { "epoch": 4.53046357615894, "grad_norm": 1.1944120611865274, "learning_rate": 6.737341092932287e-05, "loss": 0.9492, "step": 6841 }, { "epoch": 4.531125827814569, "grad_norm": 1.2344141096249022, "learning_rate": 6.731771091301854e-05, "loss": 0.8711, "step": 6842 }, { "epoch": 4.531788079470199, "grad_norm": 1.1339812460824181, "learning_rate": 6.726202726875538e-05, "loss": 0.7422, "step": 6843 }, { "epoch": 4.532450331125828, "grad_norm": 1.2217190290496982, "learning_rate": 6.720636000755943e-05, "loss": 0.8984, "step": 6844 }, { "epoch": 4.533112582781457, "grad_norm": 1.3173167164927722, "learning_rate": 6.715070914045333e-05, "loss": 1.0078, "step": 6845 }, { "epoch": 4.533774834437086, "grad_norm": 1.0258589336170454, "learning_rate": 6.709507467845673e-05, "loss": 0.7109, "step": 6846 }, { "epoch": 4.534437086092716, "grad_norm": 1.1636126402442104, "learning_rate": 6.703945663258575e-05, "loss": 0.7891, "step": 6847 }, { "epoch": 4.535099337748345, "grad_norm": 1.2037385659158286, "learning_rate": 6.698385501385347e-05, "loss": 0.8828, "step": 6848 }, { "epoch": 4.535761589403974, "grad_norm": 1.314627748759114, "learning_rate": 6.692826983326972e-05, "loss": 0.8828, "step": 6849 }, { "epoch": 4.5364238410596025, "grad_norm": 1.1871950037209105, "learning_rate": 6.687270110184091e-05, "loss": 0.9023, "step": 6850 }, { "epoch": 4.5370860927152314, "grad_norm": 1.3138599145713863, "learning_rate": 6.681714883057024e-05, "loss": 0.9805, "step": 6851 }, { "epoch": 4.537748344370861, "grad_norm": 1.0206689946922418, "learning_rate": 6.676161303045778e-05, "loss": 0.7383, "step": 6852 }, { "epoch": 4.53841059602649, "grad_norm": 1.1105534934872179, "learning_rate": 6.670609371250029e-05, "loss": 0.7812, "step": 6853 }, { "epoch": 4.539072847682119, "grad_norm": 1.2410765309943481, "learning_rate": 6.665059088769112e-05, "loss": 0.8984, "step": 6854 }, { "epoch": 4.539735099337748, "grad_norm": 1.1785640148949748, "learning_rate": 6.659510456702052e-05, "loss": 0.918, "step": 6855 }, { "epoch": 4.540397350993377, "grad_norm": 1.2171637470599097, "learning_rate": 6.653963476147549e-05, "loss": 0.9219, "step": 6856 }, { "epoch": 4.541059602649007, "grad_norm": 1.095911838517165, "learning_rate": 6.648418148203956e-05, "loss": 0.8125, "step": 6857 }, { "epoch": 4.541721854304636, "grad_norm": 0.9708478586338003, "learning_rate": 6.642874473969322e-05, "loss": 0.6875, "step": 6858 }, { "epoch": 4.542384105960265, "grad_norm": 1.0448915886735928, "learning_rate": 6.637332454541349e-05, "loss": 0.8047, "step": 6859 }, { "epoch": 4.543046357615894, "grad_norm": 1.0169702498477373, "learning_rate": 6.631792091017423e-05, "loss": 0.7852, "step": 6860 }, { "epoch": 4.5437086092715235, "grad_norm": 1.1482212478413842, "learning_rate": 6.626253384494608e-05, "loss": 0.7539, "step": 6861 }, { "epoch": 4.5443708609271525, "grad_norm": 1.1786362622834043, "learning_rate": 6.620716336069627e-05, "loss": 0.8984, "step": 6862 }, { "epoch": 4.545033112582781, "grad_norm": 1.2607232566072555, "learning_rate": 6.615180946838868e-05, "loss": 0.8711, "step": 6863 }, { "epoch": 4.54569536423841, "grad_norm": 1.253239580661758, "learning_rate": 6.609647217898412e-05, "loss": 0.9688, "step": 6864 }, { "epoch": 4.54635761589404, "grad_norm": 1.118434113834205, "learning_rate": 6.604115150344004e-05, "loss": 0.875, "step": 6865 }, { "epoch": 4.547019867549669, "grad_norm": 1.254557868386404, "learning_rate": 6.598584745271046e-05, "loss": 0.9531, "step": 6866 }, { "epoch": 4.547682119205298, "grad_norm": 1.2197448558170783, "learning_rate": 6.593056003774634e-05, "loss": 0.8867, "step": 6867 }, { "epoch": 4.548344370860927, "grad_norm": 1.2009430698081902, "learning_rate": 6.58752892694951e-05, "loss": 0.8555, "step": 6868 }, { "epoch": 4.549006622516556, "grad_norm": 1.0658656506151505, "learning_rate": 6.58200351589011e-05, "loss": 0.7344, "step": 6869 }, { "epoch": 4.549668874172186, "grad_norm": 1.1686946069001052, "learning_rate": 6.576479771690518e-05, "loss": 0.8164, "step": 6870 }, { "epoch": 4.550331125827815, "grad_norm": 1.123056721411962, "learning_rate": 6.570957695444508e-05, "loss": 0.9258, "step": 6871 }, { "epoch": 4.550993377483444, "grad_norm": 1.166466314338416, "learning_rate": 6.565437288245505e-05, "loss": 0.8555, "step": 6872 }, { "epoch": 4.551655629139073, "grad_norm": 1.1603772619032338, "learning_rate": 6.559918551186617e-05, "loss": 0.7617, "step": 6873 }, { "epoch": 4.552317880794702, "grad_norm": 1.044582636785393, "learning_rate": 6.554401485360625e-05, "loss": 0.7422, "step": 6874 }, { "epoch": 4.552980132450331, "grad_norm": 1.0568924394017138, "learning_rate": 6.548886091859966e-05, "loss": 0.8164, "step": 6875 }, { "epoch": 4.55364238410596, "grad_norm": 0.9340536546779169, "learning_rate": 6.543372371776735e-05, "loss": 0.7422, "step": 6876 }, { "epoch": 4.554304635761589, "grad_norm": 1.1000348977210492, "learning_rate": 6.53786032620274e-05, "loss": 0.8086, "step": 6877 }, { "epoch": 4.554966887417218, "grad_norm": 1.1191784108113767, "learning_rate": 6.532349956229414e-05, "loss": 0.8242, "step": 6878 }, { "epoch": 4.555629139072848, "grad_norm": 1.2077328747670735, "learning_rate": 6.526841262947872e-05, "loss": 0.8203, "step": 6879 }, { "epoch": 4.556291390728477, "grad_norm": 1.139683439202912, "learning_rate": 6.521334247448901e-05, "loss": 0.7656, "step": 6880 }, { "epoch": 4.556953642384106, "grad_norm": 1.0407816168972148, "learning_rate": 6.515828910822957e-05, "loss": 0.7422, "step": 6881 }, { "epoch": 4.557615894039735, "grad_norm": 1.1903400792587258, "learning_rate": 6.510325254160154e-05, "loss": 0.832, "step": 6882 }, { "epoch": 4.558278145695365, "grad_norm": 1.408375708416154, "learning_rate": 6.504823278550285e-05, "loss": 1.1094, "step": 6883 }, { "epoch": 4.558940397350994, "grad_norm": 1.246862266247841, "learning_rate": 6.499322985082795e-05, "loss": 0.8711, "step": 6884 }, { "epoch": 4.559602649006623, "grad_norm": 1.0271767698613408, "learning_rate": 6.493824374846813e-05, "loss": 0.6953, "step": 6885 }, { "epoch": 4.5602649006622515, "grad_norm": 1.176743044749555, "learning_rate": 6.48832744893113e-05, "loss": 0.8594, "step": 6886 }, { "epoch": 4.5609271523178805, "grad_norm": 1.6108906824081386, "learning_rate": 6.482832208424195e-05, "loss": 1.2031, "step": 6887 }, { "epoch": 4.56158940397351, "grad_norm": 1.0828732377861132, "learning_rate": 6.477338654414122e-05, "loss": 0.7266, "step": 6888 }, { "epoch": 4.562251655629139, "grad_norm": 1.003110770144394, "learning_rate": 6.471846787988707e-05, "loss": 0.6797, "step": 6889 }, { "epoch": 4.562913907284768, "grad_norm": 0.9914673537821227, "learning_rate": 6.466356610235403e-05, "loss": 0.7188, "step": 6890 }, { "epoch": 4.563576158940397, "grad_norm": 1.1746249355854508, "learning_rate": 6.460868122241321e-05, "loss": 0.8906, "step": 6891 }, { "epoch": 4.564238410596026, "grad_norm": 1.2805054146315866, "learning_rate": 6.45538132509325e-05, "loss": 0.9766, "step": 6892 }, { "epoch": 4.564900662251656, "grad_norm": 1.0000477651904094, "learning_rate": 6.449896219877642e-05, "loss": 0.7148, "step": 6893 }, { "epoch": 4.565562913907285, "grad_norm": 1.0863621330245323, "learning_rate": 6.444412807680599e-05, "loss": 0.8555, "step": 6894 }, { "epoch": 4.566225165562914, "grad_norm": 1.1008891132711685, "learning_rate": 6.438931089587913e-05, "loss": 0.8008, "step": 6895 }, { "epoch": 4.566887417218543, "grad_norm": 1.123672323139189, "learning_rate": 6.433451066685017e-05, "loss": 0.8008, "step": 6896 }, { "epoch": 4.567549668874172, "grad_norm": 1.156650678347421, "learning_rate": 6.42797274005702e-05, "loss": 0.8516, "step": 6897 }, { "epoch": 4.5682119205298015, "grad_norm": 1.044532393397309, "learning_rate": 6.422496110788702e-05, "loss": 0.6992, "step": 6898 }, { "epoch": 4.5688741721854305, "grad_norm": 1.3027445811147818, "learning_rate": 6.417021179964491e-05, "loss": 0.8906, "step": 6899 }, { "epoch": 4.569536423841059, "grad_norm": 1.3392551441256109, "learning_rate": 6.411547948668483e-05, "loss": 0.9258, "step": 6900 }, { "epoch": 4.570198675496689, "grad_norm": 1.108687095495432, "learning_rate": 6.406076417984444e-05, "loss": 0.793, "step": 6901 }, { "epoch": 4.570860927152318, "grad_norm": 1.1370684483777989, "learning_rate": 6.400606588995805e-05, "loss": 0.875, "step": 6902 }, { "epoch": 4.571523178807947, "grad_norm": 1.0704211262512364, "learning_rate": 6.395138462785647e-05, "loss": 0.7461, "step": 6903 }, { "epoch": 4.572185430463576, "grad_norm": 1.3932304684590486, "learning_rate": 6.389672040436729e-05, "loss": 1.0859, "step": 6904 }, { "epoch": 4.572847682119205, "grad_norm": 1.1987664205999906, "learning_rate": 6.384207323031456e-05, "loss": 0.9648, "step": 6905 }, { "epoch": 4.573509933774835, "grad_norm": 1.1741022043078602, "learning_rate": 6.378744311651908e-05, "loss": 0.8477, "step": 6906 }, { "epoch": 4.574172185430464, "grad_norm": 1.1480275997435436, "learning_rate": 6.373283007379833e-05, "loss": 0.7695, "step": 6907 }, { "epoch": 4.574834437086093, "grad_norm": 1.165518318396098, "learning_rate": 6.367823411296625e-05, "loss": 0.8086, "step": 6908 }, { "epoch": 4.575496688741722, "grad_norm": 1.003600885806037, "learning_rate": 6.362365524483332e-05, "loss": 0.7656, "step": 6909 }, { "epoch": 4.576158940397351, "grad_norm": 1.226389408711565, "learning_rate": 6.356909348020707e-05, "loss": 0.9688, "step": 6910 }, { "epoch": 4.57682119205298, "grad_norm": 1.0687900581503968, "learning_rate": 6.35145488298912e-05, "loss": 0.7734, "step": 6911 }, { "epoch": 4.577483443708609, "grad_norm": 1.1816655492699555, "learning_rate": 6.346002130468613e-05, "loss": 0.8164, "step": 6912 }, { "epoch": 4.578145695364238, "grad_norm": 1.1135066462507233, "learning_rate": 6.340551091538898e-05, "loss": 0.7812, "step": 6913 }, { "epoch": 4.578807947019867, "grad_norm": 1.1638899796333226, "learning_rate": 6.335101767279351e-05, "loss": 0.832, "step": 6914 }, { "epoch": 4.579470198675496, "grad_norm": 1.121946526144474, "learning_rate": 6.329654158768988e-05, "loss": 0.8281, "step": 6915 }, { "epoch": 4.580132450331126, "grad_norm": 1.2845482062303137, "learning_rate": 6.32420826708651e-05, "loss": 0.9531, "step": 6916 }, { "epoch": 4.580794701986755, "grad_norm": 1.2162225708784902, "learning_rate": 6.318764093310253e-05, "loss": 0.8789, "step": 6917 }, { "epoch": 4.581456953642384, "grad_norm": 1.0874712271288844, "learning_rate": 6.313321638518235e-05, "loss": 0.793, "step": 6918 }, { "epoch": 4.582119205298013, "grad_norm": 1.068498489983017, "learning_rate": 6.307880903788125e-05, "loss": 0.7383, "step": 6919 }, { "epoch": 4.582781456953643, "grad_norm": 1.065664103307406, "learning_rate": 6.302441890197249e-05, "loss": 0.7773, "step": 6920 }, { "epoch": 4.583443708609272, "grad_norm": 1.2050575922136004, "learning_rate": 6.297004598822587e-05, "loss": 0.8477, "step": 6921 }, { "epoch": 4.584105960264901, "grad_norm": 1.0857044924821755, "learning_rate": 6.291569030740792e-05, "loss": 0.7656, "step": 6922 }, { "epoch": 4.5847682119205295, "grad_norm": 1.2560705097155973, "learning_rate": 6.286135187028171e-05, "loss": 0.8906, "step": 6923 }, { "epoch": 4.585430463576159, "grad_norm": 1.0581088153393603, "learning_rate": 6.28070306876068e-05, "loss": 0.7031, "step": 6924 }, { "epoch": 4.586092715231788, "grad_norm": 1.174399165521076, "learning_rate": 6.275272677013951e-05, "loss": 0.8008, "step": 6925 }, { "epoch": 4.586754966887417, "grad_norm": 1.166594953066894, "learning_rate": 6.269844012863253e-05, "loss": 0.9219, "step": 6926 }, { "epoch": 4.587417218543046, "grad_norm": 1.0806904357595433, "learning_rate": 6.264417077383532e-05, "loss": 0.7734, "step": 6927 }, { "epoch": 4.588079470198675, "grad_norm": 1.161460304934366, "learning_rate": 6.258991871649376e-05, "loss": 0.793, "step": 6928 }, { "epoch": 4.588741721854305, "grad_norm": 1.165820562666789, "learning_rate": 6.253568396735046e-05, "loss": 0.875, "step": 6929 }, { "epoch": 4.589403973509934, "grad_norm": 1.1285372264594138, "learning_rate": 6.248146653714443e-05, "loss": 0.8047, "step": 6930 }, { "epoch": 4.590066225165563, "grad_norm": 1.1387866223413086, "learning_rate": 6.242726643661138e-05, "loss": 0.9141, "step": 6931 }, { "epoch": 4.590728476821192, "grad_norm": 1.2812130839716394, "learning_rate": 6.237308367648365e-05, "loss": 0.9492, "step": 6932 }, { "epoch": 4.591390728476821, "grad_norm": 1.3182182679587982, "learning_rate": 6.231891826748987e-05, "loss": 0.9805, "step": 6933 }, { "epoch": 4.592052980132451, "grad_norm": 1.0947501243642095, "learning_rate": 6.226477022035554e-05, "loss": 0.793, "step": 6934 }, { "epoch": 4.5927152317880795, "grad_norm": 1.121534402990068, "learning_rate": 6.221063954580259e-05, "loss": 0.7617, "step": 6935 }, { "epoch": 4.593377483443708, "grad_norm": 1.226784154126119, "learning_rate": 6.215652625454949e-05, "loss": 0.9414, "step": 6936 }, { "epoch": 4.594039735099337, "grad_norm": 1.1229386473547285, "learning_rate": 6.210243035731121e-05, "loss": 0.8594, "step": 6937 }, { "epoch": 4.594701986754967, "grad_norm": 1.0468893990884205, "learning_rate": 6.204835186479945e-05, "loss": 0.7539, "step": 6938 }, { "epoch": 4.595364238410596, "grad_norm": 1.265856887695957, "learning_rate": 6.199429078772242e-05, "loss": 0.9062, "step": 6939 }, { "epoch": 4.596026490066225, "grad_norm": 1.1130122323313665, "learning_rate": 6.19402471367847e-05, "loss": 0.7656, "step": 6940 }, { "epoch": 4.596688741721854, "grad_norm": 1.3464327885009477, "learning_rate": 6.188622092268768e-05, "loss": 1.0234, "step": 6941 }, { "epoch": 4.597350993377484, "grad_norm": 0.9566015283958451, "learning_rate": 6.183221215612904e-05, "loss": 0.6055, "step": 6942 }, { "epoch": 4.598013245033113, "grad_norm": 1.1607718191218497, "learning_rate": 6.17782208478032e-05, "loss": 0.8047, "step": 6943 }, { "epoch": 4.598675496688742, "grad_norm": 1.076377943875386, "learning_rate": 6.172424700840114e-05, "loss": 0.7188, "step": 6944 }, { "epoch": 4.599337748344371, "grad_norm": 1.0446350237542574, "learning_rate": 6.16702906486102e-05, "loss": 0.7617, "step": 6945 }, { "epoch": 4.6, "grad_norm": 1.3065927141980471, "learning_rate": 6.161635177911428e-05, "loss": 0.9688, "step": 6946 }, { "epoch": 4.6006622516556295, "grad_norm": 1.1141310748179272, "learning_rate": 6.15624304105941e-05, "loss": 0.8242, "step": 6947 }, { "epoch": 4.601324503311258, "grad_norm": 1.008078207130439, "learning_rate": 6.15085265537266e-05, "loss": 0.7109, "step": 6948 }, { "epoch": 4.601986754966887, "grad_norm": 1.0819306682510477, "learning_rate": 6.145464021918531e-05, "loss": 0.75, "step": 6949 }, { "epoch": 4.602649006622516, "grad_norm": 1.1334017945720658, "learning_rate": 6.14007714176404e-05, "loss": 0.7773, "step": 6950 }, { "epoch": 4.603311258278145, "grad_norm": 1.2391789108066615, "learning_rate": 6.134692015975856e-05, "loss": 0.9023, "step": 6951 }, { "epoch": 4.603973509933775, "grad_norm": 1.118121300960935, "learning_rate": 6.129308645620284e-05, "loss": 0.7461, "step": 6952 }, { "epoch": 4.604635761589404, "grad_norm": 1.2347511166336906, "learning_rate": 6.123927031763307e-05, "loss": 0.8633, "step": 6953 }, { "epoch": 4.605298013245033, "grad_norm": 1.1258387026714443, "learning_rate": 6.11854717547053e-05, "loss": 0.793, "step": 6954 }, { "epoch": 4.605960264900662, "grad_norm": 1.2023163018378926, "learning_rate": 6.113169077807238e-05, "loss": 0.8906, "step": 6955 }, { "epoch": 4.606622516556292, "grad_norm": 1.2214320573615893, "learning_rate": 6.107792739838355e-05, "loss": 0.8594, "step": 6956 }, { "epoch": 4.607284768211921, "grad_norm": 1.0806020221419912, "learning_rate": 6.102418162628457e-05, "loss": 0.7578, "step": 6957 }, { "epoch": 4.60794701986755, "grad_norm": 1.1798177414234585, "learning_rate": 6.097045347241766e-05, "loss": 0.8086, "step": 6958 }, { "epoch": 4.608609271523179, "grad_norm": 1.3769076843443844, "learning_rate": 6.091674294742164e-05, "loss": 1.0859, "step": 6959 }, { "epoch": 4.609271523178808, "grad_norm": 1.0639306015472156, "learning_rate": 6.086305006193187e-05, "loss": 0.8281, "step": 6960 }, { "epoch": 4.609933774834437, "grad_norm": 1.1445094011489165, "learning_rate": 6.080937482658006e-05, "loss": 0.832, "step": 6961 }, { "epoch": 4.610596026490066, "grad_norm": 1.2250906383901035, "learning_rate": 6.075571725199463e-05, "loss": 0.8438, "step": 6962 }, { "epoch": 4.611258278145695, "grad_norm": 1.1676497718174408, "learning_rate": 6.070207734880028e-05, "loss": 0.8828, "step": 6963 }, { "epoch": 4.611920529801324, "grad_norm": 1.197848513711673, "learning_rate": 6.064845512761839e-05, "loss": 0.8633, "step": 6964 }, { "epoch": 4.612582781456954, "grad_norm": 1.253446138575748, "learning_rate": 6.0594850599066805e-05, "loss": 0.9258, "step": 6965 }, { "epoch": 4.613245033112583, "grad_norm": 1.1844430572747644, "learning_rate": 6.0541263773759815e-05, "loss": 0.8281, "step": 6966 }, { "epoch": 4.613907284768212, "grad_norm": 1.0961258586393516, "learning_rate": 6.048769466230808e-05, "loss": 0.7422, "step": 6967 }, { "epoch": 4.614569536423841, "grad_norm": 0.99955978161838, "learning_rate": 6.043414327531915e-05, "loss": 0.6328, "step": 6968 }, { "epoch": 4.61523178807947, "grad_norm": 1.1660423726874696, "learning_rate": 6.0380609623396687e-05, "loss": 0.8633, "step": 6969 }, { "epoch": 4.6158940397351, "grad_norm": 1.2048777759940674, "learning_rate": 6.032709371714092e-05, "loss": 0.8438, "step": 6970 }, { "epoch": 4.6165562913907285, "grad_norm": 1.0701535994214333, "learning_rate": 6.027359556714867e-05, "loss": 0.7188, "step": 6971 }, { "epoch": 4.6172185430463575, "grad_norm": 1.1684494217386998, "learning_rate": 6.0220115184013216e-05, "loss": 0.8164, "step": 6972 }, { "epoch": 4.617880794701986, "grad_norm": 1.3791863451149293, "learning_rate": 6.0166652578324196e-05, "loss": 0.9609, "step": 6973 }, { "epoch": 4.618543046357616, "grad_norm": 1.2076100126399816, "learning_rate": 6.011320776066794e-05, "loss": 0.8633, "step": 6974 }, { "epoch": 4.619205298013245, "grad_norm": 1.3822303782541994, "learning_rate": 6.0059780741627e-05, "loss": 1.0938, "step": 6975 }, { "epoch": 4.619867549668874, "grad_norm": 1.048350775721481, "learning_rate": 6.000637153178061e-05, "loss": 0.7383, "step": 6976 }, { "epoch": 4.620529801324503, "grad_norm": 1.2078187909579006, "learning_rate": 5.995298014170445e-05, "loss": 0.8867, "step": 6977 }, { "epoch": 4.621192052980133, "grad_norm": 1.159730581457243, "learning_rate": 5.989960658197057e-05, "loss": 0.8828, "step": 6978 }, { "epoch": 4.621854304635762, "grad_norm": 1.1941997606678523, "learning_rate": 5.984625086314751e-05, "loss": 0.8516, "step": 6979 }, { "epoch": 4.622516556291391, "grad_norm": 1.2307401843524093, "learning_rate": 5.979291299580036e-05, "loss": 0.7812, "step": 6980 }, { "epoch": 4.62317880794702, "grad_norm": 1.0396628310227214, "learning_rate": 5.973959299049067e-05, "loss": 0.7266, "step": 6981 }, { "epoch": 4.623841059602649, "grad_norm": 1.0570827076998779, "learning_rate": 5.9686290857776385e-05, "loss": 0.75, "step": 6982 }, { "epoch": 4.6245033112582785, "grad_norm": 1.3495148598448052, "learning_rate": 5.963300660821185e-05, "loss": 0.9453, "step": 6983 }, { "epoch": 4.6251655629139075, "grad_norm": 1.0885162229533623, "learning_rate": 5.957974025234804e-05, "loss": 0.7461, "step": 6984 }, { "epoch": 4.625827814569536, "grad_norm": 1.1654507895282973, "learning_rate": 5.952649180073236e-05, "loss": 0.7852, "step": 6985 }, { "epoch": 4.626490066225165, "grad_norm": 1.2356631339621922, "learning_rate": 5.947326126390848e-05, "loss": 0.8281, "step": 6986 }, { "epoch": 4.627152317880794, "grad_norm": 1.433047929277579, "learning_rate": 5.942004865241672e-05, "loss": 1.0312, "step": 6987 }, { "epoch": 4.627814569536424, "grad_norm": 1.0982917907059395, "learning_rate": 5.936685397679387e-05, "loss": 0.7969, "step": 6988 }, { "epoch": 4.628476821192053, "grad_norm": 1.3756166861808772, "learning_rate": 5.9313677247572936e-05, "loss": 1.0391, "step": 6989 }, { "epoch": 4.629139072847682, "grad_norm": 1.2531345233597178, "learning_rate": 5.926051847528365e-05, "loss": 0.9688, "step": 6990 }, { "epoch": 4.629801324503311, "grad_norm": 1.2044522845626304, "learning_rate": 5.9207377670451943e-05, "loss": 0.832, "step": 6991 }, { "epoch": 4.630463576158941, "grad_norm": 1.0982888098807004, "learning_rate": 5.9154254843600354e-05, "loss": 0.7305, "step": 6992 }, { "epoch": 4.63112582781457, "grad_norm": 1.4406317105376167, "learning_rate": 5.91011500052479e-05, "loss": 1.0391, "step": 6993 }, { "epoch": 4.631788079470199, "grad_norm": 1.1117843218100936, "learning_rate": 5.904806316590984e-05, "loss": 0.7578, "step": 6994 }, { "epoch": 4.632450331125828, "grad_norm": 1.0910931878544285, "learning_rate": 5.899499433609795e-05, "loss": 0.7969, "step": 6995 }, { "epoch": 4.633112582781457, "grad_norm": 1.251473743233115, "learning_rate": 5.894194352632053e-05, "loss": 0.8906, "step": 6996 }, { "epoch": 4.633774834437086, "grad_norm": 1.1302000634674498, "learning_rate": 5.888891074708229e-05, "loss": 0.8242, "step": 6997 }, { "epoch": 4.634437086092715, "grad_norm": 1.0437022533348599, "learning_rate": 5.8835896008884216e-05, "loss": 0.7344, "step": 6998 }, { "epoch": 4.635099337748344, "grad_norm": 1.0939240578619611, "learning_rate": 5.878289932222395e-05, "loss": 0.7578, "step": 6999 }, { "epoch": 4.635761589403973, "grad_norm": 0.9955241038263763, "learning_rate": 5.8729920697595345e-05, "loss": 0.7109, "step": 7000 }, { "epoch": 4.636423841059603, "grad_norm": 1.0723762294825157, "learning_rate": 5.8676960145488816e-05, "loss": 0.75, "step": 7001 }, { "epoch": 4.637086092715232, "grad_norm": 1.0176393943293747, "learning_rate": 5.8624017676391214e-05, "loss": 0.7773, "step": 7002 }, { "epoch": 4.637748344370861, "grad_norm": 1.1535670273445204, "learning_rate": 5.8571093300785696e-05, "loss": 0.8594, "step": 7003 }, { "epoch": 4.63841059602649, "grad_norm": 1.1339423634110104, "learning_rate": 5.85181870291518e-05, "loss": 0.8672, "step": 7004 }, { "epoch": 4.639072847682119, "grad_norm": 1.1846229497516882, "learning_rate": 5.8465298871965794e-05, "loss": 0.8672, "step": 7005 }, { "epoch": 4.639735099337749, "grad_norm": 1.013882583684185, "learning_rate": 5.841242883970001e-05, "loss": 0.7109, "step": 7006 }, { "epoch": 4.640397350993378, "grad_norm": 1.1042227067314965, "learning_rate": 5.835957694282329e-05, "loss": 0.7812, "step": 7007 }, { "epoch": 4.6410596026490065, "grad_norm": 1.3040755706230582, "learning_rate": 5.830674319180094e-05, "loss": 0.9688, "step": 7008 }, { "epoch": 4.6417218543046355, "grad_norm": 1.0116156431345809, "learning_rate": 5.825392759709473e-05, "loss": 0.6797, "step": 7009 }, { "epoch": 4.642384105960264, "grad_norm": 1.0228567670487176, "learning_rate": 5.8201130169162654e-05, "loss": 0.6602, "step": 7010 }, { "epoch": 4.643046357615894, "grad_norm": 1.2078493017085987, "learning_rate": 5.814835091845928e-05, "loss": 0.9453, "step": 7011 }, { "epoch": 4.643708609271523, "grad_norm": 1.0966112598112292, "learning_rate": 5.80955898554354e-05, "loss": 0.8203, "step": 7012 }, { "epoch": 4.644370860927152, "grad_norm": 1.1553230993936505, "learning_rate": 5.804284699053842e-05, "loss": 0.7461, "step": 7013 }, { "epoch": 4.645033112582782, "grad_norm": 1.1065082754681614, "learning_rate": 5.7990122334212014e-05, "loss": 0.8086, "step": 7014 }, { "epoch": 4.645695364238411, "grad_norm": 1.2010700836145147, "learning_rate": 5.7937415896896245e-05, "loss": 0.8672, "step": 7015 }, { "epoch": 4.64635761589404, "grad_norm": 1.1840075470929352, "learning_rate": 5.788472768902754e-05, "loss": 0.8594, "step": 7016 }, { "epoch": 4.647019867549669, "grad_norm": 1.1971245083269113, "learning_rate": 5.783205772103883e-05, "loss": 0.8086, "step": 7017 }, { "epoch": 4.647682119205298, "grad_norm": 1.1794082049395023, "learning_rate": 5.77794060033594e-05, "loss": 0.8008, "step": 7018 }, { "epoch": 4.6483443708609276, "grad_norm": 1.1188360751481452, "learning_rate": 5.772677254641483e-05, "loss": 0.75, "step": 7019 }, { "epoch": 4.6490066225165565, "grad_norm": 1.035714479378241, "learning_rate": 5.767415736062719e-05, "loss": 0.6875, "step": 7020 }, { "epoch": 4.649668874172185, "grad_norm": 1.1032633046306326, "learning_rate": 5.762156045641484e-05, "loss": 0.7109, "step": 7021 }, { "epoch": 4.650331125827814, "grad_norm": 1.2112131069540477, "learning_rate": 5.756898184419262e-05, "loss": 0.8008, "step": 7022 }, { "epoch": 4.650993377483443, "grad_norm": 1.185459114317371, "learning_rate": 5.751642153437172e-05, "loss": 0.8008, "step": 7023 }, { "epoch": 4.651655629139073, "grad_norm": 1.4126557688963395, "learning_rate": 5.746387953735964e-05, "loss": 1.0547, "step": 7024 }, { "epoch": 4.652317880794702, "grad_norm": 1.2706397110687273, "learning_rate": 5.7411355863560194e-05, "loss": 0.8828, "step": 7025 }, { "epoch": 4.652980132450331, "grad_norm": 1.066286819523345, "learning_rate": 5.735885052337391e-05, "loss": 0.6289, "step": 7026 }, { "epoch": 4.65364238410596, "grad_norm": 1.1743937643410667, "learning_rate": 5.7306363527197294e-05, "loss": 0.8516, "step": 7027 }, { "epoch": 4.654304635761589, "grad_norm": 1.1574547977324783, "learning_rate": 5.7253894885423355e-05, "loss": 0.7812, "step": 7028 }, { "epoch": 4.654966887417219, "grad_norm": 1.0429272796260174, "learning_rate": 5.7201444608441524e-05, "loss": 0.668, "step": 7029 }, { "epoch": 4.655629139072848, "grad_norm": 1.1223914158006674, "learning_rate": 5.714901270663761e-05, "loss": 0.8438, "step": 7030 }, { "epoch": 4.656291390728477, "grad_norm": 1.1556065914148936, "learning_rate": 5.709659919039364e-05, "loss": 0.7461, "step": 7031 }, { "epoch": 4.656953642384106, "grad_norm": 1.3819540553465524, "learning_rate": 5.70442040700882e-05, "loss": 1.0, "step": 7032 }, { "epoch": 4.657615894039735, "grad_norm": 1.1879022462781499, "learning_rate": 5.699182735609597e-05, "loss": 0.8047, "step": 7033 }, { "epoch": 4.658278145695364, "grad_norm": 1.0812793072490416, "learning_rate": 5.6939469058788255e-05, "loss": 0.7305, "step": 7034 }, { "epoch": 4.658940397350993, "grad_norm": 1.097442110783161, "learning_rate": 5.688712918853261e-05, "loss": 0.8242, "step": 7035 }, { "epoch": 4.659602649006622, "grad_norm": 1.0584212059227276, "learning_rate": 5.683480775569289e-05, "loss": 0.7227, "step": 7036 }, { "epoch": 4.660264900662252, "grad_norm": 1.1828382647985438, "learning_rate": 5.678250477062929e-05, "loss": 0.8477, "step": 7037 }, { "epoch": 4.660927152317881, "grad_norm": 1.2563832106080912, "learning_rate": 5.6730220243698456e-05, "loss": 0.9297, "step": 7038 }, { "epoch": 4.66158940397351, "grad_norm": 1.306344873903242, "learning_rate": 5.667795418525335e-05, "loss": 0.8359, "step": 7039 }, { "epoch": 4.662251655629139, "grad_norm": 1.1662518949848335, "learning_rate": 5.662570660564323e-05, "loss": 0.7891, "step": 7040 }, { "epoch": 4.662913907284768, "grad_norm": 0.99820324650275, "learning_rate": 5.65734775152136e-05, "loss": 0.5664, "step": 7041 }, { "epoch": 4.663576158940398, "grad_norm": 1.0741395049460742, "learning_rate": 5.652126692430663e-05, "loss": 0.75, "step": 7042 }, { "epoch": 4.664238410596027, "grad_norm": 1.067751145176672, "learning_rate": 5.6469074843260506e-05, "loss": 0.7891, "step": 7043 }, { "epoch": 4.664900662251656, "grad_norm": 1.113621769657866, "learning_rate": 5.6416901282409785e-05, "loss": 0.7383, "step": 7044 }, { "epoch": 4.6655629139072845, "grad_norm": 1.1224087805689527, "learning_rate": 5.6364746252085535e-05, "loss": 0.7461, "step": 7045 }, { "epoch": 4.6662251655629134, "grad_norm": 1.1093097628627466, "learning_rate": 5.631260976261507e-05, "loss": 0.8125, "step": 7046 }, { "epoch": 4.666887417218543, "grad_norm": 1.0724445127372304, "learning_rate": 5.6260491824321897e-05, "loss": 0.7461, "step": 7047 }, { "epoch": 4.667549668874172, "grad_norm": 1.1884541826221118, "learning_rate": 5.620839244752608e-05, "loss": 0.9062, "step": 7048 }, { "epoch": 4.668211920529801, "grad_norm": 1.0855767259900309, "learning_rate": 5.61563116425438e-05, "loss": 0.6758, "step": 7049 }, { "epoch": 4.66887417218543, "grad_norm": 1.112874019715515, "learning_rate": 5.6104249419687706e-05, "loss": 0.7773, "step": 7050 }, { "epoch": 4.66953642384106, "grad_norm": 1.1766604990221852, "learning_rate": 5.6052205789266756e-05, "loss": 0.8633, "step": 7051 }, { "epoch": 4.670198675496689, "grad_norm": 1.1506621307446079, "learning_rate": 5.6000180761586125e-05, "loss": 0.7617, "step": 7052 }, { "epoch": 4.670860927152318, "grad_norm": 1.1873435749128756, "learning_rate": 5.594817434694733e-05, "loss": 0.8242, "step": 7053 }, { "epoch": 4.671523178807947, "grad_norm": 1.1243927461307628, "learning_rate": 5.5896186555648284e-05, "loss": 0.8398, "step": 7054 }, { "epoch": 4.672185430463577, "grad_norm": 1.5505940287125382, "learning_rate": 5.584421739798322e-05, "loss": 1.1875, "step": 7055 }, { "epoch": 4.6728476821192055, "grad_norm": 1.3330479328130966, "learning_rate": 5.579226688424251e-05, "loss": 1.0234, "step": 7056 }, { "epoch": 4.6735099337748345, "grad_norm": 1.1569371029468127, "learning_rate": 5.5740335024713075e-05, "loss": 0.7969, "step": 7057 }, { "epoch": 4.674172185430463, "grad_norm": 1.22583991876913, "learning_rate": 5.56884218296779e-05, "loss": 0.8008, "step": 7058 }, { "epoch": 4.674834437086092, "grad_norm": 1.059964586175887, "learning_rate": 5.563652730941645e-05, "loss": 0.7344, "step": 7059 }, { "epoch": 4.675496688741722, "grad_norm": 1.3669486261302486, "learning_rate": 5.558465147420449e-05, "loss": 0.9844, "step": 7060 }, { "epoch": 4.676158940397351, "grad_norm": 1.3586032853928351, "learning_rate": 5.5532794334313965e-05, "loss": 1.0078, "step": 7061 }, { "epoch": 4.67682119205298, "grad_norm": 1.2169532919154653, "learning_rate": 5.5480955900013106e-05, "loss": 0.9688, "step": 7062 }, { "epoch": 4.677483443708609, "grad_norm": 1.1036798576105105, "learning_rate": 5.54291361815667e-05, "loss": 0.7695, "step": 7063 }, { "epoch": 4.678145695364238, "grad_norm": 1.155226298889485, "learning_rate": 5.537733518923554e-05, "loss": 0.7734, "step": 7064 }, { "epoch": 4.678807947019868, "grad_norm": 1.2334251481222667, "learning_rate": 5.532555293327678e-05, "loss": 0.9453, "step": 7065 }, { "epoch": 4.679470198675497, "grad_norm": 1.1813151653522997, "learning_rate": 5.527378942394396e-05, "loss": 0.8047, "step": 7066 }, { "epoch": 4.680132450331126, "grad_norm": 1.1558483712216368, "learning_rate": 5.522204467148687e-05, "loss": 0.875, "step": 7067 }, { "epoch": 4.680794701986755, "grad_norm": 1.1692424355441124, "learning_rate": 5.5170318686151485e-05, "loss": 0.8242, "step": 7068 }, { "epoch": 4.6814569536423845, "grad_norm": 1.2167354805394077, "learning_rate": 5.511861147818023e-05, "loss": 0.8867, "step": 7069 }, { "epoch": 4.682119205298013, "grad_norm": 1.3018414722199823, "learning_rate": 5.5066923057811634e-05, "loss": 1.0156, "step": 7070 }, { "epoch": 4.682781456953642, "grad_norm": 1.1344952685949268, "learning_rate": 5.5015253435280656e-05, "loss": 0.832, "step": 7071 }, { "epoch": 4.683443708609271, "grad_norm": 1.2467812924210178, "learning_rate": 5.49636026208185e-05, "loss": 0.9336, "step": 7072 }, { "epoch": 4.684105960264901, "grad_norm": 1.2580954193186533, "learning_rate": 5.491197062465258e-05, "loss": 0.918, "step": 7073 }, { "epoch": 4.68476821192053, "grad_norm": 1.1215631231511882, "learning_rate": 5.4860357457006594e-05, "loss": 0.8125, "step": 7074 }, { "epoch": 4.685430463576159, "grad_norm": 1.0978783136959187, "learning_rate": 5.4808763128100555e-05, "loss": 0.8086, "step": 7075 }, { "epoch": 4.686092715231788, "grad_norm": 1.2622610259997786, "learning_rate": 5.4757187648150806e-05, "loss": 0.957, "step": 7076 }, { "epoch": 4.686754966887417, "grad_norm": 1.120968151397482, "learning_rate": 5.4705631027369774e-05, "loss": 0.6992, "step": 7077 }, { "epoch": 4.687417218543047, "grad_norm": 1.1085147152685302, "learning_rate": 5.4654093275966367e-05, "loss": 0.832, "step": 7078 }, { "epoch": 4.688079470198676, "grad_norm": 1.1402191673054127, "learning_rate": 5.4602574404145535e-05, "loss": 0.8242, "step": 7079 }, { "epoch": 4.688741721854305, "grad_norm": 1.2032560947634245, "learning_rate": 5.455107442210867e-05, "loss": 0.793, "step": 7080 }, { "epoch": 4.6894039735099335, "grad_norm": 1.0947289845339399, "learning_rate": 5.449959334005341e-05, "loss": 0.7305, "step": 7081 }, { "epoch": 4.6900662251655625, "grad_norm": 1.0495007219487504, "learning_rate": 5.444813116817349e-05, "loss": 0.6836, "step": 7082 }, { "epoch": 4.690728476821192, "grad_norm": 1.2230074675120852, "learning_rate": 5.439668791665906e-05, "loss": 0.8398, "step": 7083 }, { "epoch": 4.691390728476821, "grad_norm": 1.1468296423903233, "learning_rate": 5.434526359569654e-05, "loss": 0.7969, "step": 7084 }, { "epoch": 4.69205298013245, "grad_norm": 1.1667174000097313, "learning_rate": 5.4293858215468475e-05, "loss": 0.8164, "step": 7085 }, { "epoch": 4.692715231788079, "grad_norm": 1.0918912375854812, "learning_rate": 5.4242471786153676e-05, "loss": 0.668, "step": 7086 }, { "epoch": 4.693377483443709, "grad_norm": 1.121677768080626, "learning_rate": 5.419110431792726e-05, "loss": 0.7461, "step": 7087 }, { "epoch": 4.694039735099338, "grad_norm": 1.0847243475563966, "learning_rate": 5.413975582096067e-05, "loss": 0.7812, "step": 7088 }, { "epoch": 4.694701986754967, "grad_norm": 1.0367631214908204, "learning_rate": 5.4088426305421385e-05, "loss": 0.6797, "step": 7089 }, { "epoch": 4.695364238410596, "grad_norm": 1.0772441961787123, "learning_rate": 5.403711578147333e-05, "loss": 0.8047, "step": 7090 }, { "epoch": 4.696026490066226, "grad_norm": 1.21661670190246, "learning_rate": 5.398582425927646e-05, "loss": 0.9883, "step": 7091 }, { "epoch": 4.696688741721855, "grad_norm": 1.1931615473169208, "learning_rate": 5.393455174898718e-05, "loss": 0.832, "step": 7092 }, { "epoch": 4.6973509933774835, "grad_norm": 1.3109990322608516, "learning_rate": 5.3883298260758037e-05, "loss": 0.9727, "step": 7093 }, { "epoch": 4.6980132450331125, "grad_norm": 1.114938914632661, "learning_rate": 5.383206380473779e-05, "loss": 0.7539, "step": 7094 }, { "epoch": 4.698675496688741, "grad_norm": 1.0586752490670794, "learning_rate": 5.378084839107133e-05, "loss": 0.7227, "step": 7095 }, { "epoch": 4.699337748344371, "grad_norm": 1.1078226393374917, "learning_rate": 5.3729652029900124e-05, "loss": 0.7539, "step": 7096 }, { "epoch": 4.7, "grad_norm": 1.2215658070496844, "learning_rate": 5.3678474731361505e-05, "loss": 0.8672, "step": 7097 }, { "epoch": 4.700662251655629, "grad_norm": 1.2491521687973715, "learning_rate": 5.362731650558919e-05, "loss": 0.8125, "step": 7098 }, { "epoch": 4.701324503311258, "grad_norm": 1.3257241365235994, "learning_rate": 5.357617736271296e-05, "loss": 0.9648, "step": 7099 }, { "epoch": 4.701986754966887, "grad_norm": 1.1708630982472266, "learning_rate": 5.3525057312859204e-05, "loss": 0.7852, "step": 7100 }, { "epoch": 4.702649006622517, "grad_norm": 1.0418924288205724, "learning_rate": 5.347395636615013e-05, "loss": 0.6875, "step": 7101 }, { "epoch": 4.703311258278146, "grad_norm": 1.2697256010250566, "learning_rate": 5.342287453270429e-05, "loss": 0.8633, "step": 7102 }, { "epoch": 4.703973509933775, "grad_norm": 1.1446557965870396, "learning_rate": 5.33718118226365e-05, "loss": 0.8125, "step": 7103 }, { "epoch": 4.704635761589404, "grad_norm": 0.9749906697692001, "learning_rate": 5.332076824605784e-05, "loss": 0.6094, "step": 7104 }, { "epoch": 4.7052980132450335, "grad_norm": 1.090784089440086, "learning_rate": 5.3269743813075397e-05, "loss": 0.7539, "step": 7105 }, { "epoch": 4.705960264900662, "grad_norm": 1.1440954459408965, "learning_rate": 5.321873853379272e-05, "loss": 0.8086, "step": 7106 }, { "epoch": 4.706622516556291, "grad_norm": 1.2214631534523952, "learning_rate": 5.316775241830932e-05, "loss": 0.8984, "step": 7107 }, { "epoch": 4.70728476821192, "grad_norm": 1.2165313386065622, "learning_rate": 5.311678547672108e-05, "loss": 0.9336, "step": 7108 }, { "epoch": 4.70794701986755, "grad_norm": 1.015415883944974, "learning_rate": 5.30658377191201e-05, "loss": 0.7188, "step": 7109 }, { "epoch": 4.708609271523179, "grad_norm": 1.215510323040822, "learning_rate": 5.301490915559457e-05, "loss": 0.8789, "step": 7110 }, { "epoch": 4.709271523178808, "grad_norm": 1.1603161379768387, "learning_rate": 5.296399979622887e-05, "loss": 0.8633, "step": 7111 }, { "epoch": 4.709933774834437, "grad_norm": 1.2345625844744474, "learning_rate": 5.291310965110368e-05, "loss": 0.9375, "step": 7112 }, { "epoch": 4.710596026490066, "grad_norm": 1.0987886549309647, "learning_rate": 5.286223873029591e-05, "loss": 0.75, "step": 7113 }, { "epoch": 4.711258278145696, "grad_norm": 0.9715037384497646, "learning_rate": 5.281138704387846e-05, "loss": 0.6836, "step": 7114 }, { "epoch": 4.711920529801325, "grad_norm": 1.055482340302242, "learning_rate": 5.2760554601920666e-05, "loss": 0.6641, "step": 7115 }, { "epoch": 4.712582781456954, "grad_norm": 1.1258207864591117, "learning_rate": 5.2709741414487814e-05, "loss": 0.7656, "step": 7116 }, { "epoch": 4.713245033112583, "grad_norm": 0.9993116413421633, "learning_rate": 5.265894749164157e-05, "loss": 0.6758, "step": 7117 }, { "epoch": 4.7139072847682115, "grad_norm": 1.1995881161745794, "learning_rate": 5.2608172843439744e-05, "loss": 0.875, "step": 7118 }, { "epoch": 4.714569536423841, "grad_norm": 1.212359892525248, "learning_rate": 5.255741747993625e-05, "loss": 0.9062, "step": 7119 }, { "epoch": 4.71523178807947, "grad_norm": 1.237288541773043, "learning_rate": 5.2506681411181144e-05, "loss": 0.8867, "step": 7120 }, { "epoch": 4.715894039735099, "grad_norm": 1.032104432730371, "learning_rate": 5.245596464722095e-05, "loss": 0.7109, "step": 7121 }, { "epoch": 4.716556291390728, "grad_norm": 1.2014457968659664, "learning_rate": 5.240526719809805e-05, "loss": 0.8477, "step": 7122 }, { "epoch": 4.717218543046357, "grad_norm": 1.193918382555, "learning_rate": 5.23545890738511e-05, "loss": 0.832, "step": 7123 }, { "epoch": 4.717880794701987, "grad_norm": 1.0920126587261676, "learning_rate": 5.230393028451495e-05, "loss": 0.8047, "step": 7124 }, { "epoch": 4.718543046357616, "grad_norm": 1.1612780794891169, "learning_rate": 5.225329084012073e-05, "loss": 0.7578, "step": 7125 }, { "epoch": 4.719205298013245, "grad_norm": 1.179832100183041, "learning_rate": 5.220267075069549e-05, "loss": 0.7578, "step": 7126 }, { "epoch": 4.719867549668874, "grad_norm": 1.1393915666642696, "learning_rate": 5.2152070026262713e-05, "loss": 0.8164, "step": 7127 }, { "epoch": 4.720529801324504, "grad_norm": 1.0795246085473353, "learning_rate": 5.21014886768418e-05, "loss": 0.7539, "step": 7128 }, { "epoch": 4.721192052980133, "grad_norm": 1.1642049867785575, "learning_rate": 5.2050926712448503e-05, "loss": 0.7969, "step": 7129 }, { "epoch": 4.7218543046357615, "grad_norm": 1.1726173359119982, "learning_rate": 5.200038414309471e-05, "loss": 0.7891, "step": 7130 }, { "epoch": 4.72251655629139, "grad_norm": 1.1135955208566248, "learning_rate": 5.194986097878839e-05, "loss": 0.7422, "step": 7131 }, { "epoch": 4.72317880794702, "grad_norm": 1.2309882883062173, "learning_rate": 5.189935722953367e-05, "loss": 0.9062, "step": 7132 }, { "epoch": 4.723841059602649, "grad_norm": 1.2099927357816274, "learning_rate": 5.1848872905330885e-05, "loss": 0.8203, "step": 7133 }, { "epoch": 4.724503311258278, "grad_norm": 1.3481521140344148, "learning_rate": 5.1798408016176585e-05, "loss": 0.957, "step": 7134 }, { "epoch": 4.725165562913907, "grad_norm": 1.074083335688575, "learning_rate": 5.1747962572063294e-05, "loss": 0.6641, "step": 7135 }, { "epoch": 4.725827814569536, "grad_norm": 1.2718830611086729, "learning_rate": 5.169753658297984e-05, "loss": 0.8711, "step": 7136 }, { "epoch": 4.726490066225166, "grad_norm": 1.1755623476665542, "learning_rate": 5.164713005891119e-05, "loss": 0.7773, "step": 7137 }, { "epoch": 4.727152317880795, "grad_norm": 1.2852782952449455, "learning_rate": 5.159674300983832e-05, "loss": 0.8594, "step": 7138 }, { "epoch": 4.727814569536424, "grad_norm": 1.095535218691356, "learning_rate": 5.1546375445738535e-05, "loss": 0.7812, "step": 7139 }, { "epoch": 4.728476821192053, "grad_norm": 1.0662498717182023, "learning_rate": 5.1496027376585106e-05, "loss": 0.7188, "step": 7140 }, { "epoch": 4.729139072847682, "grad_norm": 1.2747544030337339, "learning_rate": 5.144569881234756e-05, "loss": 0.9297, "step": 7141 }, { "epoch": 4.7298013245033115, "grad_norm": 1.0955048279431736, "learning_rate": 5.139538976299158e-05, "loss": 0.7188, "step": 7142 }, { "epoch": 4.73046357615894, "grad_norm": 1.0978554846546678, "learning_rate": 5.1345100238478905e-05, "loss": 0.7344, "step": 7143 }, { "epoch": 4.731125827814569, "grad_norm": 1.0603748675151685, "learning_rate": 5.1294830248767374e-05, "loss": 0.7852, "step": 7144 }, { "epoch": 4.731788079470198, "grad_norm": 1.0095152885970147, "learning_rate": 5.1244579803811055e-05, "loss": 0.7344, "step": 7145 }, { "epoch": 4.732450331125828, "grad_norm": 1.3373337810808805, "learning_rate": 5.119434891356019e-05, "loss": 0.9766, "step": 7146 }, { "epoch": 4.733112582781457, "grad_norm": 1.0346943298261204, "learning_rate": 5.114413758796096e-05, "loss": 0.7852, "step": 7147 }, { "epoch": 4.733774834437086, "grad_norm": 1.0369904510703065, "learning_rate": 5.109394583695589e-05, "loss": 0.6953, "step": 7148 }, { "epoch": 4.734437086092715, "grad_norm": 1.1388018166029203, "learning_rate": 5.104377367048342e-05, "loss": 0.8164, "step": 7149 }, { "epoch": 4.735099337748345, "grad_norm": 1.0910870857532011, "learning_rate": 5.099362109847824e-05, "loss": 0.8125, "step": 7150 }, { "epoch": 4.735761589403974, "grad_norm": 1.0975274580958678, "learning_rate": 5.094348813087121e-05, "loss": 0.7578, "step": 7151 }, { "epoch": 4.736423841059603, "grad_norm": 1.0581097904987777, "learning_rate": 5.089337477758919e-05, "loss": 0.7305, "step": 7152 }, { "epoch": 4.737086092715232, "grad_norm": 1.1201964295424447, "learning_rate": 5.084328104855511e-05, "loss": 0.8633, "step": 7153 }, { "epoch": 4.737748344370861, "grad_norm": 1.2435739535741215, "learning_rate": 5.079320695368819e-05, "loss": 0.8906, "step": 7154 }, { "epoch": 4.73841059602649, "grad_norm": 1.231746864094534, "learning_rate": 5.074315250290371e-05, "loss": 0.8672, "step": 7155 }, { "epoch": 4.739072847682119, "grad_norm": 1.1583337821362971, "learning_rate": 5.069311770611298e-05, "loss": 0.8125, "step": 7156 }, { "epoch": 4.739735099337748, "grad_norm": 1.1955960800541316, "learning_rate": 5.064310257322336e-05, "loss": 0.8125, "step": 7157 }, { "epoch": 4.740397350993377, "grad_norm": 1.1938773778790783, "learning_rate": 5.059310711413861e-05, "loss": 0.9297, "step": 7158 }, { "epoch": 4.741059602649006, "grad_norm": 1.230874836972059, "learning_rate": 5.054313133875833e-05, "loss": 0.8633, "step": 7159 }, { "epoch": 4.741721854304636, "grad_norm": 1.1991547294273075, "learning_rate": 5.049317525697823e-05, "loss": 0.8086, "step": 7160 }, { "epoch": 4.742384105960265, "grad_norm": 1.066769628467188, "learning_rate": 5.0443238878690217e-05, "loss": 0.6992, "step": 7161 }, { "epoch": 4.743046357615894, "grad_norm": 1.1067580674252477, "learning_rate": 5.0393322213782365e-05, "loss": 0.7578, "step": 7162 }, { "epoch": 4.743708609271523, "grad_norm": 1.2021088167374228, "learning_rate": 5.0343425272138606e-05, "loss": 0.875, "step": 7163 }, { "epoch": 4.744370860927153, "grad_norm": 1.348539761863938, "learning_rate": 5.029354806363921e-05, "loss": 1.0234, "step": 7164 }, { "epoch": 4.745033112582782, "grad_norm": 1.1541055714489195, "learning_rate": 5.024369059816035e-05, "loss": 0.8086, "step": 7165 }, { "epoch": 4.7456953642384105, "grad_norm": 1.2174528328538805, "learning_rate": 5.019385288557443e-05, "loss": 0.8672, "step": 7166 }, { "epoch": 4.7463576158940395, "grad_norm": 1.2117772520516918, "learning_rate": 5.0144034935749924e-05, "loss": 0.8516, "step": 7167 }, { "epoch": 4.747019867549669, "grad_norm": 1.1674270718010678, "learning_rate": 5.00942367585513e-05, "loss": 0.8828, "step": 7168 }, { "epoch": 4.747682119205298, "grad_norm": 1.0832283664466127, "learning_rate": 5.0044458363839144e-05, "loss": 0.7383, "step": 7169 }, { "epoch": 4.748344370860927, "grad_norm": 1.570906943764126, "learning_rate": 4.999469976147018e-05, "loss": 1.1094, "step": 7170 }, { "epoch": 4.749006622516556, "grad_norm": 1.1680387424937768, "learning_rate": 4.994496096129724e-05, "loss": 0.832, "step": 7171 }, { "epoch": 4.749668874172185, "grad_norm": 1.2290367897883943, "learning_rate": 4.9895241973169064e-05, "loss": 0.8477, "step": 7172 }, { "epoch": 4.750331125827815, "grad_norm": 1.1643297827947199, "learning_rate": 4.984554280693069e-05, "loss": 0.793, "step": 7173 }, { "epoch": 4.750993377483444, "grad_norm": 1.171335135311921, "learning_rate": 4.979586347242304e-05, "loss": 0.8242, "step": 7174 }, { "epoch": 4.751655629139073, "grad_norm": 1.0771202991260578, "learning_rate": 4.974620397948321e-05, "loss": 0.6875, "step": 7175 }, { "epoch": 4.752317880794702, "grad_norm": 1.1206934814728844, "learning_rate": 4.969656433794442e-05, "loss": 0.7617, "step": 7176 }, { "epoch": 4.752980132450331, "grad_norm": 1.1536339424463673, "learning_rate": 4.964694455763579e-05, "loss": 0.7852, "step": 7177 }, { "epoch": 4.7536423841059605, "grad_norm": 1.282018150722163, "learning_rate": 4.959734464838263e-05, "loss": 0.9492, "step": 7178 }, { "epoch": 4.7543046357615895, "grad_norm": 1.1007523001660744, "learning_rate": 4.9547764620006374e-05, "loss": 0.7344, "step": 7179 }, { "epoch": 4.754966887417218, "grad_norm": 1.2453785299082134, "learning_rate": 4.949820448232437e-05, "loss": 0.9297, "step": 7180 }, { "epoch": 4.755629139072847, "grad_norm": 1.277048308645179, "learning_rate": 4.944866424515003e-05, "loss": 0.9531, "step": 7181 }, { "epoch": 4.756291390728477, "grad_norm": 1.1832355031340849, "learning_rate": 4.939914391829296e-05, "loss": 0.8281, "step": 7182 }, { "epoch": 4.756953642384106, "grad_norm": 1.1944618209184032, "learning_rate": 4.93496435115588e-05, "loss": 0.8164, "step": 7183 }, { "epoch": 4.757615894039735, "grad_norm": 1.0784770790747988, "learning_rate": 4.93001630347491e-05, "loss": 0.8125, "step": 7184 }, { "epoch": 4.758278145695364, "grad_norm": 1.1928876695646673, "learning_rate": 4.925070249766164e-05, "loss": 0.9102, "step": 7185 }, { "epoch": 4.758940397350994, "grad_norm": 1.1339832149548208, "learning_rate": 4.92012619100901e-05, "loss": 0.8555, "step": 7186 }, { "epoch": 4.759602649006623, "grad_norm": 1.2044585172247242, "learning_rate": 4.9151841281824306e-05, "loss": 0.8125, "step": 7187 }, { "epoch": 4.760264900662252, "grad_norm": 1.1524824445690633, "learning_rate": 4.910244062265018e-05, "loss": 0.8438, "step": 7188 }, { "epoch": 4.760927152317881, "grad_norm": 1.2065007595189678, "learning_rate": 4.905305994234958e-05, "loss": 0.8242, "step": 7189 }, { "epoch": 4.76158940397351, "grad_norm": 1.0803387360724992, "learning_rate": 4.900369925070031e-05, "loss": 0.707, "step": 7190 }, { "epoch": 4.762251655629139, "grad_norm": 1.0858832587653307, "learning_rate": 4.895435855747659e-05, "loss": 0.7461, "step": 7191 }, { "epoch": 4.762913907284768, "grad_norm": 1.142773232752676, "learning_rate": 4.8905037872448315e-05, "loss": 0.832, "step": 7192 }, { "epoch": 4.763576158940397, "grad_norm": 1.3004154049541379, "learning_rate": 4.8855737205381503e-05, "loss": 0.9766, "step": 7193 }, { "epoch": 4.764238410596026, "grad_norm": 1.0666638591675446, "learning_rate": 4.880645656603831e-05, "loss": 0.7773, "step": 7194 }, { "epoch": 4.764900662251655, "grad_norm": 1.143019185769525, "learning_rate": 4.875719596417692e-05, "loss": 0.8906, "step": 7195 }, { "epoch": 4.765562913907285, "grad_norm": 1.202084605849404, "learning_rate": 4.870795540955138e-05, "loss": 0.8945, "step": 7196 }, { "epoch": 4.766225165562914, "grad_norm": 1.314272584439363, "learning_rate": 4.8658734911912014e-05, "loss": 0.8984, "step": 7197 }, { "epoch": 4.766887417218543, "grad_norm": 1.2843177845513354, "learning_rate": 4.86095344810049e-05, "loss": 0.9297, "step": 7198 }, { "epoch": 4.767549668874172, "grad_norm": 1.0176165726182163, "learning_rate": 4.8560354126572375e-05, "loss": 0.6602, "step": 7199 }, { "epoch": 4.768211920529802, "grad_norm": 1.1734179397178321, "learning_rate": 4.851119385835274e-05, "loss": 0.8359, "step": 7200 }, { "epoch": 4.768874172185431, "grad_norm": 1.0447069450270847, "learning_rate": 4.8462053686080264e-05, "loss": 0.6797, "step": 7201 }, { "epoch": 4.76953642384106, "grad_norm": 1.206181089514286, "learning_rate": 4.841293361948521e-05, "loss": 0.7656, "step": 7202 }, { "epoch": 4.7701986754966885, "grad_norm": 1.2038718853843677, "learning_rate": 4.836383366829395e-05, "loss": 0.8359, "step": 7203 }, { "epoch": 4.770860927152318, "grad_norm": 1.072471860513805, "learning_rate": 4.8314753842228886e-05, "loss": 0.6953, "step": 7204 }, { "epoch": 4.771523178807947, "grad_norm": 1.3284983442020013, "learning_rate": 4.82656941510083e-05, "loss": 0.9805, "step": 7205 }, { "epoch": 4.772185430463576, "grad_norm": 1.131439219344057, "learning_rate": 4.821665460434667e-05, "loss": 0.793, "step": 7206 }, { "epoch": 4.772847682119205, "grad_norm": 1.1942074755603045, "learning_rate": 4.8167635211954286e-05, "loss": 0.8477, "step": 7207 }, { "epoch": 4.773509933774834, "grad_norm": 1.222358636204658, "learning_rate": 4.81186359835376e-05, "loss": 0.8398, "step": 7208 }, { "epoch": 4.774172185430464, "grad_norm": 1.2014557929778804, "learning_rate": 4.806965692879907e-05, "loss": 0.8047, "step": 7209 }, { "epoch": 4.774834437086093, "grad_norm": 1.1699279034341865, "learning_rate": 4.8020698057437054e-05, "loss": 0.7227, "step": 7210 }, { "epoch": 4.775496688741722, "grad_norm": 1.1270894561086955, "learning_rate": 4.797175937914593e-05, "loss": 0.7227, "step": 7211 }, { "epoch": 4.776158940397351, "grad_norm": 1.2429824851769182, "learning_rate": 4.792284090361617e-05, "loss": 0.8242, "step": 7212 }, { "epoch": 4.77682119205298, "grad_norm": 1.3799122542420719, "learning_rate": 4.7873942640534234e-05, "loss": 0.9453, "step": 7213 }, { "epoch": 4.77748344370861, "grad_norm": 1.141064224729231, "learning_rate": 4.782506459958244e-05, "loss": 0.7344, "step": 7214 }, { "epoch": 4.7781456953642385, "grad_norm": 1.160508310016984, "learning_rate": 4.7776206790439256e-05, "loss": 0.7617, "step": 7215 }, { "epoch": 4.778807947019867, "grad_norm": 1.2270888783739182, "learning_rate": 4.772736922277915e-05, "loss": 0.8867, "step": 7216 }, { "epoch": 4.779470198675496, "grad_norm": 1.0712416443160644, "learning_rate": 4.7678551906272426e-05, "loss": 0.7188, "step": 7217 }, { "epoch": 4.780132450331126, "grad_norm": 1.1849827168098768, "learning_rate": 4.7629754850585475e-05, "loss": 0.8398, "step": 7218 }, { "epoch": 4.780794701986755, "grad_norm": 1.0937599741585422, "learning_rate": 4.758097806538069e-05, "loss": 0.707, "step": 7219 }, { "epoch": 4.781456953642384, "grad_norm": 1.1853483038709904, "learning_rate": 4.75322215603165e-05, "loss": 0.7773, "step": 7220 }, { "epoch": 4.782119205298013, "grad_norm": 1.164385890556684, "learning_rate": 4.748348534504714e-05, "loss": 0.7617, "step": 7221 }, { "epoch": 4.782781456953643, "grad_norm": 1.0737112184816073, "learning_rate": 4.743476942922305e-05, "loss": 0.6562, "step": 7222 }, { "epoch": 4.783443708609272, "grad_norm": 1.1136643304812823, "learning_rate": 4.7386073822490435e-05, "loss": 0.7461, "step": 7223 }, { "epoch": 4.784105960264901, "grad_norm": 1.1232421727022595, "learning_rate": 4.733739853449161e-05, "loss": 0.8008, "step": 7224 }, { "epoch": 4.78476821192053, "grad_norm": 1.126474321587426, "learning_rate": 4.728874357486495e-05, "loss": 0.7812, "step": 7225 }, { "epoch": 4.785430463576159, "grad_norm": 1.1873502910203428, "learning_rate": 4.724010895324458e-05, "loss": 0.8008, "step": 7226 }, { "epoch": 4.7860927152317885, "grad_norm": 1.0430732437740926, "learning_rate": 4.7191494679260705e-05, "loss": 0.7422, "step": 7227 }, { "epoch": 4.786754966887417, "grad_norm": 1.2281989077400766, "learning_rate": 4.714290076253955e-05, "loss": 0.7812, "step": 7228 }, { "epoch": 4.787417218543046, "grad_norm": 1.267796257062858, "learning_rate": 4.7094327212703296e-05, "loss": 0.832, "step": 7229 }, { "epoch": 4.788079470198675, "grad_norm": 1.0957312526178395, "learning_rate": 4.7045774039369995e-05, "loss": 0.7539, "step": 7230 }, { "epoch": 4.788741721854304, "grad_norm": 1.129782064356979, "learning_rate": 4.699724125215376e-05, "loss": 0.7695, "step": 7231 }, { "epoch": 4.789403973509934, "grad_norm": 1.2872351134949995, "learning_rate": 4.69487288606647e-05, "loss": 0.875, "step": 7232 }, { "epoch": 4.790066225165563, "grad_norm": 1.2303684799279542, "learning_rate": 4.6900236874508704e-05, "loss": 0.8242, "step": 7233 }, { "epoch": 4.790728476821192, "grad_norm": 1.108360759388955, "learning_rate": 4.685176530328787e-05, "loss": 0.7148, "step": 7234 }, { "epoch": 4.791390728476821, "grad_norm": 1.1420107292553023, "learning_rate": 4.6803314156600005e-05, "loss": 0.7695, "step": 7235 }, { "epoch": 4.79205298013245, "grad_norm": 1.0606104148340048, "learning_rate": 4.675488344403906e-05, "loss": 0.6836, "step": 7236 }, { "epoch": 4.79271523178808, "grad_norm": 1.0929066685688216, "learning_rate": 4.67064731751949e-05, "loss": 0.7422, "step": 7237 }, { "epoch": 4.793377483443709, "grad_norm": 1.130344448663671, "learning_rate": 4.665808335965327e-05, "loss": 0.7031, "step": 7238 }, { "epoch": 4.794039735099338, "grad_norm": 1.2692255276753004, "learning_rate": 4.660971400699586e-05, "loss": 0.8086, "step": 7239 }, { "epoch": 4.7947019867549665, "grad_norm": 1.1811844635449245, "learning_rate": 4.6561365126800414e-05, "loss": 0.8086, "step": 7240 }, { "epoch": 4.795364238410596, "grad_norm": 1.412055619775279, "learning_rate": 4.6513036728640616e-05, "loss": 0.9375, "step": 7241 }, { "epoch": 4.796026490066225, "grad_norm": 1.1680503005248548, "learning_rate": 4.6464728822085915e-05, "loss": 0.8242, "step": 7242 }, { "epoch": 4.796688741721854, "grad_norm": 1.227137338284097, "learning_rate": 4.6416441416701975e-05, "loss": 0.8008, "step": 7243 }, { "epoch": 4.797350993377483, "grad_norm": 1.1027400623574954, "learning_rate": 4.636817452205012e-05, "loss": 0.7227, "step": 7244 }, { "epoch": 4.798013245033113, "grad_norm": 1.2064146052760607, "learning_rate": 4.6319928147687804e-05, "loss": 0.8672, "step": 7245 }, { "epoch": 4.798675496688742, "grad_norm": 1.0645242665138956, "learning_rate": 4.6271702303168425e-05, "loss": 0.6914, "step": 7246 }, { "epoch": 4.799337748344371, "grad_norm": 1.1412294577166242, "learning_rate": 4.622349699804119e-05, "loss": 0.7734, "step": 7247 }, { "epoch": 4.8, "grad_norm": 1.055176889692667, "learning_rate": 4.6175312241851214e-05, "loss": 0.6641, "step": 7248 }, { "epoch": 4.800662251655629, "grad_norm": 1.144096998756092, "learning_rate": 4.612714804413982e-05, "loss": 0.7383, "step": 7249 }, { "epoch": 4.801324503311259, "grad_norm": 1.1725531716423667, "learning_rate": 4.607900441444397e-05, "loss": 0.8555, "step": 7250 }, { "epoch": 4.8019867549668875, "grad_norm": 1.477249261971805, "learning_rate": 4.6030881362296633e-05, "loss": 1.1094, "step": 7251 }, { "epoch": 4.8026490066225165, "grad_norm": 1.103034167549984, "learning_rate": 4.598277889722674e-05, "loss": 0.7109, "step": 7252 }, { "epoch": 4.803311258278145, "grad_norm": 1.191031974826129, "learning_rate": 4.5934697028759185e-05, "loss": 0.7578, "step": 7253 }, { "epoch": 4.803973509933774, "grad_norm": 1.049916896903748, "learning_rate": 4.588663576641466e-05, "loss": 0.7227, "step": 7254 }, { "epoch": 4.804635761589404, "grad_norm": 1.106471165351405, "learning_rate": 4.583859511970991e-05, "loss": 0.7461, "step": 7255 }, { "epoch": 4.805298013245033, "grad_norm": 1.1065069565393768, "learning_rate": 4.5790575098157444e-05, "loss": 0.7422, "step": 7256 }, { "epoch": 4.805960264900662, "grad_norm": 1.0676115276861464, "learning_rate": 4.574257571126585e-05, "loss": 0.793, "step": 7257 }, { "epoch": 4.806622516556291, "grad_norm": 1.3485860573481734, "learning_rate": 4.5694596968539556e-05, "loss": 0.9492, "step": 7258 }, { "epoch": 4.807284768211921, "grad_norm": 1.0946302294894306, "learning_rate": 4.5646638879478905e-05, "loss": 0.7422, "step": 7259 }, { "epoch": 4.80794701986755, "grad_norm": 1.1644860782479798, "learning_rate": 4.5598701453580076e-05, "loss": 0.8398, "step": 7260 }, { "epoch": 4.808609271523179, "grad_norm": 1.2460177599886844, "learning_rate": 4.5550784700335285e-05, "loss": 0.8477, "step": 7261 }, { "epoch": 4.809271523178808, "grad_norm": 1.1573926437812292, "learning_rate": 4.550288862923264e-05, "loss": 0.832, "step": 7262 }, { "epoch": 4.8099337748344375, "grad_norm": 1.3365551600813912, "learning_rate": 4.545501324975601e-05, "loss": 0.9922, "step": 7263 }, { "epoch": 4.8105960264900665, "grad_norm": 1.41429628585556, "learning_rate": 4.540715857138539e-05, "loss": 0.9883, "step": 7264 }, { "epoch": 4.811258278145695, "grad_norm": 1.4715731329949375, "learning_rate": 4.535932460359643e-05, "loss": 1.1406, "step": 7265 }, { "epoch": 4.811920529801324, "grad_norm": 1.119232244641852, "learning_rate": 4.531151135586086e-05, "loss": 0.75, "step": 7266 }, { "epoch": 4.812582781456953, "grad_norm": 1.0968290081627292, "learning_rate": 4.526371883764632e-05, "loss": 0.7188, "step": 7267 }, { "epoch": 4.813245033112583, "grad_norm": 1.1985449875114043, "learning_rate": 4.521594705841615e-05, "loss": 0.7773, "step": 7268 }, { "epoch": 4.813907284768212, "grad_norm": 1.4390652035768554, "learning_rate": 4.516819602762983e-05, "loss": 1.0781, "step": 7269 }, { "epoch": 4.814569536423841, "grad_norm": 1.239334825772509, "learning_rate": 4.5120465754742514e-05, "loss": 0.8867, "step": 7270 }, { "epoch": 4.81523178807947, "grad_norm": 1.0662899560050692, "learning_rate": 4.5072756249205414e-05, "loss": 0.7305, "step": 7271 }, { "epoch": 4.815894039735099, "grad_norm": 1.211115363466212, "learning_rate": 4.5025067520465486e-05, "loss": 0.8672, "step": 7272 }, { "epoch": 4.816556291390729, "grad_norm": 1.1501225565940723, "learning_rate": 4.49773995779657e-05, "loss": 0.8203, "step": 7273 }, { "epoch": 4.817218543046358, "grad_norm": 1.0520112958921706, "learning_rate": 4.492975243114488e-05, "loss": 0.7148, "step": 7274 }, { "epoch": 4.817880794701987, "grad_norm": 1.15823393551042, "learning_rate": 4.488212608943765e-05, "loss": 0.7344, "step": 7275 }, { "epoch": 4.8185430463576155, "grad_norm": 1.1941209897866594, "learning_rate": 4.4834520562274565e-05, "loss": 0.9102, "step": 7276 }, { "epoch": 4.819205298013245, "grad_norm": 1.1144820690331463, "learning_rate": 4.47869358590821e-05, "loss": 0.7578, "step": 7277 }, { "epoch": 4.819867549668874, "grad_norm": 1.2278086652452413, "learning_rate": 4.4739371989282605e-05, "loss": 0.8047, "step": 7278 }, { "epoch": 4.820529801324503, "grad_norm": 1.133994046167678, "learning_rate": 4.469182896229417e-05, "loss": 0.7305, "step": 7279 }, { "epoch": 4.821192052980132, "grad_norm": 1.2798745930153606, "learning_rate": 4.464430678753096e-05, "loss": 0.8242, "step": 7280 }, { "epoch": 4.821854304635762, "grad_norm": 1.3500526098963184, "learning_rate": 4.4596805474402845e-05, "loss": 0.9648, "step": 7281 }, { "epoch": 4.822516556291391, "grad_norm": 1.2230366733534423, "learning_rate": 4.454932503231564e-05, "loss": 0.8594, "step": 7282 }, { "epoch": 4.82317880794702, "grad_norm": 1.1459466475464861, "learning_rate": 4.4501865470671095e-05, "loss": 0.8867, "step": 7283 }, { "epoch": 4.823841059602649, "grad_norm": 1.2965641426234706, "learning_rate": 4.445442679886667e-05, "loss": 0.9688, "step": 7284 }, { "epoch": 4.824503311258278, "grad_norm": 1.148667004616082, "learning_rate": 4.440700902629568e-05, "loss": 0.7461, "step": 7285 }, { "epoch": 4.825165562913908, "grad_norm": 1.1450806733741905, "learning_rate": 4.4359612162347594e-05, "loss": 0.875, "step": 7286 }, { "epoch": 4.825827814569537, "grad_norm": 1.2967620261005925, "learning_rate": 4.431223621640743e-05, "loss": 0.8945, "step": 7287 }, { "epoch": 4.8264900662251655, "grad_norm": 1.2618002949089853, "learning_rate": 4.4264881197856093e-05, "loss": 0.793, "step": 7288 }, { "epoch": 4.8271523178807945, "grad_norm": 1.0833462742744386, "learning_rate": 4.421754711607052e-05, "loss": 0.7305, "step": 7289 }, { "epoch": 4.827814569536423, "grad_norm": 1.2721430255211277, "learning_rate": 4.4170233980423414e-05, "loss": 0.8789, "step": 7290 }, { "epoch": 4.828476821192053, "grad_norm": 1.117055495391883, "learning_rate": 4.412294180028322e-05, "loss": 0.7891, "step": 7291 }, { "epoch": 4.829139072847682, "grad_norm": 1.063283806776567, "learning_rate": 4.407567058501445e-05, "loss": 0.6875, "step": 7292 }, { "epoch": 4.829801324503311, "grad_norm": 1.3350242838116635, "learning_rate": 4.4028420343977235e-05, "loss": 0.9336, "step": 7293 }, { "epoch": 4.83046357615894, "grad_norm": 1.1351646141885594, "learning_rate": 4.398119108652772e-05, "loss": 0.7695, "step": 7294 }, { "epoch": 4.83112582781457, "grad_norm": 1.0186945442175683, "learning_rate": 4.3933982822017876e-05, "loss": 0.6914, "step": 7295 }, { "epoch": 4.831788079470199, "grad_norm": 1.1620782701115138, "learning_rate": 4.388679555979544e-05, "loss": 0.7656, "step": 7296 }, { "epoch": 4.832450331125828, "grad_norm": 1.2153072972189078, "learning_rate": 4.383962930920399e-05, "loss": 0.8359, "step": 7297 }, { "epoch": 4.833112582781457, "grad_norm": 1.0455454477178099, "learning_rate": 4.3792484079583036e-05, "loss": 0.7266, "step": 7298 }, { "epoch": 4.8337748344370866, "grad_norm": 1.1255336400653617, "learning_rate": 4.3745359880267895e-05, "loss": 0.75, "step": 7299 }, { "epoch": 4.8344370860927155, "grad_norm": 1.3057101323437887, "learning_rate": 4.369825672058962e-05, "loss": 0.8125, "step": 7300 }, { "epoch": 4.835099337748344, "grad_norm": 1.1708970991693863, "learning_rate": 4.3651174609875284e-05, "loss": 0.7891, "step": 7301 }, { "epoch": 4.835761589403973, "grad_norm": 1.0417176198254414, "learning_rate": 4.360411355744756e-05, "loss": 0.7578, "step": 7302 }, { "epoch": 4.836423841059602, "grad_norm": 1.1206252567475228, "learning_rate": 4.355707357262515e-05, "loss": 0.6875, "step": 7303 }, { "epoch": 4.837086092715232, "grad_norm": 1.201093092778024, "learning_rate": 4.3510054664722546e-05, "loss": 0.832, "step": 7304 }, { "epoch": 4.837748344370861, "grad_norm": 1.0382061224308494, "learning_rate": 4.3463056843049995e-05, "loss": 0.6406, "step": 7305 }, { "epoch": 4.83841059602649, "grad_norm": 1.1789375649684357, "learning_rate": 4.341608011691349e-05, "loss": 0.7695, "step": 7306 }, { "epoch": 4.839072847682119, "grad_norm": 1.1584086134294098, "learning_rate": 4.3369124495615176e-05, "loss": 0.6914, "step": 7307 }, { "epoch": 4.839735099337748, "grad_norm": 1.1118714008213975, "learning_rate": 4.3322189988452674e-05, "loss": 0.7227, "step": 7308 }, { "epoch": 4.840397350993378, "grad_norm": 1.0953891782432186, "learning_rate": 4.327527660471954e-05, "loss": 0.7656, "step": 7309 }, { "epoch": 4.841059602649007, "grad_norm": 1.1831892743111774, "learning_rate": 4.3228384353705194e-05, "loss": 0.8242, "step": 7310 }, { "epoch": 4.841721854304636, "grad_norm": 1.1211450736125075, "learning_rate": 4.3181513244694905e-05, "loss": 0.6914, "step": 7311 }, { "epoch": 4.842384105960265, "grad_norm": 1.1579053118449358, "learning_rate": 4.313466328696955e-05, "loss": 0.8008, "step": 7312 }, { "epoch": 4.843046357615894, "grad_norm": 1.1254179317054087, "learning_rate": 4.30878344898061e-05, "loss": 0.7578, "step": 7313 }, { "epoch": 4.843708609271523, "grad_norm": 1.1586489666176043, "learning_rate": 4.304102686247708e-05, "loss": 0.8555, "step": 7314 }, { "epoch": 4.844370860927152, "grad_norm": 1.3016385643646549, "learning_rate": 4.299424041425099e-05, "loss": 0.8867, "step": 7315 }, { "epoch": 4.845033112582781, "grad_norm": 1.104516746616668, "learning_rate": 4.2947475154392124e-05, "loss": 0.75, "step": 7316 }, { "epoch": 4.845695364238411, "grad_norm": 1.3259959689923941, "learning_rate": 4.29007310921605e-05, "loss": 0.8867, "step": 7317 }, { "epoch": 4.84635761589404, "grad_norm": 1.2595064765106416, "learning_rate": 4.2854008236811915e-05, "loss": 0.8086, "step": 7318 }, { "epoch": 4.847019867549669, "grad_norm": 1.0927094435415716, "learning_rate": 4.28073065975981e-05, "loss": 0.6875, "step": 7319 }, { "epoch": 4.847682119205298, "grad_norm": 1.0176775356399204, "learning_rate": 4.276062618376654e-05, "loss": 0.7109, "step": 7320 }, { "epoch": 4.848344370860927, "grad_norm": 1.1131454656987099, "learning_rate": 4.27139670045604e-05, "loss": 0.7227, "step": 7321 }, { "epoch": 4.849006622516557, "grad_norm": 1.1894464678290488, "learning_rate": 4.2667329069218845e-05, "loss": 0.8008, "step": 7322 }, { "epoch": 4.849668874172186, "grad_norm": 1.2134539698532596, "learning_rate": 4.262071238697663e-05, "loss": 0.7812, "step": 7323 }, { "epoch": 4.850331125827815, "grad_norm": 1.1812973051351516, "learning_rate": 4.257411696706445e-05, "loss": 0.8047, "step": 7324 }, { "epoch": 4.8509933774834435, "grad_norm": 1.2616527120329524, "learning_rate": 4.252754281870867e-05, "loss": 0.8125, "step": 7325 }, { "epoch": 4.8516556291390724, "grad_norm": 1.023269705573654, "learning_rate": 4.248098995113155e-05, "loss": 0.6445, "step": 7326 }, { "epoch": 4.852317880794702, "grad_norm": 1.0977692850328138, "learning_rate": 4.243445837355111e-05, "loss": 0.7656, "step": 7327 }, { "epoch": 4.852980132450331, "grad_norm": 1.1585361827293845, "learning_rate": 4.2387948095181085e-05, "loss": 0.9336, "step": 7328 }, { "epoch": 4.85364238410596, "grad_norm": 1.0955392079935942, "learning_rate": 4.2341459125231106e-05, "loss": 0.7461, "step": 7329 }, { "epoch": 4.854304635761589, "grad_norm": 1.1209557351927768, "learning_rate": 4.229499147290645e-05, "loss": 0.7109, "step": 7330 }, { "epoch": 4.854966887417218, "grad_norm": 1.2349125637116352, "learning_rate": 4.224854514740827e-05, "loss": 0.8516, "step": 7331 }, { "epoch": 4.855629139072848, "grad_norm": 1.1365769374893557, "learning_rate": 4.220212015793353e-05, "loss": 0.7031, "step": 7332 }, { "epoch": 4.856291390728477, "grad_norm": 1.23597994581889, "learning_rate": 4.215571651367486e-05, "loss": 0.8164, "step": 7333 }, { "epoch": 4.856953642384106, "grad_norm": 1.2618447845655199, "learning_rate": 4.210933422382067e-05, "loss": 0.8633, "step": 7334 }, { "epoch": 4.857615894039736, "grad_norm": 1.2240117497689331, "learning_rate": 4.206297329755523e-05, "loss": 0.8594, "step": 7335 }, { "epoch": 4.8582781456953645, "grad_norm": 1.2345581190940509, "learning_rate": 4.201663374405857e-05, "loss": 0.832, "step": 7336 }, { "epoch": 4.8589403973509935, "grad_norm": 1.1263304846181992, "learning_rate": 4.197031557250638e-05, "loss": 0.8125, "step": 7337 }, { "epoch": 4.859602649006622, "grad_norm": 1.1565296619018002, "learning_rate": 4.1924018792070283e-05, "loss": 0.7891, "step": 7338 }, { "epoch": 4.860264900662251, "grad_norm": 1.1231202633561745, "learning_rate": 4.187774341191745e-05, "loss": 0.6758, "step": 7339 }, { "epoch": 4.860927152317881, "grad_norm": 1.2867019548779108, "learning_rate": 4.1831489441211016e-05, "loss": 0.8945, "step": 7340 }, { "epoch": 4.86158940397351, "grad_norm": 1.191010978987119, "learning_rate": 4.1785256889109825e-05, "loss": 0.8203, "step": 7341 }, { "epoch": 4.862251655629139, "grad_norm": 1.1319905310306495, "learning_rate": 4.1739045764768414e-05, "loss": 0.7578, "step": 7342 }, { "epoch": 4.862913907284768, "grad_norm": 1.0638498665992946, "learning_rate": 4.169285607733703e-05, "loss": 0.6641, "step": 7343 }, { "epoch": 4.863576158940397, "grad_norm": 1.161734758139168, "learning_rate": 4.1646687835961915e-05, "loss": 0.7656, "step": 7344 }, { "epoch": 4.864238410596027, "grad_norm": 1.1280494274083037, "learning_rate": 4.160054104978485e-05, "loss": 0.7539, "step": 7345 }, { "epoch": 4.864900662251656, "grad_norm": 1.0537209988343053, "learning_rate": 4.155441572794338e-05, "loss": 0.7383, "step": 7346 }, { "epoch": 4.865562913907285, "grad_norm": 1.2399869568515673, "learning_rate": 4.150831187957088e-05, "loss": 0.8008, "step": 7347 }, { "epoch": 4.866225165562914, "grad_norm": 1.1484586856293266, "learning_rate": 4.1462229513796474e-05, "loss": 0.8047, "step": 7348 }, { "epoch": 4.866887417218543, "grad_norm": 1.195994033044378, "learning_rate": 4.141616863974494e-05, "loss": 0.793, "step": 7349 }, { "epoch": 4.867549668874172, "grad_norm": 1.0873323855434467, "learning_rate": 4.1370129266536926e-05, "loss": 0.7305, "step": 7350 }, { "epoch": 4.868211920529801, "grad_norm": 1.1855084135421061, "learning_rate": 4.132411140328868e-05, "loss": 0.7891, "step": 7351 }, { "epoch": 4.86887417218543, "grad_norm": 1.1897271924176702, "learning_rate": 4.127811505911229e-05, "loss": 0.8047, "step": 7352 }, { "epoch": 4.869536423841059, "grad_norm": 1.2456769290462968, "learning_rate": 4.123214024311564e-05, "loss": 0.8203, "step": 7353 }, { "epoch": 4.870198675496689, "grad_norm": 1.1353353165025122, "learning_rate": 4.1186186964402193e-05, "loss": 0.7031, "step": 7354 }, { "epoch": 4.870860927152318, "grad_norm": 1.3195166399252585, "learning_rate": 4.114025523207119e-05, "loss": 0.8633, "step": 7355 }, { "epoch": 4.871523178807947, "grad_norm": 1.1461496251778847, "learning_rate": 4.109434505521769e-05, "loss": 0.707, "step": 7356 }, { "epoch": 4.872185430463576, "grad_norm": 1.0749130449152144, "learning_rate": 4.104845644293247e-05, "loss": 0.6992, "step": 7357 }, { "epoch": 4.872847682119206, "grad_norm": 1.290555725984743, "learning_rate": 4.100258940430192e-05, "loss": 0.8125, "step": 7358 }, { "epoch": 4.873509933774835, "grad_norm": 1.094416815594876, "learning_rate": 4.0956743948408335e-05, "loss": 0.7148, "step": 7359 }, { "epoch": 4.874172185430464, "grad_norm": 1.1350508059230782, "learning_rate": 4.0910920084329546e-05, "loss": 0.7422, "step": 7360 }, { "epoch": 4.8748344370860925, "grad_norm": 1.0414071647330025, "learning_rate": 4.0865117821139234e-05, "loss": 0.7031, "step": 7361 }, { "epoch": 4.8754966887417215, "grad_norm": 1.3061344760682405, "learning_rate": 4.081933716790684e-05, "loss": 0.8086, "step": 7362 }, { "epoch": 4.876158940397351, "grad_norm": 1.252790970920827, "learning_rate": 4.077357813369734e-05, "loss": 0.8477, "step": 7363 }, { "epoch": 4.87682119205298, "grad_norm": 1.1621631190292105, "learning_rate": 4.072784072757163e-05, "loss": 0.8047, "step": 7364 }, { "epoch": 4.877483443708609, "grad_norm": 1.1359621830483646, "learning_rate": 4.068212495858625e-05, "loss": 0.8125, "step": 7365 }, { "epoch": 4.878145695364238, "grad_norm": 0.9911549000381724, "learning_rate": 4.0636430835793435e-05, "loss": 0.6992, "step": 7366 }, { "epoch": 4.878807947019867, "grad_norm": 1.197650709045364, "learning_rate": 4.059075836824106e-05, "loss": 0.8242, "step": 7367 }, { "epoch": 4.879470198675497, "grad_norm": 1.1239053335877072, "learning_rate": 4.054510756497287e-05, "loss": 0.8359, "step": 7368 }, { "epoch": 4.880132450331126, "grad_norm": 1.2112996895774413, "learning_rate": 4.04994784350283e-05, "loss": 0.8086, "step": 7369 }, { "epoch": 4.880794701986755, "grad_norm": 1.1607143138848424, "learning_rate": 4.0453870987442326e-05, "loss": 0.8281, "step": 7370 }, { "epoch": 4.881456953642384, "grad_norm": 1.2662302782815515, "learning_rate": 4.040828523124585e-05, "loss": 0.8594, "step": 7371 }, { "epoch": 4.882119205298014, "grad_norm": 1.134417942226216, "learning_rate": 4.036272117546527e-05, "loss": 0.7266, "step": 7372 }, { "epoch": 4.8827814569536425, "grad_norm": 1.183900115758477, "learning_rate": 4.031717882912288e-05, "loss": 0.8242, "step": 7373 }, { "epoch": 4.8834437086092715, "grad_norm": 1.1413414710918592, "learning_rate": 4.0271658201236564e-05, "loss": 0.7188, "step": 7374 }, { "epoch": 4.8841059602649, "grad_norm": 1.1853129716550297, "learning_rate": 4.0226159300819935e-05, "loss": 0.793, "step": 7375 }, { "epoch": 4.88476821192053, "grad_norm": 1.2455545688884337, "learning_rate": 4.018068213688223e-05, "loss": 0.8477, "step": 7376 }, { "epoch": 4.885430463576159, "grad_norm": 1.268705041051395, "learning_rate": 4.0135226718428503e-05, "loss": 0.8633, "step": 7377 }, { "epoch": 4.886092715231788, "grad_norm": 1.131993471624313, "learning_rate": 4.008979305445949e-05, "loss": 0.7773, "step": 7378 }, { "epoch": 4.886754966887417, "grad_norm": 1.1500662783551423, "learning_rate": 4.0044381153971474e-05, "loss": 0.7031, "step": 7379 }, { "epoch": 4.887417218543046, "grad_norm": 1.2138783555705959, "learning_rate": 3.99989910259566e-05, "loss": 0.918, "step": 7380 }, { "epoch": 4.888079470198676, "grad_norm": 1.2309992196131647, "learning_rate": 3.9953622679402684e-05, "loss": 0.8516, "step": 7381 }, { "epoch": 4.888741721854305, "grad_norm": 1.1910162264895454, "learning_rate": 3.99082761232931e-05, "loss": 0.7656, "step": 7382 }, { "epoch": 4.889403973509934, "grad_norm": 1.2438151380717715, "learning_rate": 3.986295136660696e-05, "loss": 0.8242, "step": 7383 }, { "epoch": 4.890066225165563, "grad_norm": 1.1880058028807245, "learning_rate": 3.981764841831913e-05, "loss": 0.8984, "step": 7384 }, { "epoch": 4.890728476821192, "grad_norm": 1.1206723809412584, "learning_rate": 3.977236728740015e-05, "loss": 0.6914, "step": 7385 }, { "epoch": 4.891390728476821, "grad_norm": 1.1357520591782848, "learning_rate": 3.972710798281614e-05, "loss": 0.7812, "step": 7386 }, { "epoch": 4.89205298013245, "grad_norm": 1.0862549076758212, "learning_rate": 3.968187051352903e-05, "loss": 0.6953, "step": 7387 }, { "epoch": 4.892715231788079, "grad_norm": 1.151811337553754, "learning_rate": 3.963665488849625e-05, "loss": 0.7695, "step": 7388 }, { "epoch": 4.893377483443708, "grad_norm": 1.1061325935668704, "learning_rate": 3.9591461116671086e-05, "loss": 0.793, "step": 7389 }, { "epoch": 4.894039735099338, "grad_norm": 1.3668979957365022, "learning_rate": 3.954628920700246e-05, "loss": 0.9766, "step": 7390 }, { "epoch": 4.894701986754967, "grad_norm": 1.2967495649681884, "learning_rate": 3.9501139168434874e-05, "loss": 0.8945, "step": 7391 }, { "epoch": 4.895364238410596, "grad_norm": 1.1139410964612915, "learning_rate": 3.945601100990854e-05, "loss": 0.7422, "step": 7392 }, { "epoch": 4.896026490066225, "grad_norm": 1.2826204265690138, "learning_rate": 3.941090474035935e-05, "loss": 0.9492, "step": 7393 }, { "epoch": 4.896688741721855, "grad_norm": 1.2576171826140257, "learning_rate": 3.9365820368718934e-05, "loss": 0.8477, "step": 7394 }, { "epoch": 4.897350993377484, "grad_norm": 1.1058632325962618, "learning_rate": 3.932075790391441e-05, "loss": 0.7617, "step": 7395 }, { "epoch": 4.898013245033113, "grad_norm": 1.1177047076256934, "learning_rate": 3.927571735486878e-05, "loss": 0.7344, "step": 7396 }, { "epoch": 4.898675496688742, "grad_norm": 0.9661482192716608, "learning_rate": 3.923069873050046e-05, "loss": 0.6016, "step": 7397 }, { "epoch": 4.8993377483443705, "grad_norm": 1.1196751148827702, "learning_rate": 3.918570203972375e-05, "loss": 0.8906, "step": 7398 }, { "epoch": 4.9, "grad_norm": 1.2279922519779698, "learning_rate": 3.9140727291448496e-05, "loss": 0.9023, "step": 7399 }, { "epoch": 4.900662251655629, "grad_norm": 1.2380465029829928, "learning_rate": 3.909577449458021e-05, "loss": 0.9336, "step": 7400 }, { "epoch": 4.901324503311258, "grad_norm": 1.0945563577955695, "learning_rate": 3.905084365801997e-05, "loss": 0.7578, "step": 7401 }, { "epoch": 4.901986754966887, "grad_norm": 1.1520975178139263, "learning_rate": 3.900593479066475e-05, "loss": 0.7578, "step": 7402 }, { "epoch": 4.902649006622516, "grad_norm": 1.1320352380759897, "learning_rate": 3.896104790140695e-05, "loss": 0.7422, "step": 7403 }, { "epoch": 4.903311258278146, "grad_norm": 1.0634766131282478, "learning_rate": 3.891618299913464e-05, "loss": 0.707, "step": 7404 }, { "epoch": 4.903973509933775, "grad_norm": 1.1500455703912453, "learning_rate": 3.887134009273165e-05, "loss": 0.7031, "step": 7405 }, { "epoch": 4.904635761589404, "grad_norm": 1.15901713103062, "learning_rate": 3.882651919107739e-05, "loss": 0.7852, "step": 7406 }, { "epoch": 4.905298013245033, "grad_norm": 1.0764623165055063, "learning_rate": 3.878172030304686e-05, "loss": 0.6641, "step": 7407 }, { "epoch": 4.905960264900663, "grad_norm": 1.3904913464729862, "learning_rate": 3.873694343751083e-05, "loss": 0.8633, "step": 7408 }, { "epoch": 4.906622516556292, "grad_norm": 1.196145800648721, "learning_rate": 3.869218860333555e-05, "loss": 0.8242, "step": 7409 }, { "epoch": 4.9072847682119205, "grad_norm": 1.1121733244338483, "learning_rate": 3.8647455809383006e-05, "loss": 0.7383, "step": 7410 }, { "epoch": 4.907947019867549, "grad_norm": 1.1811706188537838, "learning_rate": 3.8602745064510895e-05, "loss": 0.7461, "step": 7411 }, { "epoch": 4.908609271523179, "grad_norm": 1.2397208688265213, "learning_rate": 3.8558056377572376e-05, "loss": 0.7773, "step": 7412 }, { "epoch": 4.909271523178808, "grad_norm": 1.2785836511803408, "learning_rate": 3.85133897574163e-05, "loss": 0.832, "step": 7413 }, { "epoch": 4.909933774834437, "grad_norm": 1.2637534615777861, "learning_rate": 3.846874521288719e-05, "loss": 0.8945, "step": 7414 }, { "epoch": 4.910596026490066, "grad_norm": 1.106875400447583, "learning_rate": 3.842412275282524e-05, "loss": 0.7266, "step": 7415 }, { "epoch": 4.911258278145695, "grad_norm": 1.150522406354129, "learning_rate": 3.837952238606613e-05, "loss": 0.7656, "step": 7416 }, { "epoch": 4.911920529801325, "grad_norm": 1.3162627602555983, "learning_rate": 3.833494412144126e-05, "loss": 0.957, "step": 7417 }, { "epoch": 4.912582781456954, "grad_norm": 1.1418974662127217, "learning_rate": 3.829038796777771e-05, "loss": 0.6836, "step": 7418 }, { "epoch": 4.913245033112583, "grad_norm": 1.1073578960391084, "learning_rate": 3.824585393389799e-05, "loss": 0.6797, "step": 7419 }, { "epoch": 4.913907284768212, "grad_norm": 1.0973551710581024, "learning_rate": 3.820134202862047e-05, "loss": 0.7539, "step": 7420 }, { "epoch": 4.914569536423841, "grad_norm": 1.215744566754476, "learning_rate": 3.815685226075891e-05, "loss": 0.8398, "step": 7421 }, { "epoch": 4.9152317880794705, "grad_norm": 1.1106865172834657, "learning_rate": 3.811238463912284e-05, "loss": 0.7383, "step": 7422 }, { "epoch": 4.915894039735099, "grad_norm": 1.263285886371103, "learning_rate": 3.8067939172517405e-05, "loss": 0.8633, "step": 7423 }, { "epoch": 4.916556291390728, "grad_norm": 1.0446859986871406, "learning_rate": 3.802351586974326e-05, "loss": 0.6133, "step": 7424 }, { "epoch": 4.917218543046357, "grad_norm": 1.1841798161897914, "learning_rate": 3.797911473959673e-05, "loss": 0.8125, "step": 7425 }, { "epoch": 4.917880794701987, "grad_norm": 1.1782557272645842, "learning_rate": 3.793473579086973e-05, "loss": 0.8125, "step": 7426 }, { "epoch": 4.918543046357616, "grad_norm": 1.2110764770630233, "learning_rate": 3.789037903234989e-05, "loss": 0.7656, "step": 7427 }, { "epoch": 4.919205298013245, "grad_norm": 1.27252552564894, "learning_rate": 3.784604447282024e-05, "loss": 0.8398, "step": 7428 }, { "epoch": 4.919867549668874, "grad_norm": 1.1759647720892619, "learning_rate": 3.7801732121059645e-05, "loss": 0.7617, "step": 7429 }, { "epoch": 4.920529801324504, "grad_norm": 1.2524792378718823, "learning_rate": 3.775744198584234e-05, "loss": 0.9336, "step": 7430 }, { "epoch": 4.921192052980133, "grad_norm": 1.3238806590255883, "learning_rate": 3.771317407593836e-05, "loss": 0.8438, "step": 7431 }, { "epoch": 4.921854304635762, "grad_norm": 1.29174805138259, "learning_rate": 3.766892840011327e-05, "loss": 0.8984, "step": 7432 }, { "epoch": 4.922516556291391, "grad_norm": 1.3434236817615206, "learning_rate": 3.76247049671282e-05, "loss": 0.957, "step": 7433 }, { "epoch": 4.92317880794702, "grad_norm": 1.0572505452828616, "learning_rate": 3.75805037857398e-05, "loss": 0.6602, "step": 7434 }, { "epoch": 4.923841059602649, "grad_norm": 1.2583039369837488, "learning_rate": 3.753632486470057e-05, "loss": 0.7773, "step": 7435 }, { "epoch": 4.924503311258278, "grad_norm": 1.2320566017604937, "learning_rate": 3.7492168212758385e-05, "loss": 0.793, "step": 7436 }, { "epoch": 4.925165562913907, "grad_norm": 1.2195255261468456, "learning_rate": 3.74480338386567e-05, "loss": 0.8242, "step": 7437 }, { "epoch": 4.925827814569536, "grad_norm": 1.1563347398383332, "learning_rate": 3.740392175113468e-05, "loss": 0.8086, "step": 7438 }, { "epoch": 4.926490066225165, "grad_norm": 1.2029028194282652, "learning_rate": 3.735983195892709e-05, "loss": 0.8047, "step": 7439 }, { "epoch": 4.927152317880795, "grad_norm": 1.3334485563015717, "learning_rate": 3.731576447076413e-05, "loss": 0.875, "step": 7440 }, { "epoch": 4.927814569536424, "grad_norm": 1.2242311989144472, "learning_rate": 3.727171929537165e-05, "loss": 0.8281, "step": 7441 }, { "epoch": 4.928476821192053, "grad_norm": 1.1574454437553914, "learning_rate": 3.722769644147113e-05, "loss": 0.7031, "step": 7442 }, { "epoch": 4.929139072847682, "grad_norm": 1.314755917685958, "learning_rate": 3.7183695917779654e-05, "loss": 0.9531, "step": 7443 }, { "epoch": 4.929801324503311, "grad_norm": 1.2808965441751783, "learning_rate": 3.7139717733009755e-05, "loss": 0.875, "step": 7444 }, { "epoch": 4.930463576158941, "grad_norm": 1.1608377121142828, "learning_rate": 3.709576189586967e-05, "loss": 0.75, "step": 7445 }, { "epoch": 4.9311258278145695, "grad_norm": 1.0637980299028116, "learning_rate": 3.70518284150631e-05, "loss": 0.7148, "step": 7446 }, { "epoch": 4.9317880794701985, "grad_norm": 1.2466151028087193, "learning_rate": 3.700791729928941e-05, "loss": 0.8633, "step": 7447 }, { "epoch": 4.932450331125828, "grad_norm": 1.1503797462143852, "learning_rate": 3.696402855724355e-05, "loss": 0.7539, "step": 7448 }, { "epoch": 4.933112582781457, "grad_norm": 1.1347038740165059, "learning_rate": 3.6920162197615946e-05, "loss": 0.7734, "step": 7449 }, { "epoch": 4.933774834437086, "grad_norm": 1.1479145721749426, "learning_rate": 3.68763182290926e-05, "loss": 0.7578, "step": 7450 }, { "epoch": 4.934437086092715, "grad_norm": 1.17119451162271, "learning_rate": 3.683249666035518e-05, "loss": 0.7539, "step": 7451 }, { "epoch": 4.935099337748344, "grad_norm": 1.255683675399053, "learning_rate": 3.6788697500080873e-05, "loss": 0.8203, "step": 7452 }, { "epoch": 4.935761589403974, "grad_norm": 1.2166069484502346, "learning_rate": 3.674492075694235e-05, "loss": 0.7617, "step": 7453 }, { "epoch": 4.936423841059603, "grad_norm": 1.293524751942597, "learning_rate": 3.6701166439607986e-05, "loss": 0.9219, "step": 7454 }, { "epoch": 4.937086092715232, "grad_norm": 1.1380733954306739, "learning_rate": 3.665743455674156e-05, "loss": 0.707, "step": 7455 }, { "epoch": 4.937748344370861, "grad_norm": 1.1057824056770715, "learning_rate": 3.6613725117002504e-05, "loss": 0.6406, "step": 7456 }, { "epoch": 4.93841059602649, "grad_norm": 1.3008113397257013, "learning_rate": 3.657003812904587e-05, "loss": 0.9531, "step": 7457 }, { "epoch": 4.9390728476821195, "grad_norm": 1.2433896470831225, "learning_rate": 3.652637360152206e-05, "loss": 0.9805, "step": 7458 }, { "epoch": 4.9397350993377485, "grad_norm": 1.128983515437301, "learning_rate": 3.648273154307722e-05, "loss": 0.7266, "step": 7459 }, { "epoch": 4.940397350993377, "grad_norm": 1.02943786139513, "learning_rate": 3.643911196235301e-05, "loss": 0.7305, "step": 7460 }, { "epoch": 4.941059602649006, "grad_norm": 1.198739269944828, "learning_rate": 3.639551486798656e-05, "loss": 0.8594, "step": 7461 }, { "epoch": 4.941721854304635, "grad_norm": 1.1996968657632952, "learning_rate": 3.635194026861055e-05, "loss": 0.8008, "step": 7462 }, { "epoch": 4.942384105960265, "grad_norm": 1.1052941105463687, "learning_rate": 3.630838817285331e-05, "loss": 0.75, "step": 7463 }, { "epoch": 4.943046357615894, "grad_norm": 1.1270742449877194, "learning_rate": 3.626485858933869e-05, "loss": 0.6758, "step": 7464 }, { "epoch": 4.943708609271523, "grad_norm": 1.1048820578428868, "learning_rate": 3.622135152668594e-05, "loss": 0.6602, "step": 7465 }, { "epoch": 4.944370860927152, "grad_norm": 1.4043697484814968, "learning_rate": 3.6177866993510075e-05, "loss": 0.9727, "step": 7466 }, { "epoch": 4.945033112582782, "grad_norm": 1.2444902663189326, "learning_rate": 3.6134404998421427e-05, "loss": 0.8672, "step": 7467 }, { "epoch": 4.945695364238411, "grad_norm": 1.3427258463420468, "learning_rate": 3.6090965550026e-05, "loss": 0.8984, "step": 7468 }, { "epoch": 4.94635761589404, "grad_norm": 1.1172602365098394, "learning_rate": 3.604754865692538e-05, "loss": 0.8008, "step": 7469 }, { "epoch": 4.947019867549669, "grad_norm": 1.1912974810649115, "learning_rate": 3.600415432771654e-05, "loss": 0.7266, "step": 7470 }, { "epoch": 4.947682119205298, "grad_norm": 1.2521758416805533, "learning_rate": 3.596078257099199e-05, "loss": 0.8164, "step": 7471 }, { "epoch": 4.948344370860927, "grad_norm": 1.048188369889491, "learning_rate": 3.591743339533998e-05, "loss": 0.668, "step": 7472 }, { "epoch": 4.949006622516556, "grad_norm": 1.1878281854425727, "learning_rate": 3.587410680934407e-05, "loss": 0.8398, "step": 7473 }, { "epoch": 4.949668874172185, "grad_norm": 1.1778401486222665, "learning_rate": 3.583080282158339e-05, "loss": 0.6836, "step": 7474 }, { "epoch": 4.950331125827814, "grad_norm": 1.160314496342303, "learning_rate": 3.578752144063266e-05, "loss": 0.8047, "step": 7475 }, { "epoch": 4.950993377483444, "grad_norm": 1.1922544008096467, "learning_rate": 3.574426267506212e-05, "loss": 0.7461, "step": 7476 }, { "epoch": 4.951655629139073, "grad_norm": 1.225946826593522, "learning_rate": 3.570102653343743e-05, "loss": 0.8477, "step": 7477 }, { "epoch": 4.952317880794702, "grad_norm": 1.2179236593698697, "learning_rate": 3.5657813024319934e-05, "loss": 0.8398, "step": 7478 }, { "epoch": 4.952980132450331, "grad_norm": 0.9680398261141759, "learning_rate": 3.5614622156266306e-05, "loss": 0.6797, "step": 7479 }, { "epoch": 4.95364238410596, "grad_norm": 1.252135755973023, "learning_rate": 3.5571453937828856e-05, "loss": 0.8086, "step": 7480 }, { "epoch": 4.95430463576159, "grad_norm": 1.137597017348754, "learning_rate": 3.552830837755546e-05, "loss": 0.8164, "step": 7481 }, { "epoch": 4.954966887417219, "grad_norm": 1.3740265276835795, "learning_rate": 3.5485185483989384e-05, "loss": 0.9258, "step": 7482 }, { "epoch": 4.9556291390728475, "grad_norm": 1.2489682887227498, "learning_rate": 3.5442085265669414e-05, "loss": 0.8242, "step": 7483 }, { "epoch": 4.9562913907284765, "grad_norm": 1.130198087835507, "learning_rate": 3.539900773112992e-05, "loss": 0.6875, "step": 7484 }, { "epoch": 4.956953642384106, "grad_norm": 1.0476198496935811, "learning_rate": 3.5355952888900804e-05, "loss": 0.6602, "step": 7485 }, { "epoch": 4.957615894039735, "grad_norm": 1.1307172134039818, "learning_rate": 3.531292074750731e-05, "loss": 0.6953, "step": 7486 }, { "epoch": 4.958278145695364, "grad_norm": 1.1625804782409128, "learning_rate": 3.526991131547039e-05, "loss": 0.7773, "step": 7487 }, { "epoch": 4.958940397350993, "grad_norm": 1.2319314865879187, "learning_rate": 3.522692460130634e-05, "loss": 0.8477, "step": 7488 }, { "epoch": 4.959602649006623, "grad_norm": 1.2139190941692548, "learning_rate": 3.518396061352704e-05, "loss": 0.8086, "step": 7489 }, { "epoch": 4.960264900662252, "grad_norm": 1.137661099085646, "learning_rate": 3.5141019360639923e-05, "loss": 0.8203, "step": 7490 }, { "epoch": 4.960927152317881, "grad_norm": 1.1122517167542987, "learning_rate": 3.509810085114777e-05, "loss": 0.6562, "step": 7491 }, { "epoch": 4.96158940397351, "grad_norm": 1.2293611851743, "learning_rate": 3.505520509354886e-05, "loss": 0.8125, "step": 7492 }, { "epoch": 4.962251655629139, "grad_norm": 1.3160409703409424, "learning_rate": 3.501233209633725e-05, "loss": 0.8945, "step": 7493 }, { "epoch": 4.9629139072847686, "grad_norm": 1.1951690874547671, "learning_rate": 3.496948186800217e-05, "loss": 0.7773, "step": 7494 }, { "epoch": 4.9635761589403975, "grad_norm": 1.0770505000102806, "learning_rate": 3.492665441702845e-05, "loss": 0.6602, "step": 7495 }, { "epoch": 4.964238410596026, "grad_norm": 1.2562016087542116, "learning_rate": 3.4883849751896355e-05, "loss": 0.7695, "step": 7496 }, { "epoch": 4.964900662251655, "grad_norm": 1.0886596565913125, "learning_rate": 3.484106788108185e-05, "loss": 0.7031, "step": 7497 }, { "epoch": 4.965562913907284, "grad_norm": 1.136453072228562, "learning_rate": 3.4798308813056156e-05, "loss": 0.6914, "step": 7498 }, { "epoch": 4.966225165562914, "grad_norm": 1.3035674731585984, "learning_rate": 3.4755572556286035e-05, "loss": 0.8828, "step": 7499 }, { "epoch": 4.966887417218543, "grad_norm": 1.396453685841964, "learning_rate": 3.471285911923377e-05, "loss": 0.9492, "step": 7500 }, { "epoch": 4.967549668874172, "grad_norm": 1.1879446344275402, "learning_rate": 3.467016851035716e-05, "loss": 0.9062, "step": 7501 }, { "epoch": 4.968211920529801, "grad_norm": 1.039337657449268, "learning_rate": 3.462750073810937e-05, "loss": 0.6406, "step": 7502 }, { "epoch": 4.968874172185431, "grad_norm": 1.141881459745817, "learning_rate": 3.458485581093916e-05, "loss": 0.7266, "step": 7503 }, { "epoch": 4.96953642384106, "grad_norm": 1.0440468679533492, "learning_rate": 3.4542233737290674e-05, "loss": 0.6602, "step": 7504 }, { "epoch": 4.970198675496689, "grad_norm": 1.4654282730806794, "learning_rate": 3.4499634525603565e-05, "loss": 1.0078, "step": 7505 }, { "epoch": 4.970860927152318, "grad_norm": 1.2545131609773548, "learning_rate": 3.4457058184313046e-05, "loss": 0.8711, "step": 7506 }, { "epoch": 4.9715231788079475, "grad_norm": 1.248092534823807, "learning_rate": 3.441450472184966e-05, "loss": 0.8906, "step": 7507 }, { "epoch": 4.972185430463576, "grad_norm": 1.1147056702422578, "learning_rate": 3.437197414663944e-05, "loss": 0.7188, "step": 7508 }, { "epoch": 4.972847682119205, "grad_norm": 1.1999466164983423, "learning_rate": 3.432946646710397e-05, "loss": 0.8359, "step": 7509 }, { "epoch": 4.973509933774834, "grad_norm": 1.2355026689556867, "learning_rate": 3.4286981691660305e-05, "loss": 0.8711, "step": 7510 }, { "epoch": 4.974172185430463, "grad_norm": 1.1408771446297343, "learning_rate": 3.424451982872082e-05, "loss": 0.7656, "step": 7511 }, { "epoch": 4.974834437086093, "grad_norm": 1.265672152514104, "learning_rate": 3.42020808866935e-05, "loss": 0.8789, "step": 7512 }, { "epoch": 4.975496688741722, "grad_norm": 1.1881263466349128, "learning_rate": 3.4159664873981794e-05, "loss": 0.832, "step": 7513 }, { "epoch": 4.976158940397351, "grad_norm": 1.1700901473541707, "learning_rate": 3.411727179898447e-05, "loss": 0.7617, "step": 7514 }, { "epoch": 4.97682119205298, "grad_norm": 1.2488511161550804, "learning_rate": 3.407490167009592e-05, "loss": 0.8594, "step": 7515 }, { "epoch": 4.977483443708609, "grad_norm": 1.168683204277143, "learning_rate": 3.4032554495705843e-05, "loss": 0.793, "step": 7516 }, { "epoch": 4.978145695364239, "grad_norm": 1.1572188069134803, "learning_rate": 3.399023028419949e-05, "loss": 0.8516, "step": 7517 }, { "epoch": 4.978807947019868, "grad_norm": 1.12968111219171, "learning_rate": 3.39479290439576e-05, "loss": 0.8047, "step": 7518 }, { "epoch": 4.979470198675497, "grad_norm": 1.1150208735749265, "learning_rate": 3.390565078335623e-05, "loss": 0.7109, "step": 7519 }, { "epoch": 4.9801324503311255, "grad_norm": 1.079977805472051, "learning_rate": 3.386339551076697e-05, "loss": 0.7734, "step": 7520 }, { "epoch": 4.980794701986755, "grad_norm": 1.1244656952360088, "learning_rate": 3.3821163234556846e-05, "loss": 0.7109, "step": 7521 }, { "epoch": 4.981456953642384, "grad_norm": 1.2696485504524433, "learning_rate": 3.37789539630884e-05, "loss": 0.8203, "step": 7522 }, { "epoch": 4.982119205298013, "grad_norm": 1.2319092806641767, "learning_rate": 3.373676770471945e-05, "loss": 0.832, "step": 7523 }, { "epoch": 4.982781456953642, "grad_norm": 1.075944280524066, "learning_rate": 3.3694604467803454e-05, "loss": 0.6836, "step": 7524 }, { "epoch": 4.983443708609272, "grad_norm": 1.2454103365015539, "learning_rate": 3.3652464260689126e-05, "loss": 0.9062, "step": 7525 }, { "epoch": 4.984105960264901, "grad_norm": 1.1768257560896835, "learning_rate": 3.361034709172076e-05, "loss": 0.7695, "step": 7526 }, { "epoch": 4.98476821192053, "grad_norm": 1.132109065523027, "learning_rate": 3.3568252969238074e-05, "loss": 0.7578, "step": 7527 }, { "epoch": 4.985430463576159, "grad_norm": 1.2659992806552987, "learning_rate": 3.352618190157614e-05, "loss": 0.7734, "step": 7528 }, { "epoch": 4.986092715231788, "grad_norm": 1.2595062399369894, "learning_rate": 3.348413389706543e-05, "loss": 0.7852, "step": 7529 }, { "epoch": 4.986754966887418, "grad_norm": 1.2776243586999054, "learning_rate": 3.344210896403212e-05, "loss": 0.8711, "step": 7530 }, { "epoch": 4.9874172185430465, "grad_norm": 1.1308286578787268, "learning_rate": 3.340010711079753e-05, "loss": 0.7891, "step": 7531 }, { "epoch": 4.9880794701986755, "grad_norm": 1.1807833677725073, "learning_rate": 3.335812834567844e-05, "loss": 0.7891, "step": 7532 }, { "epoch": 4.988741721854304, "grad_norm": 1.28548609694829, "learning_rate": 3.331617267698722e-05, "loss": 0.8594, "step": 7533 }, { "epoch": 4.989403973509933, "grad_norm": 1.1890087576490285, "learning_rate": 3.327424011303158e-05, "loss": 0.8398, "step": 7534 }, { "epoch": 4.990066225165563, "grad_norm": 1.302953743445622, "learning_rate": 3.323233066211457e-05, "loss": 0.8438, "step": 7535 }, { "epoch": 4.990728476821192, "grad_norm": 1.0552955828714154, "learning_rate": 3.319044433253482e-05, "loss": 0.6992, "step": 7536 }, { "epoch": 4.991390728476821, "grad_norm": 1.1417964748147067, "learning_rate": 3.3148581132586234e-05, "loss": 0.793, "step": 7537 }, { "epoch": 4.99205298013245, "grad_norm": 1.1727463109522698, "learning_rate": 3.310674107055824e-05, "loss": 0.7383, "step": 7538 }, { "epoch": 4.99271523178808, "grad_norm": 1.3291269885926662, "learning_rate": 3.30649241547357e-05, "loss": 0.8086, "step": 7539 }, { "epoch": 4.993377483443709, "grad_norm": 1.0818342967622132, "learning_rate": 3.302313039339879e-05, "loss": 0.6602, "step": 7540 }, { "epoch": 4.994039735099338, "grad_norm": 1.2417378260311274, "learning_rate": 3.298135979482312e-05, "loss": 0.8047, "step": 7541 }, { "epoch": 4.994701986754967, "grad_norm": 1.0489255761247729, "learning_rate": 3.293961236727978e-05, "loss": 0.6797, "step": 7542 }, { "epoch": 4.9953642384105965, "grad_norm": 1.1839504977215751, "learning_rate": 3.28978881190353e-05, "loss": 0.7891, "step": 7543 }, { "epoch": 4.9960264900662255, "grad_norm": 1.1474390051561356, "learning_rate": 3.285618705835144e-05, "loss": 0.7188, "step": 7544 }, { "epoch": 4.996688741721854, "grad_norm": 1.2849876109418392, "learning_rate": 3.2814509193485615e-05, "loss": 0.8125, "step": 7545 }, { "epoch": 4.997350993377483, "grad_norm": 1.5099332667838625, "learning_rate": 3.277285453269041e-05, "loss": 1.1406, "step": 7546 }, { "epoch": 4.998013245033112, "grad_norm": 1.286214208187156, "learning_rate": 3.2731223084213966e-05, "loss": 0.9297, "step": 7547 }, { "epoch": 4.998675496688742, "grad_norm": 1.1222759365816126, "learning_rate": 3.2689614856299845e-05, "loss": 0.7344, "step": 7548 }, { "epoch": 4.999337748344371, "grad_norm": 1.2871668091427075, "learning_rate": 3.264802985718689e-05, "loss": 0.8633, "step": 7549 }, { "epoch": 5.0, "grad_norm": 1.183089364256464, "learning_rate": 3.260646809510935e-05, "loss": 0.7852, "step": 7550 }, { "epoch": 5.0, "eval_loss": 2.6689627170562744, "eval_runtime": 33.9038, "eval_samples_per_second": 9.969, "eval_steps_per_second": 9.969, "step": 7550 }, { "epoch": 5.000662251655629, "grad_norm": 0.6045593851255149, "learning_rate": 3.256492957829707e-05, "loss": 0.3105, "step": 7551 }, { "epoch": 5.001324503311258, "grad_norm": 0.614453358918784, "learning_rate": 3.2523414314975096e-05, "loss": 0.3379, "step": 7552 }, { "epoch": 5.001986754966888, "grad_norm": 0.5363776568778021, "learning_rate": 3.2481922313363855e-05, "loss": 0.3418, "step": 7553 }, { "epoch": 5.002649006622517, "grad_norm": 0.5436052580347087, "learning_rate": 3.2440453581679304e-05, "loss": 0.293, "step": 7554 }, { "epoch": 5.003311258278146, "grad_norm": 0.6263897891439537, "learning_rate": 3.239900812813277e-05, "loss": 0.3008, "step": 7555 }, { "epoch": 5.0039735099337745, "grad_norm": 0.6139806337730688, "learning_rate": 3.235758596093087e-05, "loss": 0.3281, "step": 7556 }, { "epoch": 5.004635761589404, "grad_norm": 0.6213473034125038, "learning_rate": 3.2316187088275644e-05, "loss": 0.3105, "step": 7557 }, { "epoch": 5.005298013245033, "grad_norm": 0.6081866789009767, "learning_rate": 3.2274811518364555e-05, "loss": 0.3184, "step": 7558 }, { "epoch": 5.005960264900662, "grad_norm": 0.6230614642586979, "learning_rate": 3.223345925939053e-05, "loss": 0.3398, "step": 7559 }, { "epoch": 5.006622516556291, "grad_norm": 0.5760426665791908, "learning_rate": 3.2192130319541656e-05, "loss": 0.2617, "step": 7560 }, { "epoch": 5.00728476821192, "grad_norm": 0.6335750509798314, "learning_rate": 3.215082470700166e-05, "loss": 0.2852, "step": 7561 }, { "epoch": 5.00794701986755, "grad_norm": 0.6597385082153511, "learning_rate": 3.210954242994941e-05, "loss": 0.291, "step": 7562 }, { "epoch": 5.008609271523179, "grad_norm": 0.6751536733643929, "learning_rate": 3.206828349655934e-05, "loss": 0.3203, "step": 7563 }, { "epoch": 5.009271523178808, "grad_norm": 0.7095708921880808, "learning_rate": 3.202704791500123e-05, "loss": 0.334, "step": 7564 }, { "epoch": 5.009933774834437, "grad_norm": 0.6748278147310006, "learning_rate": 3.1985835693440135e-05, "loss": 0.2773, "step": 7565 }, { "epoch": 5.010596026490067, "grad_norm": 0.7253035123265196, "learning_rate": 3.194464684003649e-05, "loss": 0.2969, "step": 7566 }, { "epoch": 5.011258278145696, "grad_norm": 0.6468843304344135, "learning_rate": 3.1903481362946284e-05, "loss": 0.2793, "step": 7567 }, { "epoch": 5.0119205298013245, "grad_norm": 0.6388316129245085, "learning_rate": 3.1862339270320734e-05, "loss": 0.2305, "step": 7568 }, { "epoch": 5.0125827814569535, "grad_norm": 0.736180253215124, "learning_rate": 3.182122057030636e-05, "loss": 0.3047, "step": 7569 }, { "epoch": 5.013245033112582, "grad_norm": 0.7005766682478414, "learning_rate": 3.178012527104518e-05, "loss": 0.2256, "step": 7570 }, { "epoch": 5.013907284768212, "grad_norm": 0.7391476320830732, "learning_rate": 3.1739053380674605e-05, "loss": 0.2891, "step": 7571 }, { "epoch": 5.014569536423841, "grad_norm": 0.6737616588863078, "learning_rate": 3.169800490732722e-05, "loss": 0.2559, "step": 7572 }, { "epoch": 5.01523178807947, "grad_norm": 0.7375107099437852, "learning_rate": 3.16569798591312e-05, "loss": 0.2812, "step": 7573 }, { "epoch": 5.015894039735099, "grad_norm": 0.5479396878081634, "learning_rate": 3.161597824420988e-05, "loss": 0.1992, "step": 7574 }, { "epoch": 5.016556291390729, "grad_norm": 0.6618811652801873, "learning_rate": 3.1575000070682096e-05, "loss": 0.2246, "step": 7575 }, { "epoch": 5.017218543046358, "grad_norm": 0.8143921681845607, "learning_rate": 3.153404534666204e-05, "loss": 0.3477, "step": 7576 }, { "epoch": 5.017880794701987, "grad_norm": 0.7975892096703138, "learning_rate": 3.149311408025917e-05, "loss": 0.291, "step": 7577 }, { "epoch": 5.018543046357616, "grad_norm": 0.7144677117162048, "learning_rate": 3.1452206279578315e-05, "loss": 0.2656, "step": 7578 }, { "epoch": 5.019205298013245, "grad_norm": 0.7442934532373326, "learning_rate": 3.141132195271973e-05, "loss": 0.3105, "step": 7579 }, { "epoch": 5.0198675496688745, "grad_norm": 0.6622531023736544, "learning_rate": 3.1370461107779e-05, "loss": 0.2578, "step": 7580 }, { "epoch": 5.020529801324503, "grad_norm": 0.6697685183960234, "learning_rate": 3.132962375284696e-05, "loss": 0.2695, "step": 7581 }, { "epoch": 5.021192052980132, "grad_norm": 0.6214571286122939, "learning_rate": 3.1288809896009994e-05, "loss": 0.2295, "step": 7582 }, { "epoch": 5.021854304635761, "grad_norm": 0.8289040032620009, "learning_rate": 3.1248019545349595e-05, "loss": 0.3223, "step": 7583 }, { "epoch": 5.022516556291391, "grad_norm": 0.5874854051799077, "learning_rate": 3.1207252708942776e-05, "loss": 0.2471, "step": 7584 }, { "epoch": 5.02317880794702, "grad_norm": 0.688340521126013, "learning_rate": 3.1166509394861876e-05, "loss": 0.2598, "step": 7585 }, { "epoch": 5.023841059602649, "grad_norm": 0.923833078630724, "learning_rate": 3.11257896111745e-05, "loss": 0.3105, "step": 7586 }, { "epoch": 5.024503311258278, "grad_norm": 0.6914107273557181, "learning_rate": 3.108509336594355e-05, "loss": 0.2715, "step": 7587 }, { "epoch": 5.025165562913907, "grad_norm": 0.767632264094937, "learning_rate": 3.10444206672275e-05, "loss": 0.3203, "step": 7588 }, { "epoch": 5.025827814569537, "grad_norm": 0.8290049959724409, "learning_rate": 3.1003771523079955e-05, "loss": 0.3125, "step": 7589 }, { "epoch": 5.026490066225166, "grad_norm": 0.7062920284406934, "learning_rate": 3.0963145941549835e-05, "loss": 0.3125, "step": 7590 }, { "epoch": 5.027152317880795, "grad_norm": 0.7124114501782699, "learning_rate": 3.092254393068155e-05, "loss": 0.2969, "step": 7591 }, { "epoch": 5.027814569536424, "grad_norm": 0.7163962835330602, "learning_rate": 3.0881965498514774e-05, "loss": 0.2676, "step": 7592 }, { "epoch": 5.028476821192053, "grad_norm": 0.8675738561091091, "learning_rate": 3.084141065308444e-05, "loss": 0.3242, "step": 7593 }, { "epoch": 5.029139072847682, "grad_norm": 0.7424484951438328, "learning_rate": 3.0800879402420936e-05, "loss": 0.3633, "step": 7594 }, { "epoch": 5.029801324503311, "grad_norm": 0.6846913241868403, "learning_rate": 3.076037175454985e-05, "loss": 0.2432, "step": 7595 }, { "epoch": 5.03046357615894, "grad_norm": 0.6756363459499994, "learning_rate": 3.071988771749219e-05, "loss": 0.2402, "step": 7596 }, { "epoch": 5.031125827814569, "grad_norm": 0.5954603225645252, "learning_rate": 3.067942729926431e-05, "loss": 0.2109, "step": 7597 }, { "epoch": 5.031788079470199, "grad_norm": 0.6702748916283935, "learning_rate": 3.063899050787779e-05, "loss": 0.2852, "step": 7598 }, { "epoch": 5.032450331125828, "grad_norm": 0.8713949748005583, "learning_rate": 3.0598577351339535e-05, "loss": 0.3379, "step": 7599 }, { "epoch": 5.033112582781457, "grad_norm": 0.6034982603757743, "learning_rate": 3.055818783765185e-05, "loss": 0.2432, "step": 7600 }, { "epoch": 5.033774834437086, "grad_norm": 0.7060689594445229, "learning_rate": 3.051782197481238e-05, "loss": 0.2676, "step": 7601 }, { "epoch": 5.034437086092716, "grad_norm": 0.6941983929403234, "learning_rate": 3.0477479770813945e-05, "loss": 0.2598, "step": 7602 }, { "epoch": 5.035099337748345, "grad_norm": 0.639792293258147, "learning_rate": 3.0437161233644823e-05, "loss": 0.208, "step": 7603 }, { "epoch": 5.035761589403974, "grad_norm": 0.7198302677323313, "learning_rate": 3.039686637128848e-05, "loss": 0.2773, "step": 7604 }, { "epoch": 5.0364238410596025, "grad_norm": 0.6641335071807877, "learning_rate": 3.0356595191723796e-05, "loss": 0.2441, "step": 7605 }, { "epoch": 5.0370860927152314, "grad_norm": 0.8385299960824646, "learning_rate": 3.031634770292498e-05, "loss": 0.3301, "step": 7606 }, { "epoch": 5.037748344370861, "grad_norm": 1.0519939954638169, "learning_rate": 3.0276123912861406e-05, "loss": 0.4238, "step": 7607 }, { "epoch": 5.03841059602649, "grad_norm": 0.7752802139520284, "learning_rate": 3.0235923829497897e-05, "loss": 0.2617, "step": 7608 }, { "epoch": 5.039072847682119, "grad_norm": 0.6767035505422887, "learning_rate": 3.0195747460794544e-05, "loss": 0.2412, "step": 7609 }, { "epoch": 5.039735099337748, "grad_norm": 0.7487400542011913, "learning_rate": 3.0155594814706714e-05, "loss": 0.3281, "step": 7610 }, { "epoch": 5.040397350993377, "grad_norm": 0.7143849430055839, "learning_rate": 3.0115465899185044e-05, "loss": 0.2871, "step": 7611 }, { "epoch": 5.041059602649007, "grad_norm": 0.8067559916070459, "learning_rate": 3.0075360722175545e-05, "loss": 0.2832, "step": 7612 }, { "epoch": 5.041721854304636, "grad_norm": 0.7615298702149778, "learning_rate": 3.0035279291619552e-05, "loss": 0.2754, "step": 7613 }, { "epoch": 5.042384105960265, "grad_norm": 0.6388843296769425, "learning_rate": 2.999522161545363e-05, "loss": 0.2217, "step": 7614 }, { "epoch": 5.043046357615894, "grad_norm": 0.7072617079719921, "learning_rate": 2.9955187701609583e-05, "loss": 0.2676, "step": 7615 }, { "epoch": 5.0437086092715235, "grad_norm": 0.740433366157003, "learning_rate": 2.991517755801464e-05, "loss": 0.2949, "step": 7616 }, { "epoch": 5.0443708609271525, "grad_norm": 0.6572320440827792, "learning_rate": 2.9875191192591315e-05, "loss": 0.2559, "step": 7617 }, { "epoch": 5.045033112582781, "grad_norm": 0.8230001013759648, "learning_rate": 2.9835228613257267e-05, "loss": 0.2988, "step": 7618 }, { "epoch": 5.04569536423841, "grad_norm": 0.67089813454419, "learning_rate": 2.9795289827925655e-05, "loss": 0.2559, "step": 7619 }, { "epoch": 5.046357615894039, "grad_norm": 0.7627826459506433, "learning_rate": 2.9755374844504705e-05, "loss": 0.3145, "step": 7620 }, { "epoch": 5.047019867549669, "grad_norm": 0.9513975737147948, "learning_rate": 2.9715483670898105e-05, "loss": 0.3164, "step": 7621 }, { "epoch": 5.047682119205298, "grad_norm": 0.6849829838889234, "learning_rate": 2.9675616315004792e-05, "loss": 0.2432, "step": 7622 }, { "epoch": 5.048344370860927, "grad_norm": 0.6888569081378938, "learning_rate": 2.963577278471893e-05, "loss": 0.2969, "step": 7623 }, { "epoch": 5.049006622516556, "grad_norm": 0.8288091735587798, "learning_rate": 2.9595953087929898e-05, "loss": 0.2988, "step": 7624 }, { "epoch": 5.049668874172186, "grad_norm": 0.7930344652639415, "learning_rate": 2.9556157232522615e-05, "loss": 0.3086, "step": 7625 }, { "epoch": 5.050331125827815, "grad_norm": 0.6548584610603714, "learning_rate": 2.9516385226377066e-05, "loss": 0.2578, "step": 7626 }, { "epoch": 5.050993377483444, "grad_norm": 0.7796059686270767, "learning_rate": 2.947663707736848e-05, "loss": 0.2412, "step": 7627 }, { "epoch": 5.051655629139073, "grad_norm": 0.7116051506582936, "learning_rate": 2.9436912793367517e-05, "loss": 0.2441, "step": 7628 }, { "epoch": 5.052317880794702, "grad_norm": 0.8213692371199846, "learning_rate": 2.9397212382240067e-05, "loss": 0.3496, "step": 7629 }, { "epoch": 5.052980132450331, "grad_norm": 0.7801315946656635, "learning_rate": 2.9357535851847202e-05, "loss": 0.3047, "step": 7630 }, { "epoch": 5.05364238410596, "grad_norm": 0.7482613323816574, "learning_rate": 2.9317883210045397e-05, "loss": 0.2695, "step": 7631 }, { "epoch": 5.054304635761589, "grad_norm": 0.67907499185157, "learning_rate": 2.9278254464686247e-05, "loss": 0.2314, "step": 7632 }, { "epoch": 5.054966887417218, "grad_norm": 0.7758730145353828, "learning_rate": 2.9238649623616744e-05, "loss": 0.2832, "step": 7633 }, { "epoch": 5.055629139072848, "grad_norm": 0.6587963006206099, "learning_rate": 2.9199068694679136e-05, "loss": 0.2207, "step": 7634 }, { "epoch": 5.056291390728477, "grad_norm": 0.6544764391843848, "learning_rate": 2.915951168571086e-05, "loss": 0.2451, "step": 7635 }, { "epoch": 5.056953642384106, "grad_norm": 0.6520405499517448, "learning_rate": 2.9119978604544632e-05, "loss": 0.2383, "step": 7636 }, { "epoch": 5.057615894039735, "grad_norm": 0.6313472675535934, "learning_rate": 2.9080469459008478e-05, "loss": 0.2334, "step": 7637 }, { "epoch": 5.058278145695364, "grad_norm": 0.6895802592774316, "learning_rate": 2.9040984256925708e-05, "loss": 0.2441, "step": 7638 }, { "epoch": 5.058940397350994, "grad_norm": 0.8088756672108751, "learning_rate": 2.9001523006114775e-05, "loss": 0.3086, "step": 7639 }, { "epoch": 5.059602649006623, "grad_norm": 0.7220205133085944, "learning_rate": 2.8962085714389506e-05, "loss": 0.2197, "step": 7640 }, { "epoch": 5.0602649006622515, "grad_norm": 0.820990773722928, "learning_rate": 2.8922672389558903e-05, "loss": 0.2656, "step": 7641 }, { "epoch": 5.0609271523178805, "grad_norm": 0.786487377711401, "learning_rate": 2.888328303942726e-05, "loss": 0.2949, "step": 7642 }, { "epoch": 5.06158940397351, "grad_norm": 0.7922244188622611, "learning_rate": 2.8843917671794177e-05, "loss": 0.2832, "step": 7643 }, { "epoch": 5.062251655629139, "grad_norm": 0.835564970658743, "learning_rate": 2.8804576294454405e-05, "loss": 0.3223, "step": 7644 }, { "epoch": 5.062913907284768, "grad_norm": 0.7896123519572305, "learning_rate": 2.876525891519791e-05, "loss": 0.2812, "step": 7645 }, { "epoch": 5.063576158940397, "grad_norm": 0.7558894164031373, "learning_rate": 2.8725965541810137e-05, "loss": 0.2734, "step": 7646 }, { "epoch": 5.064238410596026, "grad_norm": 0.7526194985669108, "learning_rate": 2.868669618207155e-05, "loss": 0.2793, "step": 7647 }, { "epoch": 5.064900662251656, "grad_norm": 0.6886377270516222, "learning_rate": 2.8647450843757897e-05, "loss": 0.2617, "step": 7648 }, { "epoch": 5.065562913907285, "grad_norm": 0.8592849933825653, "learning_rate": 2.8608229534640225e-05, "loss": 0.3281, "step": 7649 }, { "epoch": 5.066225165562914, "grad_norm": 0.8247954472251052, "learning_rate": 2.856903226248486e-05, "loss": 0.2695, "step": 7650 }, { "epoch": 5.066887417218543, "grad_norm": 0.6603437357196821, "learning_rate": 2.8529859035053236e-05, "loss": 0.2539, "step": 7651 }, { "epoch": 5.067549668874173, "grad_norm": 0.7145933704465248, "learning_rate": 2.8490709860102174e-05, "loss": 0.2598, "step": 7652 }, { "epoch": 5.0682119205298015, "grad_norm": 0.805700932033964, "learning_rate": 2.8451584745383566e-05, "loss": 0.2773, "step": 7653 }, { "epoch": 5.0688741721854305, "grad_norm": 0.7994612318899332, "learning_rate": 2.8412483698644684e-05, "loss": 0.2734, "step": 7654 }, { "epoch": 5.069536423841059, "grad_norm": 0.6800294021719491, "learning_rate": 2.8373406727628036e-05, "loss": 0.2617, "step": 7655 }, { "epoch": 5.070198675496688, "grad_norm": 0.718540195425804, "learning_rate": 2.833435384007126e-05, "loss": 0.2432, "step": 7656 }, { "epoch": 5.070860927152318, "grad_norm": 0.7048242233506135, "learning_rate": 2.829532504370723e-05, "loss": 0.2734, "step": 7657 }, { "epoch": 5.071523178807947, "grad_norm": 0.829166561652657, "learning_rate": 2.825632034626414e-05, "loss": 0.3145, "step": 7658 }, { "epoch": 5.072185430463576, "grad_norm": 0.8044672132338115, "learning_rate": 2.8217339755465394e-05, "loss": 0.2812, "step": 7659 }, { "epoch": 5.072847682119205, "grad_norm": 0.8604951843726184, "learning_rate": 2.817838327902956e-05, "loss": 0.3281, "step": 7660 }, { "epoch": 5.073509933774835, "grad_norm": 0.7026263856387941, "learning_rate": 2.8139450924670443e-05, "loss": 0.2676, "step": 7661 }, { "epoch": 5.074172185430464, "grad_norm": 0.8829538387056132, "learning_rate": 2.8100542700097195e-05, "loss": 0.3262, "step": 7662 }, { "epoch": 5.074834437086093, "grad_norm": 0.8277355279150733, "learning_rate": 2.806165861301397e-05, "loss": 0.3398, "step": 7663 }, { "epoch": 5.075496688741722, "grad_norm": 0.7517399962052583, "learning_rate": 2.8022798671120362e-05, "loss": 0.2871, "step": 7664 }, { "epoch": 5.076158940397351, "grad_norm": 0.6924743487511614, "learning_rate": 2.7983962882110995e-05, "loss": 0.2383, "step": 7665 }, { "epoch": 5.07682119205298, "grad_norm": 0.7332136228225022, "learning_rate": 2.7945151253675885e-05, "loss": 0.2852, "step": 7666 }, { "epoch": 5.077483443708609, "grad_norm": 0.7569197505053394, "learning_rate": 2.7906363793500102e-05, "loss": 0.2734, "step": 7667 }, { "epoch": 5.078145695364238, "grad_norm": 0.7259861765065343, "learning_rate": 2.7867600509264086e-05, "loss": 0.2852, "step": 7668 }, { "epoch": 5.078807947019867, "grad_norm": 0.9074679048279021, "learning_rate": 2.782886140864333e-05, "loss": 0.3418, "step": 7669 }, { "epoch": 5.079470198675497, "grad_norm": 0.7312402952255912, "learning_rate": 2.779014649930867e-05, "loss": 0.2539, "step": 7670 }, { "epoch": 5.080132450331126, "grad_norm": 0.635122803365682, "learning_rate": 2.7751455788926132e-05, "loss": 0.2236, "step": 7671 }, { "epoch": 5.080794701986755, "grad_norm": 0.7524611601496549, "learning_rate": 2.7712789285156874e-05, "loss": 0.2656, "step": 7672 }, { "epoch": 5.081456953642384, "grad_norm": 0.6782038970456145, "learning_rate": 2.7674146995657288e-05, "loss": 0.2207, "step": 7673 }, { "epoch": 5.082119205298013, "grad_norm": 0.6910893158617771, "learning_rate": 2.7635528928079004e-05, "loss": 0.2334, "step": 7674 }, { "epoch": 5.082781456953643, "grad_norm": 0.7919554516432932, "learning_rate": 2.759693509006889e-05, "loss": 0.2773, "step": 7675 }, { "epoch": 5.083443708609272, "grad_norm": 0.7593597600474896, "learning_rate": 2.7558365489268912e-05, "loss": 0.249, "step": 7676 }, { "epoch": 5.084105960264901, "grad_norm": 0.660311192066355, "learning_rate": 2.7519820133316333e-05, "loss": 0.2236, "step": 7677 }, { "epoch": 5.0847682119205295, "grad_norm": 0.8387875744629516, "learning_rate": 2.748129902984353e-05, "loss": 0.2812, "step": 7678 }, { "epoch": 5.085430463576159, "grad_norm": 0.9545842267535178, "learning_rate": 2.7442802186478145e-05, "loss": 0.2852, "step": 7679 }, { "epoch": 5.086092715231788, "grad_norm": 0.8173956244935504, "learning_rate": 2.7404329610843055e-05, "loss": 0.3477, "step": 7680 }, { "epoch": 5.086754966887417, "grad_norm": 0.6770257090971682, "learning_rate": 2.7365881310556197e-05, "loss": 0.2559, "step": 7681 }, { "epoch": 5.087417218543046, "grad_norm": 0.7069628119785532, "learning_rate": 2.7327457293230732e-05, "loss": 0.2773, "step": 7682 }, { "epoch": 5.088079470198675, "grad_norm": 0.7532216163228226, "learning_rate": 2.728905756647517e-05, "loss": 0.2285, "step": 7683 }, { "epoch": 5.088741721854305, "grad_norm": 0.8830951638311905, "learning_rate": 2.7250682137893065e-05, "loss": 0.332, "step": 7684 }, { "epoch": 5.089403973509934, "grad_norm": 0.7151510775475609, "learning_rate": 2.721233101508312e-05, "loss": 0.2695, "step": 7685 }, { "epoch": 5.090066225165563, "grad_norm": 0.7210162369513717, "learning_rate": 2.7174004205639364e-05, "loss": 0.2227, "step": 7686 }, { "epoch": 5.090728476821192, "grad_norm": 0.7196282053692017, "learning_rate": 2.713570171715096e-05, "loss": 0.2598, "step": 7687 }, { "epoch": 5.091390728476822, "grad_norm": 0.7520314610407249, "learning_rate": 2.7097423557202168e-05, "loss": 0.2734, "step": 7688 }, { "epoch": 5.092052980132451, "grad_norm": 0.789713228110885, "learning_rate": 2.70591697333726e-05, "loss": 0.2734, "step": 7689 }, { "epoch": 5.0927152317880795, "grad_norm": 0.646033166661115, "learning_rate": 2.702094025323684e-05, "loss": 0.2373, "step": 7690 }, { "epoch": 5.093377483443708, "grad_norm": 0.7291339596802344, "learning_rate": 2.6982735124364817e-05, "loss": 0.2559, "step": 7691 }, { "epoch": 5.094039735099337, "grad_norm": 0.6612067237021902, "learning_rate": 2.6944554354321634e-05, "loss": 0.2441, "step": 7692 }, { "epoch": 5.094701986754967, "grad_norm": 0.7376475337643713, "learning_rate": 2.6906397950667475e-05, "loss": 0.2617, "step": 7693 }, { "epoch": 5.095364238410596, "grad_norm": 0.7146897106493991, "learning_rate": 2.6868265920957693e-05, "loss": 0.2578, "step": 7694 }, { "epoch": 5.096026490066225, "grad_norm": 0.6958686201005283, "learning_rate": 2.683015827274292e-05, "loss": 0.2441, "step": 7695 }, { "epoch": 5.096688741721854, "grad_norm": 0.7579195655202206, "learning_rate": 2.6792075013568948e-05, "loss": 0.2559, "step": 7696 }, { "epoch": 5.097350993377484, "grad_norm": 0.7864199275594701, "learning_rate": 2.6754016150976615e-05, "loss": 0.3086, "step": 7697 }, { "epoch": 5.098013245033113, "grad_norm": 0.6233627457073797, "learning_rate": 2.671598169250209e-05, "loss": 0.2285, "step": 7698 }, { "epoch": 5.098675496688742, "grad_norm": 0.6326262813221883, "learning_rate": 2.667797164567657e-05, "loss": 0.2119, "step": 7699 }, { "epoch": 5.099337748344371, "grad_norm": 0.7902654980104751, "learning_rate": 2.66399860180265e-05, "loss": 0.2598, "step": 7700 }, { "epoch": 5.1, "grad_norm": 0.7828906035845621, "learning_rate": 2.660202481707351e-05, "loss": 0.2578, "step": 7701 }, { "epoch": 5.1006622516556295, "grad_norm": 0.6938564471375939, "learning_rate": 2.656408805033427e-05, "loss": 0.2354, "step": 7702 }, { "epoch": 5.101324503311258, "grad_norm": 0.861259806632186, "learning_rate": 2.652617572532076e-05, "loss": 0.3574, "step": 7703 }, { "epoch": 5.101986754966887, "grad_norm": 0.8101499906582765, "learning_rate": 2.648828784954007e-05, "loss": 0.2871, "step": 7704 }, { "epoch": 5.102649006622516, "grad_norm": 0.7571199321679258, "learning_rate": 2.6450424430494398e-05, "loss": 0.2891, "step": 7705 }, { "epoch": 5.103311258278145, "grad_norm": 0.7664986801151913, "learning_rate": 2.6412585475681104e-05, "loss": 0.2871, "step": 7706 }, { "epoch": 5.103973509933775, "grad_norm": 0.6777814653379343, "learning_rate": 2.6374770992592752e-05, "loss": 0.209, "step": 7707 }, { "epoch": 5.104635761589404, "grad_norm": 0.661350490812791, "learning_rate": 2.633698098871711e-05, "loss": 0.2275, "step": 7708 }, { "epoch": 5.105298013245033, "grad_norm": 0.8431602069568898, "learning_rate": 2.6299215471536955e-05, "loss": 0.3125, "step": 7709 }, { "epoch": 5.105960264900662, "grad_norm": 0.7585370379359879, "learning_rate": 2.6261474448530358e-05, "loss": 0.2578, "step": 7710 }, { "epoch": 5.106622516556292, "grad_norm": 0.6627061756432032, "learning_rate": 2.6223757927170386e-05, "loss": 0.2158, "step": 7711 }, { "epoch": 5.107284768211921, "grad_norm": 0.7995715316480778, "learning_rate": 2.6186065914925387e-05, "loss": 0.3242, "step": 7712 }, { "epoch": 5.10794701986755, "grad_norm": 0.6875009407298466, "learning_rate": 2.614839841925887e-05, "loss": 0.2188, "step": 7713 }, { "epoch": 5.108609271523179, "grad_norm": 0.9641227108671551, "learning_rate": 2.611075544762937e-05, "loss": 0.3105, "step": 7714 }, { "epoch": 5.109271523178808, "grad_norm": 0.7429097109738173, "learning_rate": 2.6073137007490564e-05, "loss": 0.2852, "step": 7715 }, { "epoch": 5.109933774834437, "grad_norm": 0.7058542894901473, "learning_rate": 2.6035543106291484e-05, "loss": 0.252, "step": 7716 }, { "epoch": 5.110596026490066, "grad_norm": 0.684963279270005, "learning_rate": 2.5997973751476064e-05, "loss": 0.248, "step": 7717 }, { "epoch": 5.111258278145695, "grad_norm": 0.849185884310407, "learning_rate": 2.5960428950483452e-05, "loss": 0.2832, "step": 7718 }, { "epoch": 5.111920529801324, "grad_norm": 0.8564624585664711, "learning_rate": 2.592290871074797e-05, "loss": 0.3086, "step": 7719 }, { "epoch": 5.112582781456954, "grad_norm": 0.7306409626867655, "learning_rate": 2.5885413039699103e-05, "loss": 0.2852, "step": 7720 }, { "epoch": 5.113245033112583, "grad_norm": 0.7584688151781532, "learning_rate": 2.5847941944761334e-05, "loss": 0.2402, "step": 7721 }, { "epoch": 5.113907284768212, "grad_norm": 0.9400954173095813, "learning_rate": 2.5810495433354462e-05, "loss": 0.3379, "step": 7722 }, { "epoch": 5.114569536423841, "grad_norm": 1.020766213160279, "learning_rate": 2.577307351289325e-05, "loss": 0.3262, "step": 7723 }, { "epoch": 5.11523178807947, "grad_norm": 0.6781216659533541, "learning_rate": 2.5735676190787723e-05, "loss": 0.2539, "step": 7724 }, { "epoch": 5.1158940397351, "grad_norm": 0.7001404215064257, "learning_rate": 2.5698303474442926e-05, "loss": 0.2598, "step": 7725 }, { "epoch": 5.1165562913907285, "grad_norm": 0.7181277288389928, "learning_rate": 2.566095537125914e-05, "loss": 0.2422, "step": 7726 }, { "epoch": 5.1172185430463575, "grad_norm": 0.8244274359938574, "learning_rate": 2.562363188863166e-05, "loss": 0.3145, "step": 7727 }, { "epoch": 5.117880794701986, "grad_norm": 0.8478197279295507, "learning_rate": 2.5586333033950984e-05, "loss": 0.3066, "step": 7728 }, { "epoch": 5.118543046357616, "grad_norm": 0.8727063675105848, "learning_rate": 2.554905881460278e-05, "loss": 0.2871, "step": 7729 }, { "epoch": 5.119205298013245, "grad_norm": 0.6841203186875177, "learning_rate": 2.5511809237967696e-05, "loss": 0.2217, "step": 7730 }, { "epoch": 5.119867549668874, "grad_norm": 0.7770491399861332, "learning_rate": 2.5474584311421564e-05, "loss": 0.2695, "step": 7731 }, { "epoch": 5.120529801324503, "grad_norm": 0.7714153913618808, "learning_rate": 2.543738404233539e-05, "loss": 0.2676, "step": 7732 }, { "epoch": 5.121192052980132, "grad_norm": 0.751606982172731, "learning_rate": 2.540020843807525e-05, "loss": 0.2871, "step": 7733 }, { "epoch": 5.121854304635762, "grad_norm": 0.842153988680616, "learning_rate": 2.5363057506002314e-05, "loss": 0.3008, "step": 7734 }, { "epoch": 5.122516556291391, "grad_norm": 0.7261604214187055, "learning_rate": 2.5325931253472925e-05, "loss": 0.291, "step": 7735 }, { "epoch": 5.12317880794702, "grad_norm": 0.8376212128905876, "learning_rate": 2.528882968783846e-05, "loss": 0.3477, "step": 7736 }, { "epoch": 5.123841059602649, "grad_norm": 0.6914180929555219, "learning_rate": 2.5251752816445503e-05, "loss": 0.2188, "step": 7737 }, { "epoch": 5.1245033112582785, "grad_norm": 0.548928762365887, "learning_rate": 2.5214700646635687e-05, "loss": 0.1641, "step": 7738 }, { "epoch": 5.1251655629139075, "grad_norm": 0.9190817917272078, "learning_rate": 2.517767318574575e-05, "loss": 0.3496, "step": 7739 }, { "epoch": 5.125827814569536, "grad_norm": 0.7729911563879301, "learning_rate": 2.514067044110754e-05, "loss": 0.2637, "step": 7740 }, { "epoch": 5.126490066225165, "grad_norm": 0.7391897330933267, "learning_rate": 2.51036924200481e-05, "loss": 0.2178, "step": 7741 }, { "epoch": 5.127152317880794, "grad_norm": 0.7634435524158038, "learning_rate": 2.506673912988945e-05, "loss": 0.2559, "step": 7742 }, { "epoch": 5.127814569536424, "grad_norm": 0.7699337993007899, "learning_rate": 2.5029810577948726e-05, "loss": 0.2988, "step": 7743 }, { "epoch": 5.128476821192053, "grad_norm": 0.7476224687351686, "learning_rate": 2.4992906771538245e-05, "loss": 0.2461, "step": 7744 }, { "epoch": 5.129139072847682, "grad_norm": 0.678022448321499, "learning_rate": 2.495602771796541e-05, "loss": 0.2002, "step": 7745 }, { "epoch": 5.129801324503311, "grad_norm": 0.7811626858174175, "learning_rate": 2.4919173424532645e-05, "loss": 0.2676, "step": 7746 }, { "epoch": 5.130463576158941, "grad_norm": 0.7871573091954337, "learning_rate": 2.4882343898537566e-05, "loss": 0.2617, "step": 7747 }, { "epoch": 5.13112582781457, "grad_norm": 0.6987962050969169, "learning_rate": 2.4845539147272803e-05, "loss": 0.2344, "step": 7748 }, { "epoch": 5.131788079470199, "grad_norm": 0.6829468511002302, "learning_rate": 2.480875917802613e-05, "loss": 0.2178, "step": 7749 }, { "epoch": 5.132450331125828, "grad_norm": 0.6362573019790945, "learning_rate": 2.4772003998080447e-05, "loss": 0.2109, "step": 7750 }, { "epoch": 5.1331125827814565, "grad_norm": 0.6991382754834095, "learning_rate": 2.4735273614713673e-05, "loss": 0.2285, "step": 7751 }, { "epoch": 5.133774834437086, "grad_norm": 0.6981246927431022, "learning_rate": 2.469856803519879e-05, "loss": 0.2373, "step": 7752 }, { "epoch": 5.134437086092715, "grad_norm": 0.7669323342602807, "learning_rate": 2.466188726680398e-05, "loss": 0.2969, "step": 7753 }, { "epoch": 5.135099337748344, "grad_norm": 0.7018298021103166, "learning_rate": 2.462523131679247e-05, "loss": 0.2617, "step": 7754 }, { "epoch": 5.135761589403973, "grad_norm": 0.74069433283855, "learning_rate": 2.4588600192422504e-05, "loss": 0.2891, "step": 7755 }, { "epoch": 5.136423841059603, "grad_norm": 0.8005129413726052, "learning_rate": 2.455199390094749e-05, "loss": 0.2695, "step": 7756 }, { "epoch": 5.137086092715232, "grad_norm": 0.8596229297044873, "learning_rate": 2.4515412449615956e-05, "loss": 0.3027, "step": 7757 }, { "epoch": 5.137748344370861, "grad_norm": 0.7846535256689628, "learning_rate": 2.4478855845671336e-05, "loss": 0.2793, "step": 7758 }, { "epoch": 5.13841059602649, "grad_norm": 0.7163239492621535, "learning_rate": 2.444232409635234e-05, "loss": 0.2773, "step": 7759 }, { "epoch": 5.139072847682119, "grad_norm": 0.7312078921842875, "learning_rate": 2.4405817208892632e-05, "loss": 0.248, "step": 7760 }, { "epoch": 5.139735099337749, "grad_norm": 0.6990468500562959, "learning_rate": 2.436933519052098e-05, "loss": 0.2295, "step": 7761 }, { "epoch": 5.140397350993378, "grad_norm": 0.7988025120811293, "learning_rate": 2.4332878048461318e-05, "loss": 0.2754, "step": 7762 }, { "epoch": 5.1410596026490065, "grad_norm": 0.8475677177874262, "learning_rate": 2.4296445789932528e-05, "loss": 0.3066, "step": 7763 }, { "epoch": 5.1417218543046355, "grad_norm": 0.7022640182440665, "learning_rate": 2.4260038422148585e-05, "loss": 0.2207, "step": 7764 }, { "epoch": 5.142384105960265, "grad_norm": 0.668390192389239, "learning_rate": 2.4223655952318594e-05, "loss": 0.2207, "step": 7765 }, { "epoch": 5.143046357615894, "grad_norm": 0.7187381016044225, "learning_rate": 2.4187298387646735e-05, "loss": 0.2617, "step": 7766 }, { "epoch": 5.143708609271523, "grad_norm": 0.9069151226966364, "learning_rate": 2.4150965735332155e-05, "loss": 0.3438, "step": 7767 }, { "epoch": 5.144370860927152, "grad_norm": 0.6948619986328017, "learning_rate": 2.4114658002569213e-05, "loss": 0.2773, "step": 7768 }, { "epoch": 5.145033112582781, "grad_norm": 0.6989031727473147, "learning_rate": 2.407837519654718e-05, "loss": 0.2285, "step": 7769 }, { "epoch": 5.145695364238411, "grad_norm": 0.7325190222981751, "learning_rate": 2.40421173244505e-05, "loss": 0.2441, "step": 7770 }, { "epoch": 5.14635761589404, "grad_norm": 0.8138696336805727, "learning_rate": 2.400588439345867e-05, "loss": 0.2451, "step": 7771 }, { "epoch": 5.147019867549669, "grad_norm": 0.7051375699650541, "learning_rate": 2.3969676410746232e-05, "loss": 0.2324, "step": 7772 }, { "epoch": 5.147682119205298, "grad_norm": 0.7429945710207402, "learning_rate": 2.3933493383482666e-05, "loss": 0.2471, "step": 7773 }, { "epoch": 5.1483443708609276, "grad_norm": 0.7714852667435148, "learning_rate": 2.3897335318832795e-05, "loss": 0.2598, "step": 7774 }, { "epoch": 5.1490066225165565, "grad_norm": 0.7888821293211294, "learning_rate": 2.386120222395624e-05, "loss": 0.2637, "step": 7775 }, { "epoch": 5.149668874172185, "grad_norm": 0.8088833872670256, "learning_rate": 2.3825094106007742e-05, "loss": 0.248, "step": 7776 }, { "epoch": 5.150331125827814, "grad_norm": 0.6771132236529286, "learning_rate": 2.378901097213717e-05, "loss": 0.2275, "step": 7777 }, { "epoch": 5.150993377483443, "grad_norm": 0.9340225478813796, "learning_rate": 2.3752952829489417e-05, "loss": 0.3027, "step": 7778 }, { "epoch": 5.151655629139073, "grad_norm": 0.7285149406689923, "learning_rate": 2.3716919685204384e-05, "loss": 0.2617, "step": 7779 }, { "epoch": 5.152317880794702, "grad_norm": 0.809976497911641, "learning_rate": 2.3680911546417003e-05, "loss": 0.2754, "step": 7780 }, { "epoch": 5.152980132450331, "grad_norm": 0.6696806025911933, "learning_rate": 2.3644928420257343e-05, "loss": 0.2041, "step": 7781 }, { "epoch": 5.15364238410596, "grad_norm": 0.6446591335289431, "learning_rate": 2.36089703138505e-05, "loss": 0.2207, "step": 7782 }, { "epoch": 5.15430463576159, "grad_norm": 0.7615194567436615, "learning_rate": 2.3573037234316534e-05, "loss": 0.2471, "step": 7783 }, { "epoch": 5.154966887417219, "grad_norm": 0.7855549258138956, "learning_rate": 2.3537129188770648e-05, "loss": 0.2969, "step": 7784 }, { "epoch": 5.155629139072848, "grad_norm": 0.720816910822466, "learning_rate": 2.350124618432301e-05, "loss": 0.2617, "step": 7785 }, { "epoch": 5.156291390728477, "grad_norm": 0.6743108786856217, "learning_rate": 2.3465388228078896e-05, "loss": 0.2256, "step": 7786 }, { "epoch": 5.156953642384106, "grad_norm": 0.8109927685931756, "learning_rate": 2.3429555327138616e-05, "loss": 0.3047, "step": 7787 }, { "epoch": 5.157615894039735, "grad_norm": 0.7431018804929632, "learning_rate": 2.339374748859746e-05, "loss": 0.2461, "step": 7788 }, { "epoch": 5.158278145695364, "grad_norm": 0.6732401333972741, "learning_rate": 2.335796471954575e-05, "loss": 0.2334, "step": 7789 }, { "epoch": 5.158940397350993, "grad_norm": 0.8462147569929277, "learning_rate": 2.3322207027068945e-05, "loss": 0.3066, "step": 7790 }, { "epoch": 5.159602649006622, "grad_norm": 0.8237260248303043, "learning_rate": 2.3286474418247492e-05, "loss": 0.2812, "step": 7791 }, { "epoch": 5.160264900662252, "grad_norm": 0.7700633334375332, "learning_rate": 2.325076690015678e-05, "loss": 0.25, "step": 7792 }, { "epoch": 5.160927152317881, "grad_norm": 0.7263584361951742, "learning_rate": 2.3215084479867407e-05, "loss": 0.2676, "step": 7793 }, { "epoch": 5.16158940397351, "grad_norm": 0.6986797400725967, "learning_rate": 2.3179427164444803e-05, "loss": 0.2393, "step": 7794 }, { "epoch": 5.162251655629139, "grad_norm": 0.6907789414039427, "learning_rate": 2.314379496094957e-05, "loss": 0.2363, "step": 7795 }, { "epoch": 5.162913907284768, "grad_norm": 0.7675051356638901, "learning_rate": 2.3108187876437328e-05, "loss": 0.2695, "step": 7796 }, { "epoch": 5.163576158940398, "grad_norm": 0.8127241518139511, "learning_rate": 2.3072605917958625e-05, "loss": 0.3379, "step": 7797 }, { "epoch": 5.164238410596027, "grad_norm": 0.82816480949799, "learning_rate": 2.3037049092559123e-05, "loss": 0.293, "step": 7798 }, { "epoch": 5.164900662251656, "grad_norm": 0.7454294303979505, "learning_rate": 2.3001517407279523e-05, "loss": 0.2871, "step": 7799 }, { "epoch": 5.1655629139072845, "grad_norm": 0.73516489463546, "learning_rate": 2.2966010869155466e-05, "loss": 0.2236, "step": 7800 }, { "epoch": 5.166225165562914, "grad_norm": 0.9179353472508677, "learning_rate": 2.2930529485217626e-05, "loss": 0.3008, "step": 7801 }, { "epoch": 5.166887417218543, "grad_norm": 0.7277213196810184, "learning_rate": 2.2895073262491732e-05, "loss": 0.2617, "step": 7802 }, { "epoch": 5.167549668874172, "grad_norm": 0.8339817385295946, "learning_rate": 2.2859642207998585e-05, "loss": 0.3066, "step": 7803 }, { "epoch": 5.168211920529801, "grad_norm": 0.7429993147829709, "learning_rate": 2.2824236328753876e-05, "loss": 0.2578, "step": 7804 }, { "epoch": 5.16887417218543, "grad_norm": 0.703236513271614, "learning_rate": 2.278885563176841e-05, "loss": 0.2324, "step": 7805 }, { "epoch": 5.16953642384106, "grad_norm": 0.7673745687197612, "learning_rate": 2.2753500124047936e-05, "loss": 0.2246, "step": 7806 }, { "epoch": 5.170198675496689, "grad_norm": 0.7541916138461104, "learning_rate": 2.271816981259327e-05, "loss": 0.2734, "step": 7807 }, { "epoch": 5.170860927152318, "grad_norm": 0.6729437800769618, "learning_rate": 2.2682864704400243e-05, "loss": 0.2334, "step": 7808 }, { "epoch": 5.171523178807947, "grad_norm": 0.7576188537186267, "learning_rate": 2.2647584806459655e-05, "loss": 0.3047, "step": 7809 }, { "epoch": 5.172185430463577, "grad_norm": 0.8248779428480717, "learning_rate": 2.2612330125757266e-05, "loss": 0.248, "step": 7810 }, { "epoch": 5.1728476821192055, "grad_norm": 0.7960577424955629, "learning_rate": 2.2577100669274024e-05, "loss": 0.2754, "step": 7811 }, { "epoch": 5.1735099337748345, "grad_norm": 0.6245494287907758, "learning_rate": 2.2541896443985697e-05, "loss": 0.1924, "step": 7812 }, { "epoch": 5.174172185430463, "grad_norm": 0.7473964518000902, "learning_rate": 2.250671745686311e-05, "loss": 0.2402, "step": 7813 }, { "epoch": 5.174834437086092, "grad_norm": 0.7866086139276683, "learning_rate": 2.2471563714872127e-05, "loss": 0.2812, "step": 7814 }, { "epoch": 5.175496688741722, "grad_norm": 0.7868871682517076, "learning_rate": 2.2436435224973624e-05, "loss": 0.2383, "step": 7815 }, { "epoch": 5.176158940397351, "grad_norm": 0.7069579841525414, "learning_rate": 2.240133199412338e-05, "loss": 0.2412, "step": 7816 }, { "epoch": 5.17682119205298, "grad_norm": 0.7741840927510586, "learning_rate": 2.2366254029272324e-05, "loss": 0.2793, "step": 7817 }, { "epoch": 5.177483443708609, "grad_norm": 0.6504363679174688, "learning_rate": 2.233120133736619e-05, "loss": 0.2012, "step": 7818 }, { "epoch": 5.178145695364238, "grad_norm": 0.7472154203345664, "learning_rate": 2.2296173925345874e-05, "loss": 0.2754, "step": 7819 }, { "epoch": 5.178807947019868, "grad_norm": 0.9395590951717676, "learning_rate": 2.2261171800147228e-05, "loss": 0.3164, "step": 7820 }, { "epoch": 5.179470198675497, "grad_norm": 0.7232906213178408, "learning_rate": 2.222619496870104e-05, "loss": 0.2441, "step": 7821 }, { "epoch": 5.180132450331126, "grad_norm": 0.6917111067852199, "learning_rate": 2.2191243437933103e-05, "loss": 0.2402, "step": 7822 }, { "epoch": 5.180794701986755, "grad_norm": 0.801163140962666, "learning_rate": 2.2156317214764235e-05, "loss": 0.2969, "step": 7823 }, { "epoch": 5.1814569536423845, "grad_norm": 0.7272144161344692, "learning_rate": 2.2121416306110286e-05, "loss": 0.2461, "step": 7824 }, { "epoch": 5.182119205298013, "grad_norm": 0.8017637000059644, "learning_rate": 2.208654071888194e-05, "loss": 0.2871, "step": 7825 }, { "epoch": 5.182781456953642, "grad_norm": 0.7292843230729871, "learning_rate": 2.2051690459985044e-05, "loss": 0.2334, "step": 7826 }, { "epoch": 5.183443708609271, "grad_norm": 0.691693179915796, "learning_rate": 2.2016865536320293e-05, "loss": 0.2578, "step": 7827 }, { "epoch": 5.184105960264901, "grad_norm": 0.8590929320266867, "learning_rate": 2.198206595478344e-05, "loss": 0.2891, "step": 7828 }, { "epoch": 5.18476821192053, "grad_norm": 0.8534153344779773, "learning_rate": 2.1947291722265236e-05, "loss": 0.3047, "step": 7829 }, { "epoch": 5.185430463576159, "grad_norm": 0.7570824585480698, "learning_rate": 2.191254284565136e-05, "loss": 0.2461, "step": 7830 }, { "epoch": 5.186092715231788, "grad_norm": 0.64961243562121, "learning_rate": 2.1877819331822404e-05, "loss": 0.2207, "step": 7831 }, { "epoch": 5.186754966887417, "grad_norm": 0.7170010319819818, "learning_rate": 2.1843121187654154e-05, "loss": 0.2432, "step": 7832 }, { "epoch": 5.187417218543047, "grad_norm": 0.7639123302856651, "learning_rate": 2.180844842001718e-05, "loss": 0.2617, "step": 7833 }, { "epoch": 5.188079470198676, "grad_norm": 0.8247615476544231, "learning_rate": 2.177380103577706e-05, "loss": 0.2715, "step": 7834 }, { "epoch": 5.188741721854305, "grad_norm": 0.8239149477043434, "learning_rate": 2.1739179041794387e-05, "loss": 0.3105, "step": 7835 }, { "epoch": 5.1894039735099335, "grad_norm": 0.806295474284952, "learning_rate": 2.1704582444924774e-05, "loss": 0.2988, "step": 7836 }, { "epoch": 5.1900662251655625, "grad_norm": 0.7234980921769271, "learning_rate": 2.1670011252018693e-05, "loss": 0.2695, "step": 7837 }, { "epoch": 5.190728476821192, "grad_norm": 0.7916533289482816, "learning_rate": 2.1635465469921605e-05, "loss": 0.2441, "step": 7838 }, { "epoch": 5.191390728476821, "grad_norm": 0.6991993947714319, "learning_rate": 2.1600945105473998e-05, "loss": 0.2227, "step": 7839 }, { "epoch": 5.19205298013245, "grad_norm": 0.7583537012665814, "learning_rate": 2.1566450165511347e-05, "loss": 0.252, "step": 7840 }, { "epoch": 5.192715231788079, "grad_norm": 0.8020611092768729, "learning_rate": 2.153198065686398e-05, "loss": 0.3086, "step": 7841 }, { "epoch": 5.193377483443709, "grad_norm": 0.7821332148868907, "learning_rate": 2.1497536586357305e-05, "loss": 0.2852, "step": 7842 }, { "epoch": 5.194039735099338, "grad_norm": 0.7134476597583137, "learning_rate": 2.1463117960811587e-05, "loss": 0.2559, "step": 7843 }, { "epoch": 5.194701986754967, "grad_norm": 0.8481653724693978, "learning_rate": 2.142872478704214e-05, "loss": 0.293, "step": 7844 }, { "epoch": 5.195364238410596, "grad_norm": 0.6244441117367191, "learning_rate": 2.139435707185923e-05, "loss": 0.1953, "step": 7845 }, { "epoch": 5.196026490066225, "grad_norm": 0.8216860712768915, "learning_rate": 2.136001482206805e-05, "loss": 0.2754, "step": 7846 }, { "epoch": 5.196688741721855, "grad_norm": 0.9395771798685858, "learning_rate": 2.1325698044468707e-05, "loss": 0.3184, "step": 7847 }, { "epoch": 5.1973509933774835, "grad_norm": 0.7966636242608205, "learning_rate": 2.1291406745856353e-05, "loss": 0.2715, "step": 7848 }, { "epoch": 5.1980132450331125, "grad_norm": 0.7843248582356047, "learning_rate": 2.125714093302109e-05, "loss": 0.2832, "step": 7849 }, { "epoch": 5.198675496688741, "grad_norm": 0.7089643606691242, "learning_rate": 2.1222900612747882e-05, "loss": 0.2451, "step": 7850 }, { "epoch": 5.199337748344371, "grad_norm": 0.7545923243966016, "learning_rate": 2.1188685791816734e-05, "loss": 0.2539, "step": 7851 }, { "epoch": 5.2, "grad_norm": 0.6145839073841828, "learning_rate": 2.1154496477002604e-05, "loss": 0.1934, "step": 7852 }, { "epoch": 5.200662251655629, "grad_norm": 0.8258804921652623, "learning_rate": 2.112033267507529e-05, "loss": 0.291, "step": 7853 }, { "epoch": 5.201324503311258, "grad_norm": 0.6946786096307236, "learning_rate": 2.108619439279972e-05, "loss": 0.2559, "step": 7854 }, { "epoch": 5.201986754966887, "grad_norm": 0.7213162679044548, "learning_rate": 2.105208163693557e-05, "loss": 0.2422, "step": 7855 }, { "epoch": 5.202649006622517, "grad_norm": 0.7822093233457468, "learning_rate": 2.101799441423758e-05, "loss": 0.2871, "step": 7856 }, { "epoch": 5.203311258278146, "grad_norm": 0.8267984183229521, "learning_rate": 2.0983932731455476e-05, "loss": 0.3203, "step": 7857 }, { "epoch": 5.203973509933775, "grad_norm": 0.6524904445273471, "learning_rate": 2.094989659533381e-05, "loss": 0.2266, "step": 7858 }, { "epoch": 5.204635761589404, "grad_norm": 0.6867435681303443, "learning_rate": 2.0915886012612086e-05, "loss": 0.2354, "step": 7859 }, { "epoch": 5.2052980132450335, "grad_norm": 0.7443296483922031, "learning_rate": 2.0881900990024842e-05, "loss": 0.2344, "step": 7860 }, { "epoch": 5.205960264900662, "grad_norm": 0.8009401097429937, "learning_rate": 2.084794153430154e-05, "loss": 0.2715, "step": 7861 }, { "epoch": 5.206622516556291, "grad_norm": 0.6800988859515609, "learning_rate": 2.081400765216643e-05, "loss": 0.2441, "step": 7862 }, { "epoch": 5.20728476821192, "grad_norm": 0.7658912952819359, "learning_rate": 2.078009935033893e-05, "loss": 0.2773, "step": 7863 }, { "epoch": 5.207947019867549, "grad_norm": 0.751982343956833, "learning_rate": 2.074621663553318e-05, "loss": 0.2598, "step": 7864 }, { "epoch": 5.208609271523179, "grad_norm": 0.7639573027285648, "learning_rate": 2.0712359514458366e-05, "loss": 0.2812, "step": 7865 }, { "epoch": 5.209271523178808, "grad_norm": 0.6831280557834392, "learning_rate": 2.0678527993818657e-05, "loss": 0.2178, "step": 7866 }, { "epoch": 5.209933774834437, "grad_norm": 0.8150073574257312, "learning_rate": 2.0644722080313007e-05, "loss": 0.2754, "step": 7867 }, { "epoch": 5.210596026490066, "grad_norm": 0.7671141751603547, "learning_rate": 2.0610941780635354e-05, "loss": 0.2734, "step": 7868 }, { "epoch": 5.211258278145696, "grad_norm": 0.7933255863244286, "learning_rate": 2.0577187101474674e-05, "loss": 0.252, "step": 7869 }, { "epoch": 5.211920529801325, "grad_norm": 0.8636251138367256, "learning_rate": 2.054345804951474e-05, "loss": 0.2773, "step": 7870 }, { "epoch": 5.212582781456954, "grad_norm": 0.7990707397393219, "learning_rate": 2.0509754631434238e-05, "loss": 0.2988, "step": 7871 }, { "epoch": 5.213245033112583, "grad_norm": 0.6783036740295046, "learning_rate": 2.0476076853906887e-05, "loss": 0.2314, "step": 7872 }, { "epoch": 5.2139072847682115, "grad_norm": 0.7531432680149214, "learning_rate": 2.04424247236013e-05, "loss": 0.2256, "step": 7873 }, { "epoch": 5.214569536423841, "grad_norm": 0.7732736746507498, "learning_rate": 2.0408798247180892e-05, "loss": 0.2656, "step": 7874 }, { "epoch": 5.21523178807947, "grad_norm": 0.7470185065825986, "learning_rate": 2.0375197431304192e-05, "loss": 0.2617, "step": 7875 }, { "epoch": 5.215894039735099, "grad_norm": 0.8260755186738208, "learning_rate": 2.0341622282624475e-05, "loss": 0.2969, "step": 7876 }, { "epoch": 5.216556291390728, "grad_norm": 0.9879287370913481, "learning_rate": 2.0308072807790026e-05, "loss": 0.3203, "step": 7877 }, { "epoch": 5.217218543046358, "grad_norm": 0.6561622019317833, "learning_rate": 2.0274549013444064e-05, "loss": 0.2109, "step": 7878 }, { "epoch": 5.217880794701987, "grad_norm": 0.7510663162973034, "learning_rate": 2.024105090622464e-05, "loss": 0.2559, "step": 7879 }, { "epoch": 5.218543046357616, "grad_norm": 0.6889266108870015, "learning_rate": 2.020757849276476e-05, "loss": 0.2129, "step": 7880 }, { "epoch": 5.219205298013245, "grad_norm": 0.7181296877900692, "learning_rate": 2.0174131779692343e-05, "loss": 0.2373, "step": 7881 }, { "epoch": 5.219867549668874, "grad_norm": 0.761794897635098, "learning_rate": 2.014071077363028e-05, "loss": 0.2373, "step": 7882 }, { "epoch": 5.220529801324504, "grad_norm": 0.9157205155805359, "learning_rate": 2.010731548119623e-05, "loss": 0.2969, "step": 7883 }, { "epoch": 5.221192052980133, "grad_norm": 0.769945762165555, "learning_rate": 2.0073945909002932e-05, "loss": 0.2598, "step": 7884 }, { "epoch": 5.2218543046357615, "grad_norm": 0.6875768181166205, "learning_rate": 2.0040602063657868e-05, "loss": 0.2354, "step": 7885 }, { "epoch": 5.22251655629139, "grad_norm": 0.761719545264721, "learning_rate": 2.000728395176351e-05, "loss": 0.2676, "step": 7886 }, { "epoch": 5.22317880794702, "grad_norm": 0.8300738737661454, "learning_rate": 1.9973991579917293e-05, "loss": 0.2891, "step": 7887 }, { "epoch": 5.223841059602649, "grad_norm": 0.7559725731714821, "learning_rate": 1.9940724954711414e-05, "loss": 0.2275, "step": 7888 }, { "epoch": 5.224503311258278, "grad_norm": 0.7890720223389843, "learning_rate": 1.9907484082733067e-05, "loss": 0.2891, "step": 7889 }, { "epoch": 5.225165562913907, "grad_norm": 0.7151515808464772, "learning_rate": 1.987426897056437e-05, "loss": 0.2598, "step": 7890 }, { "epoch": 5.225827814569536, "grad_norm": 0.795159509871878, "learning_rate": 1.9841079624782267e-05, "loss": 0.2793, "step": 7891 }, { "epoch": 5.226490066225166, "grad_norm": 0.6997504839004208, "learning_rate": 1.9807916051958584e-05, "loss": 0.2334, "step": 7892 }, { "epoch": 5.227152317880795, "grad_norm": 0.7878210230088095, "learning_rate": 1.9774778258660114e-05, "loss": 0.2852, "step": 7893 }, { "epoch": 5.227814569536424, "grad_norm": 0.740962289246586, "learning_rate": 1.9741666251448572e-05, "loss": 0.2344, "step": 7894 }, { "epoch": 5.228476821192053, "grad_norm": 0.6534268555726958, "learning_rate": 1.970858003688049e-05, "loss": 0.1846, "step": 7895 }, { "epoch": 5.2291390728476825, "grad_norm": 0.8160893194070954, "learning_rate": 1.967551962150725e-05, "loss": 0.2891, "step": 7896 }, { "epoch": 5.2298013245033115, "grad_norm": 0.857152422118708, "learning_rate": 1.964248501187526e-05, "loss": 0.2598, "step": 7897 }, { "epoch": 5.23046357615894, "grad_norm": 0.737825128083667, "learning_rate": 1.960947621452576e-05, "loss": 0.2637, "step": 7898 }, { "epoch": 5.231125827814569, "grad_norm": 0.8304129359859337, "learning_rate": 1.9576493235994826e-05, "loss": 0.2891, "step": 7899 }, { "epoch": 5.231788079470198, "grad_norm": 0.6782967651452708, "learning_rate": 1.9543536082813522e-05, "loss": 0.2275, "step": 7900 }, { "epoch": 5.232450331125828, "grad_norm": 0.7436280225406535, "learning_rate": 1.951060476150767e-05, "loss": 0.2598, "step": 7901 }, { "epoch": 5.233112582781457, "grad_norm": 0.7740599879493457, "learning_rate": 1.9477699278598073e-05, "loss": 0.2217, "step": 7902 }, { "epoch": 5.233774834437086, "grad_norm": 0.7235351000288407, "learning_rate": 1.9444819640600456e-05, "loss": 0.2139, "step": 7903 }, { "epoch": 5.234437086092715, "grad_norm": 0.9073241661614054, "learning_rate": 1.9411965854025324e-05, "loss": 0.3223, "step": 7904 }, { "epoch": 5.235099337748345, "grad_norm": 0.6191824626793746, "learning_rate": 1.9379137925378023e-05, "loss": 0.1953, "step": 7905 }, { "epoch": 5.235761589403974, "grad_norm": 0.7166782722205333, "learning_rate": 1.934633586115899e-05, "loss": 0.2451, "step": 7906 }, { "epoch": 5.236423841059603, "grad_norm": 0.6879512857012469, "learning_rate": 1.931355966786336e-05, "loss": 0.2031, "step": 7907 }, { "epoch": 5.237086092715232, "grad_norm": 0.8087431839652018, "learning_rate": 1.928080935198114e-05, "loss": 0.3125, "step": 7908 }, { "epoch": 5.237748344370861, "grad_norm": 0.8061488740893928, "learning_rate": 1.924808491999733e-05, "loss": 0.293, "step": 7909 }, { "epoch": 5.23841059602649, "grad_norm": 0.826498377892661, "learning_rate": 1.9215386378391763e-05, "loss": 0.3047, "step": 7910 }, { "epoch": 5.239072847682119, "grad_norm": 0.7654925352219261, "learning_rate": 1.9182713733639055e-05, "loss": 0.252, "step": 7911 }, { "epoch": 5.239735099337748, "grad_norm": 0.8919958068766045, "learning_rate": 1.915006699220882e-05, "loss": 0.3066, "step": 7912 }, { "epoch": 5.240397350993377, "grad_norm": 0.7521641041059619, "learning_rate": 1.9117446160565446e-05, "loss": 0.2656, "step": 7913 }, { "epoch": 5.241059602649006, "grad_norm": 0.7750077729378727, "learning_rate": 1.9084851245168242e-05, "loss": 0.25, "step": 7914 }, { "epoch": 5.241721854304636, "grad_norm": 0.7086573573327722, "learning_rate": 1.9052282252471422e-05, "loss": 0.252, "step": 7915 }, { "epoch": 5.242384105960265, "grad_norm": 0.8003024442763168, "learning_rate": 1.9019739188923984e-05, "loss": 0.3008, "step": 7916 }, { "epoch": 5.243046357615894, "grad_norm": 0.7399107732771303, "learning_rate": 1.8987222060969798e-05, "loss": 0.2598, "step": 7917 }, { "epoch": 5.243708609271523, "grad_norm": 0.6869904443572256, "learning_rate": 1.8954730875047653e-05, "loss": 0.2393, "step": 7918 }, { "epoch": 5.244370860927153, "grad_norm": 0.7365286220424062, "learning_rate": 1.89222656375912e-05, "loss": 0.2793, "step": 7919 }, { "epoch": 5.245033112582782, "grad_norm": 0.7372094423581625, "learning_rate": 1.8889826355028897e-05, "loss": 0.2559, "step": 7920 }, { "epoch": 5.2456953642384105, "grad_norm": 0.7187929352700633, "learning_rate": 1.8857413033784125e-05, "loss": 0.2266, "step": 7921 }, { "epoch": 5.2463576158940395, "grad_norm": 0.8472812920726802, "learning_rate": 1.882502568027503e-05, "loss": 0.2949, "step": 7922 }, { "epoch": 5.247019867549669, "grad_norm": 0.7074531065898034, "learning_rate": 1.8792664300914735e-05, "loss": 0.2402, "step": 7923 }, { "epoch": 5.247682119205298, "grad_norm": 0.61640663806564, "learning_rate": 1.8760328902111194e-05, "loss": 0.208, "step": 7924 }, { "epoch": 5.248344370860927, "grad_norm": 0.776443456901954, "learning_rate": 1.872801949026712e-05, "loss": 0.2109, "step": 7925 }, { "epoch": 5.249006622516556, "grad_norm": 0.7133866616249277, "learning_rate": 1.8695736071780133e-05, "loss": 0.21, "step": 7926 }, { "epoch": 5.249668874172185, "grad_norm": 0.6485833223334139, "learning_rate": 1.8663478653042797e-05, "loss": 0.2119, "step": 7927 }, { "epoch": 5.250331125827815, "grad_norm": 0.7427085708559611, "learning_rate": 1.8631247240442436e-05, "loss": 0.2129, "step": 7928 }, { "epoch": 5.250993377483444, "grad_norm": 0.7499296744343822, "learning_rate": 1.859904184036117e-05, "loss": 0.252, "step": 7929 }, { "epoch": 5.251655629139073, "grad_norm": 0.7985085171721937, "learning_rate": 1.8566862459176086e-05, "loss": 0.2656, "step": 7930 }, { "epoch": 5.252317880794702, "grad_norm": 0.9300195438691514, "learning_rate": 1.8534709103259095e-05, "loss": 0.3203, "step": 7931 }, { "epoch": 5.252980132450331, "grad_norm": 0.7642091787500077, "learning_rate": 1.8502581778976865e-05, "loss": 0.2559, "step": 7932 }, { "epoch": 5.2536423841059605, "grad_norm": 0.6849708734419366, "learning_rate": 1.8470480492691046e-05, "loss": 0.2119, "step": 7933 }, { "epoch": 5.2543046357615895, "grad_norm": 0.7035300854114488, "learning_rate": 1.8438405250758005e-05, "loss": 0.2334, "step": 7934 }, { "epoch": 5.254966887417218, "grad_norm": 0.7333351655402648, "learning_rate": 1.840635605952902e-05, "loss": 0.2217, "step": 7935 }, { "epoch": 5.255629139072847, "grad_norm": 0.9347980105663337, "learning_rate": 1.837433292535024e-05, "loss": 0.334, "step": 7936 }, { "epoch": 5.256291390728477, "grad_norm": 0.7373690683431211, "learning_rate": 1.8342335854562584e-05, "loss": 0.2227, "step": 7937 }, { "epoch": 5.256953642384106, "grad_norm": 0.6773016203022948, "learning_rate": 1.831036485350179e-05, "loss": 0.2236, "step": 7938 }, { "epoch": 5.257615894039735, "grad_norm": 0.8117642600088054, "learning_rate": 1.8278419928498543e-05, "loss": 0.3008, "step": 7939 }, { "epoch": 5.258278145695364, "grad_norm": 0.5921443733774829, "learning_rate": 1.82465010858783e-05, "loss": 0.2031, "step": 7940 }, { "epoch": 5.258940397350994, "grad_norm": 0.79400144697917, "learning_rate": 1.8214608331961338e-05, "loss": 0.2539, "step": 7941 }, { "epoch": 5.259602649006623, "grad_norm": 0.8095376884114691, "learning_rate": 1.8182741673062788e-05, "loss": 0.2754, "step": 7942 }, { "epoch": 5.260264900662252, "grad_norm": 0.6887194395153051, "learning_rate": 1.8150901115492656e-05, "loss": 0.2061, "step": 7943 }, { "epoch": 5.260927152317881, "grad_norm": 0.7188388448071006, "learning_rate": 1.811908666555566e-05, "loss": 0.2715, "step": 7944 }, { "epoch": 5.26158940397351, "grad_norm": 0.6546705021321286, "learning_rate": 1.8087298329551514e-05, "loss": 0.21, "step": 7945 }, { "epoch": 5.262251655629139, "grad_norm": 0.7282351660035895, "learning_rate": 1.80555361137746e-05, "loss": 0.2451, "step": 7946 }, { "epoch": 5.262913907284768, "grad_norm": 0.6548286582061331, "learning_rate": 1.8023800024514216e-05, "loss": 0.1973, "step": 7947 }, { "epoch": 5.263576158940397, "grad_norm": 0.7495955533899803, "learning_rate": 1.7992090068054526e-05, "loss": 0.2158, "step": 7948 }, { "epoch": 5.264238410596026, "grad_norm": 0.6720868251874537, "learning_rate": 1.7960406250674418e-05, "loss": 0.2344, "step": 7949 }, { "epoch": 5.264900662251655, "grad_norm": 0.7270314661402414, "learning_rate": 1.792874857864763e-05, "loss": 0.2383, "step": 7950 }, { "epoch": 5.265562913907285, "grad_norm": 0.6482808754839535, "learning_rate": 1.789711705824277e-05, "loss": 0.1709, "step": 7951 }, { "epoch": 5.266225165562914, "grad_norm": 0.7339027704826079, "learning_rate": 1.786551169572327e-05, "loss": 0.249, "step": 7952 }, { "epoch": 5.266887417218543, "grad_norm": 0.7516874149450281, "learning_rate": 1.783393249734734e-05, "loss": 0.2695, "step": 7953 }, { "epoch": 5.267549668874172, "grad_norm": 0.7135721213880135, "learning_rate": 1.780237946936796e-05, "loss": 0.2275, "step": 7954 }, { "epoch": 5.268211920529802, "grad_norm": 0.7632363839841702, "learning_rate": 1.7770852618033048e-05, "loss": 0.252, "step": 7955 }, { "epoch": 5.268874172185431, "grad_norm": 0.7847594420236709, "learning_rate": 1.773935194958531e-05, "loss": 0.2539, "step": 7956 }, { "epoch": 5.26953642384106, "grad_norm": 0.7970442442093321, "learning_rate": 1.7707877470262176e-05, "loss": 0.2852, "step": 7957 }, { "epoch": 5.2701986754966885, "grad_norm": 0.7830175258830214, "learning_rate": 1.7676429186296008e-05, "loss": 0.293, "step": 7958 }, { "epoch": 5.2708609271523175, "grad_norm": 0.7724537437183625, "learning_rate": 1.7645007103913877e-05, "loss": 0.2402, "step": 7959 }, { "epoch": 5.271523178807947, "grad_norm": 0.6441592542769375, "learning_rate": 1.761361122933774e-05, "loss": 0.2002, "step": 7960 }, { "epoch": 5.272185430463576, "grad_norm": 0.6780491321854711, "learning_rate": 1.7582241568784367e-05, "loss": 0.1914, "step": 7961 }, { "epoch": 5.272847682119205, "grad_norm": 0.7843465604244393, "learning_rate": 1.755089812846528e-05, "loss": 0.2676, "step": 7962 }, { "epoch": 5.273509933774834, "grad_norm": 0.8460037524301623, "learning_rate": 1.751958091458678e-05, "loss": 0.3066, "step": 7963 }, { "epoch": 5.274172185430464, "grad_norm": 0.6787237071126669, "learning_rate": 1.7488289933350168e-05, "loss": 0.2148, "step": 7964 }, { "epoch": 5.274834437086093, "grad_norm": 0.6779377647869225, "learning_rate": 1.7457025190951344e-05, "loss": 0.2197, "step": 7965 }, { "epoch": 5.275496688741722, "grad_norm": 0.9127408928580059, "learning_rate": 1.7425786693581057e-05, "loss": 0.2949, "step": 7966 }, { "epoch": 5.276158940397351, "grad_norm": 0.8119680153441015, "learning_rate": 1.739457444742492e-05, "loss": 0.2871, "step": 7967 }, { "epoch": 5.27682119205298, "grad_norm": 0.7548386737351861, "learning_rate": 1.736338845866334e-05, "loss": 0.2373, "step": 7968 }, { "epoch": 5.27748344370861, "grad_norm": 0.6898523205000069, "learning_rate": 1.7332228733471433e-05, "loss": 0.2275, "step": 7969 }, { "epoch": 5.2781456953642385, "grad_norm": 0.7687371226119528, "learning_rate": 1.7301095278019272e-05, "loss": 0.2734, "step": 7970 }, { "epoch": 5.278807947019867, "grad_norm": 0.8167464633390286, "learning_rate": 1.726998809847155e-05, "loss": 0.2773, "step": 7971 }, { "epoch": 5.279470198675496, "grad_norm": 0.5993317301250232, "learning_rate": 1.7238907200987885e-05, "loss": 0.1826, "step": 7972 }, { "epoch": 5.280132450331126, "grad_norm": 0.7109700283338855, "learning_rate": 1.7207852591722686e-05, "loss": 0.252, "step": 7973 }, { "epoch": 5.280794701986755, "grad_norm": 0.8931880179563709, "learning_rate": 1.7176824276825075e-05, "loss": 0.3086, "step": 7974 }, { "epoch": 5.281456953642384, "grad_norm": 0.779421517158717, "learning_rate": 1.7145822262439015e-05, "loss": 0.2676, "step": 7975 }, { "epoch": 5.282119205298013, "grad_norm": 0.7351259852084753, "learning_rate": 1.711484655470326e-05, "loss": 0.2236, "step": 7976 }, { "epoch": 5.282781456953642, "grad_norm": 0.7217881148605702, "learning_rate": 1.7083897159751392e-05, "loss": 0.2539, "step": 7977 }, { "epoch": 5.283443708609272, "grad_norm": 0.8161987976426147, "learning_rate": 1.705297408371169e-05, "loss": 0.2598, "step": 7978 }, { "epoch": 5.284105960264901, "grad_norm": 0.7419313117716437, "learning_rate": 1.702207733270734e-05, "loss": 0.2324, "step": 7979 }, { "epoch": 5.28476821192053, "grad_norm": 0.7344737137503939, "learning_rate": 1.699120691285618e-05, "loss": 0.2432, "step": 7980 }, { "epoch": 5.285430463576159, "grad_norm": 0.8403808901782538, "learning_rate": 1.696036283027096e-05, "loss": 0.2773, "step": 7981 }, { "epoch": 5.2860927152317885, "grad_norm": 0.7995969822485023, "learning_rate": 1.6929545091059184e-05, "loss": 0.2598, "step": 7982 }, { "epoch": 5.286754966887417, "grad_norm": 0.8638478822797097, "learning_rate": 1.6898753701323043e-05, "loss": 0.291, "step": 7983 }, { "epoch": 5.287417218543046, "grad_norm": 0.7229893962646534, "learning_rate": 1.6867988667159625e-05, "loss": 0.2227, "step": 7984 }, { "epoch": 5.288079470198675, "grad_norm": 0.9362202853223729, "learning_rate": 1.6837249994660794e-05, "loss": 0.334, "step": 7985 }, { "epoch": 5.288741721854304, "grad_norm": 0.788656762071385, "learning_rate": 1.6806537689913113e-05, "loss": 0.2793, "step": 7986 }, { "epoch": 5.289403973509934, "grad_norm": 0.6370836095391719, "learning_rate": 1.6775851758997966e-05, "loss": 0.1982, "step": 7987 }, { "epoch": 5.290066225165563, "grad_norm": 0.6828275624797505, "learning_rate": 1.6745192207991527e-05, "loss": 0.1982, "step": 7988 }, { "epoch": 5.290728476821192, "grad_norm": 0.8294928972465531, "learning_rate": 1.6714559042964786e-05, "loss": 0.2715, "step": 7989 }, { "epoch": 5.291390728476821, "grad_norm": 0.8355373736769015, "learning_rate": 1.6683952269983397e-05, "loss": 0.2754, "step": 7990 }, { "epoch": 5.292052980132451, "grad_norm": 0.748831502763545, "learning_rate": 1.665337189510789e-05, "loss": 0.2734, "step": 7991 }, { "epoch": 5.29271523178808, "grad_norm": 0.8038387893959709, "learning_rate": 1.6622817924393496e-05, "loss": 0.2832, "step": 7992 }, { "epoch": 5.293377483443709, "grad_norm": 0.8518082541684192, "learning_rate": 1.659229036389028e-05, "loss": 0.3066, "step": 7993 }, { "epoch": 5.294039735099338, "grad_norm": 0.8073892977347925, "learning_rate": 1.656178921964305e-05, "loss": 0.2773, "step": 7994 }, { "epoch": 5.2947019867549665, "grad_norm": 0.7115917237251048, "learning_rate": 1.653131449769139e-05, "loss": 0.2383, "step": 7995 }, { "epoch": 5.295364238410596, "grad_norm": 0.7465816932716353, "learning_rate": 1.65008662040696e-05, "loss": 0.2432, "step": 7996 }, { "epoch": 5.296026490066225, "grad_norm": 0.7800854869686216, "learning_rate": 1.6470444344806816e-05, "loss": 0.25, "step": 7997 }, { "epoch": 5.296688741721854, "grad_norm": 0.7087002327778797, "learning_rate": 1.6440048925926948e-05, "loss": 0.2412, "step": 7998 }, { "epoch": 5.297350993377483, "grad_norm": 0.7691620236605988, "learning_rate": 1.640967995344858e-05, "loss": 0.252, "step": 7999 }, { "epoch": 5.298013245033113, "grad_norm": 0.8919828131603305, "learning_rate": 1.6379337433385138e-05, "loss": 0.3066, "step": 8000 }, { "epoch": 5.298675496688742, "grad_norm": 0.7161009282025457, "learning_rate": 1.634902137174483e-05, "loss": 0.2471, "step": 8001 }, { "epoch": 5.299337748344371, "grad_norm": 0.7342675550219723, "learning_rate": 1.6318731774530506e-05, "loss": 0.248, "step": 8002 }, { "epoch": 5.3, "grad_norm": 0.7595454341242521, "learning_rate": 1.628846864773994e-05, "loss": 0.2471, "step": 8003 }, { "epoch": 5.300662251655629, "grad_norm": 0.7521242169778093, "learning_rate": 1.625823199736551e-05, "loss": 0.2227, "step": 8004 }, { "epoch": 5.301324503311259, "grad_norm": 0.7104859619242662, "learning_rate": 1.622802182939444e-05, "loss": 0.2422, "step": 8005 }, { "epoch": 5.3019867549668875, "grad_norm": 0.8335162576467143, "learning_rate": 1.6197838149808717e-05, "loss": 0.2969, "step": 8006 }, { "epoch": 5.3026490066225165, "grad_norm": 0.7381661580603504, "learning_rate": 1.6167680964585033e-05, "loss": 0.2432, "step": 8007 }, { "epoch": 5.303311258278145, "grad_norm": 0.7697856224117433, "learning_rate": 1.613755027969483e-05, "loss": 0.2754, "step": 8008 }, { "epoch": 5.303973509933774, "grad_norm": 0.7617559791722613, "learning_rate": 1.6107446101104343e-05, "loss": 0.2852, "step": 8009 }, { "epoch": 5.304635761589404, "grad_norm": 0.7688447781625987, "learning_rate": 1.6077368434774597e-05, "loss": 0.2471, "step": 8010 }, { "epoch": 5.305298013245033, "grad_norm": 0.7226689335581242, "learning_rate": 1.604731728666127e-05, "loss": 0.2715, "step": 8011 }, { "epoch": 5.305960264900662, "grad_norm": 0.7725143756988609, "learning_rate": 1.601729266271481e-05, "loss": 0.2363, "step": 8012 }, { "epoch": 5.306622516556291, "grad_norm": 0.7207629112468867, "learning_rate": 1.5987294568880455e-05, "loss": 0.2266, "step": 8013 }, { "epoch": 5.307284768211921, "grad_norm": 0.680519637929438, "learning_rate": 1.5957323011098222e-05, "loss": 0.1914, "step": 8014 }, { "epoch": 5.30794701986755, "grad_norm": 0.7686441246641283, "learning_rate": 1.5927377995302747e-05, "loss": 0.2617, "step": 8015 }, { "epoch": 5.308609271523179, "grad_norm": 0.730025125195389, "learning_rate": 1.5897459527423556e-05, "loss": 0.2246, "step": 8016 }, { "epoch": 5.309271523178808, "grad_norm": 0.7095257039566637, "learning_rate": 1.5867567613384774e-05, "loss": 0.2344, "step": 8017 }, { "epoch": 5.3099337748344375, "grad_norm": 0.712177172534391, "learning_rate": 1.58377022591054e-05, "loss": 0.2363, "step": 8018 }, { "epoch": 5.3105960264900665, "grad_norm": 0.7099658537010103, "learning_rate": 1.580786347049912e-05, "loss": 0.25, "step": 8019 }, { "epoch": 5.311258278145695, "grad_norm": 0.7598955477276265, "learning_rate": 1.5778051253474328e-05, "loss": 0.2793, "step": 8020 }, { "epoch": 5.311920529801324, "grad_norm": 0.6599488963677888, "learning_rate": 1.5748265613934146e-05, "loss": 0.2178, "step": 8021 }, { "epoch": 5.312582781456953, "grad_norm": 0.7130910037952829, "learning_rate": 1.571850655777656e-05, "loss": 0.25, "step": 8022 }, { "epoch": 5.313245033112583, "grad_norm": 0.70726632600197, "learning_rate": 1.5688774090894174e-05, "loss": 0.2432, "step": 8023 }, { "epoch": 5.313907284768212, "grad_norm": 0.7505602569981783, "learning_rate": 1.5659068219174325e-05, "loss": 0.2383, "step": 8024 }, { "epoch": 5.314569536423841, "grad_norm": 0.8369938657147894, "learning_rate": 1.5629388948499126e-05, "loss": 0.293, "step": 8025 }, { "epoch": 5.31523178807947, "grad_norm": 0.7837462576587424, "learning_rate": 1.5599736284745444e-05, "loss": 0.2559, "step": 8026 }, { "epoch": 5.315894039735099, "grad_norm": 0.6959392875476804, "learning_rate": 1.5570110233784816e-05, "loss": 0.2344, "step": 8027 }, { "epoch": 5.316556291390729, "grad_norm": 0.7201707803254381, "learning_rate": 1.554051080148358e-05, "loss": 0.2432, "step": 8028 }, { "epoch": 5.317218543046358, "grad_norm": 0.7824244705095718, "learning_rate": 1.551093799370268e-05, "loss": 0.2773, "step": 8029 }, { "epoch": 5.317880794701987, "grad_norm": 0.8530055420298697, "learning_rate": 1.5481391816297956e-05, "loss": 0.2793, "step": 8030 }, { "epoch": 5.3185430463576155, "grad_norm": 0.777585389625231, "learning_rate": 1.545187227511987e-05, "loss": 0.2754, "step": 8031 }, { "epoch": 5.319205298013245, "grad_norm": 0.7889343015717932, "learning_rate": 1.5422379376013615e-05, "loss": 0.2773, "step": 8032 }, { "epoch": 5.319867549668874, "grad_norm": 0.8374539124221668, "learning_rate": 1.5392913124819106e-05, "loss": 0.3223, "step": 8033 }, { "epoch": 5.320529801324503, "grad_norm": 0.692479837902625, "learning_rate": 1.5363473527371006e-05, "loss": 0.2217, "step": 8034 }, { "epoch": 5.321192052980132, "grad_norm": 0.7558923536337373, "learning_rate": 1.5334060589498736e-05, "loss": 0.2559, "step": 8035 }, { "epoch": 5.321854304635762, "grad_norm": 0.7477017975474067, "learning_rate": 1.5304674317026338e-05, "loss": 0.2334, "step": 8036 }, { "epoch": 5.322516556291391, "grad_norm": 0.8381973476256512, "learning_rate": 1.5275314715772657e-05, "loss": 0.2969, "step": 8037 }, { "epoch": 5.32317880794702, "grad_norm": 0.7298714333744627, "learning_rate": 1.524598179155126e-05, "loss": 0.2344, "step": 8038 }, { "epoch": 5.323841059602649, "grad_norm": 0.6565277157097448, "learning_rate": 1.5216675550170348e-05, "loss": 0.2051, "step": 8039 }, { "epoch": 5.324503311258278, "grad_norm": 0.7765764066970834, "learning_rate": 1.518739599743295e-05, "loss": 0.252, "step": 8040 }, { "epoch": 5.325165562913908, "grad_norm": 0.8203342564757837, "learning_rate": 1.5158143139136675e-05, "loss": 0.2676, "step": 8041 }, { "epoch": 5.325827814569537, "grad_norm": 0.8454474062452767, "learning_rate": 1.5128916981073974e-05, "loss": 0.2812, "step": 8042 }, { "epoch": 5.3264900662251655, "grad_norm": 0.8248852089810802, "learning_rate": 1.5099717529032002e-05, "loss": 0.2988, "step": 8043 }, { "epoch": 5.3271523178807945, "grad_norm": 0.769150334674777, "learning_rate": 1.5070544788792532e-05, "loss": 0.25, "step": 8044 }, { "epoch": 5.327814569536423, "grad_norm": 0.8530608835527638, "learning_rate": 1.504139876613208e-05, "loss": 0.293, "step": 8045 }, { "epoch": 5.328476821192053, "grad_norm": 0.7251620538401817, "learning_rate": 1.5012279466821908e-05, "loss": 0.2441, "step": 8046 }, { "epoch": 5.329139072847682, "grad_norm": 0.8570179570353621, "learning_rate": 1.4983186896628042e-05, "loss": 0.3008, "step": 8047 }, { "epoch": 5.329801324503311, "grad_norm": 0.8136615154031157, "learning_rate": 1.4954121061311037e-05, "loss": 0.2695, "step": 8048 }, { "epoch": 5.33046357615894, "grad_norm": 0.6955191136404615, "learning_rate": 1.492508196662634e-05, "loss": 0.2373, "step": 8049 }, { "epoch": 5.33112582781457, "grad_norm": 0.7072012667937185, "learning_rate": 1.4896069618323985e-05, "loss": 0.2354, "step": 8050 }, { "epoch": 5.331788079470199, "grad_norm": 0.779486013021626, "learning_rate": 1.4867084022148746e-05, "loss": 0.2617, "step": 8051 }, { "epoch": 5.332450331125828, "grad_norm": 0.7903633861156499, "learning_rate": 1.483812518384016e-05, "loss": 0.248, "step": 8052 }, { "epoch": 5.333112582781457, "grad_norm": 0.7111625633884585, "learning_rate": 1.4809193109132362e-05, "loss": 0.2334, "step": 8053 }, { "epoch": 5.3337748344370866, "grad_norm": 0.7439465463306729, "learning_rate": 1.4780287803754181e-05, "loss": 0.2539, "step": 8054 }, { "epoch": 5.3344370860927155, "grad_norm": 0.7600225013024784, "learning_rate": 1.475140927342931e-05, "loss": 0.2578, "step": 8055 }, { "epoch": 5.335099337748344, "grad_norm": 0.7584012993543706, "learning_rate": 1.472255752387599e-05, "loss": 0.2734, "step": 8056 }, { "epoch": 5.335761589403973, "grad_norm": 0.6614252367719962, "learning_rate": 1.4693732560807147e-05, "loss": 0.2129, "step": 8057 }, { "epoch": 5.336423841059602, "grad_norm": 0.8407698237852888, "learning_rate": 1.4664934389930505e-05, "loss": 0.2734, "step": 8058 }, { "epoch": 5.337086092715232, "grad_norm": 0.73834019736086, "learning_rate": 1.463616301694845e-05, "loss": 0.2656, "step": 8059 }, { "epoch": 5.337748344370861, "grad_norm": 0.7252586684624422, "learning_rate": 1.4607418447557972e-05, "loss": 0.249, "step": 8060 }, { "epoch": 5.33841059602649, "grad_norm": 0.7757527177894843, "learning_rate": 1.457870068745089e-05, "loss": 0.3105, "step": 8061 }, { "epoch": 5.339072847682119, "grad_norm": 0.744245550807292, "learning_rate": 1.4550009742313618e-05, "loss": 0.2754, "step": 8062 }, { "epoch": 5.339735099337748, "grad_norm": 0.7528920580584396, "learning_rate": 1.452134561782728e-05, "loss": 0.2354, "step": 8063 }, { "epoch": 5.340397350993378, "grad_norm": 0.7742172318109762, "learning_rate": 1.4492708319667767e-05, "loss": 0.2441, "step": 8064 }, { "epoch": 5.341059602649007, "grad_norm": 0.7795519738458873, "learning_rate": 1.4464097853505546e-05, "loss": 0.249, "step": 8065 }, { "epoch": 5.341721854304636, "grad_norm": 0.7795912485267877, "learning_rate": 1.4435514225005784e-05, "loss": 0.25, "step": 8066 }, { "epoch": 5.342384105960265, "grad_norm": 0.7436473879365388, "learning_rate": 1.4406957439828387e-05, "loss": 0.2637, "step": 8067 }, { "epoch": 5.343046357615894, "grad_norm": 0.7662431429565507, "learning_rate": 1.4378427503627998e-05, "loss": 0.2715, "step": 8068 }, { "epoch": 5.343708609271523, "grad_norm": 0.8324831073256408, "learning_rate": 1.4349924422053799e-05, "loss": 0.2773, "step": 8069 }, { "epoch": 5.344370860927152, "grad_norm": 0.6958338353786345, "learning_rate": 1.4321448200749708e-05, "loss": 0.2129, "step": 8070 }, { "epoch": 5.345033112582781, "grad_norm": 0.7677448410136595, "learning_rate": 1.429299884535438e-05, "loss": 0.2441, "step": 8071 }, { "epoch": 5.34569536423841, "grad_norm": 0.7640324615076572, "learning_rate": 1.4264576361501146e-05, "loss": 0.2734, "step": 8072 }, { "epoch": 5.34635761589404, "grad_norm": 0.8297230975019833, "learning_rate": 1.4236180754817917e-05, "loss": 0.2754, "step": 8073 }, { "epoch": 5.347019867549669, "grad_norm": 0.709690343838788, "learning_rate": 1.4207812030927396e-05, "loss": 0.2578, "step": 8074 }, { "epoch": 5.347682119205298, "grad_norm": 0.7516017784484753, "learning_rate": 1.4179470195446891e-05, "loss": 0.252, "step": 8075 }, { "epoch": 5.348344370860927, "grad_norm": 0.6466929509174333, "learning_rate": 1.415115525398841e-05, "loss": 0.1982, "step": 8076 }, { "epoch": 5.349006622516557, "grad_norm": 0.7399392444832139, "learning_rate": 1.4122867212158673e-05, "loss": 0.2246, "step": 8077 }, { "epoch": 5.349668874172186, "grad_norm": 0.6584902328451135, "learning_rate": 1.4094606075558978e-05, "loss": 0.1934, "step": 8078 }, { "epoch": 5.350331125827815, "grad_norm": 0.7382598899854903, "learning_rate": 1.4066371849785385e-05, "loss": 0.2393, "step": 8079 }, { "epoch": 5.3509933774834435, "grad_norm": 0.8728424072823019, "learning_rate": 1.4038164540428637e-05, "loss": 0.2812, "step": 8080 }, { "epoch": 5.3516556291390724, "grad_norm": 0.725310618826158, "learning_rate": 1.4009984153074033e-05, "loss": 0.2139, "step": 8081 }, { "epoch": 5.352317880794702, "grad_norm": 0.6834211838119204, "learning_rate": 1.3981830693301644e-05, "loss": 0.207, "step": 8082 }, { "epoch": 5.352980132450331, "grad_norm": 0.7524304254416659, "learning_rate": 1.3953704166686158e-05, "loss": 0.252, "step": 8083 }, { "epoch": 5.35364238410596, "grad_norm": 0.7588940096867658, "learning_rate": 1.392560457879699e-05, "loss": 0.2617, "step": 8084 }, { "epoch": 5.354304635761589, "grad_norm": 0.701281907391126, "learning_rate": 1.3897531935198137e-05, "loss": 0.2451, "step": 8085 }, { "epoch": 5.354966887417219, "grad_norm": 0.676827253926598, "learning_rate": 1.3869486241448353e-05, "loss": 0.2305, "step": 8086 }, { "epoch": 5.355629139072848, "grad_norm": 0.7721843679891223, "learning_rate": 1.3841467503100944e-05, "loss": 0.2461, "step": 8087 }, { "epoch": 5.356291390728477, "grad_norm": 0.7334716276024691, "learning_rate": 1.3813475725703993e-05, "loss": 0.2734, "step": 8088 }, { "epoch": 5.356953642384106, "grad_norm": 0.7065869744921625, "learning_rate": 1.3785510914800197e-05, "loss": 0.2334, "step": 8089 }, { "epoch": 5.357615894039735, "grad_norm": 0.7793203996817449, "learning_rate": 1.3757573075926876e-05, "loss": 0.2539, "step": 8090 }, { "epoch": 5.3582781456953645, "grad_norm": 0.7959420095873805, "learning_rate": 1.3729662214616005e-05, "loss": 0.2637, "step": 8091 }, { "epoch": 5.3589403973509935, "grad_norm": 0.841821715067985, "learning_rate": 1.3701778336394364e-05, "loss": 0.2754, "step": 8092 }, { "epoch": 5.359602649006622, "grad_norm": 0.7939298880862516, "learning_rate": 1.367392144678322e-05, "loss": 0.2812, "step": 8093 }, { "epoch": 5.360264900662251, "grad_norm": 0.6817358925241885, "learning_rate": 1.3646091551298527e-05, "loss": 0.2148, "step": 8094 }, { "epoch": 5.360927152317881, "grad_norm": 0.6180326333111245, "learning_rate": 1.3618288655450942e-05, "loss": 0.1943, "step": 8095 }, { "epoch": 5.36158940397351, "grad_norm": 0.7149430222148255, "learning_rate": 1.3590512764745815e-05, "loss": 0.2422, "step": 8096 }, { "epoch": 5.362251655629139, "grad_norm": 0.6675901496507601, "learning_rate": 1.3562763884683008e-05, "loss": 0.2188, "step": 8097 }, { "epoch": 5.362913907284768, "grad_norm": 0.7210705471617311, "learning_rate": 1.3535042020757181e-05, "loss": 0.2383, "step": 8098 }, { "epoch": 5.363576158940397, "grad_norm": 0.7384161907382689, "learning_rate": 1.350734717845754e-05, "loss": 0.2539, "step": 8099 }, { "epoch": 5.364238410596027, "grad_norm": 0.7625919990337888, "learning_rate": 1.347967936326798e-05, "loss": 0.2832, "step": 8100 }, { "epoch": 5.364900662251656, "grad_norm": 0.7663930695629941, "learning_rate": 1.3452038580667091e-05, "loss": 0.252, "step": 8101 }, { "epoch": 5.365562913907285, "grad_norm": 0.7950490433041154, "learning_rate": 1.3424424836128022e-05, "loss": 0.25, "step": 8102 }, { "epoch": 5.366225165562914, "grad_norm": 0.7654606075112844, "learning_rate": 1.339683813511862e-05, "loss": 0.2441, "step": 8103 }, { "epoch": 5.3668874172185435, "grad_norm": 0.8597960791885009, "learning_rate": 1.3369278483101348e-05, "loss": 0.2949, "step": 8104 }, { "epoch": 5.367549668874172, "grad_norm": 0.7391333628468434, "learning_rate": 1.3341745885533406e-05, "loss": 0.2305, "step": 8105 }, { "epoch": 5.368211920529801, "grad_norm": 0.6865011956080486, "learning_rate": 1.3314240347866467e-05, "loss": 0.1992, "step": 8106 }, { "epoch": 5.36887417218543, "grad_norm": 0.8469854378389177, "learning_rate": 1.3286761875547025e-05, "loss": 0.3145, "step": 8107 }, { "epoch": 5.369536423841059, "grad_norm": 0.73542903618696, "learning_rate": 1.3259310474016077e-05, "loss": 0.2422, "step": 8108 }, { "epoch": 5.370198675496689, "grad_norm": 0.7121697340215851, "learning_rate": 1.3231886148709325e-05, "loss": 0.208, "step": 8109 }, { "epoch": 5.370860927152318, "grad_norm": 0.7611788184036535, "learning_rate": 1.320448890505716e-05, "loss": 0.2734, "step": 8110 }, { "epoch": 5.371523178807947, "grad_norm": 0.6641738081753786, "learning_rate": 1.3177118748484473e-05, "loss": 0.2051, "step": 8111 }, { "epoch": 5.372185430463576, "grad_norm": 0.7371043570499014, "learning_rate": 1.3149775684410862e-05, "loss": 0.2217, "step": 8112 }, { "epoch": 5.372847682119206, "grad_norm": 0.7235410944515224, "learning_rate": 1.312245971825065e-05, "loss": 0.2393, "step": 8113 }, { "epoch": 5.373509933774835, "grad_norm": 0.7146664462154936, "learning_rate": 1.3095170855412657e-05, "loss": 0.2441, "step": 8114 }, { "epoch": 5.374172185430464, "grad_norm": 0.7118476218223516, "learning_rate": 1.3067909101300377e-05, "loss": 0.2129, "step": 8115 }, { "epoch": 5.3748344370860925, "grad_norm": 0.7776591769575657, "learning_rate": 1.3040674461311979e-05, "loss": 0.2656, "step": 8116 }, { "epoch": 5.3754966887417215, "grad_norm": 0.662157128904447, "learning_rate": 1.3013466940840229e-05, "loss": 0.2217, "step": 8117 }, { "epoch": 5.376158940397351, "grad_norm": 0.8653796429167776, "learning_rate": 1.2986286545272518e-05, "loss": 0.2949, "step": 8118 }, { "epoch": 5.37682119205298, "grad_norm": 0.7816704252931724, "learning_rate": 1.295913327999089e-05, "loss": 0.3008, "step": 8119 }, { "epoch": 5.377483443708609, "grad_norm": 0.7293748306525842, "learning_rate": 1.2932007150371982e-05, "loss": 0.208, "step": 8120 }, { "epoch": 5.378145695364238, "grad_norm": 0.8902552451990786, "learning_rate": 1.2904908161787126e-05, "loss": 0.3301, "step": 8121 }, { "epoch": 5.378807947019867, "grad_norm": 0.7174210207758523, "learning_rate": 1.2877836319602164e-05, "loss": 0.2441, "step": 8122 }, { "epoch": 5.379470198675497, "grad_norm": 0.750952672623268, "learning_rate": 1.2850791629177675e-05, "loss": 0.2832, "step": 8123 }, { "epoch": 5.380132450331126, "grad_norm": 0.6931268188939322, "learning_rate": 1.2823774095868795e-05, "loss": 0.2275, "step": 8124 }, { "epoch": 5.380794701986755, "grad_norm": 0.8765104909900238, "learning_rate": 1.2796783725025323e-05, "loss": 0.2754, "step": 8125 }, { "epoch": 5.381456953642384, "grad_norm": 0.7339528779960038, "learning_rate": 1.2769820521991686e-05, "loss": 0.2295, "step": 8126 }, { "epoch": 5.382119205298014, "grad_norm": 0.7796513420193587, "learning_rate": 1.2742884492106865e-05, "loss": 0.2236, "step": 8127 }, { "epoch": 5.3827814569536425, "grad_norm": 0.8350439485368039, "learning_rate": 1.271597564070449e-05, "loss": 0.2891, "step": 8128 }, { "epoch": 5.3834437086092715, "grad_norm": 0.7954422380292928, "learning_rate": 1.2689093973112852e-05, "loss": 0.2227, "step": 8129 }, { "epoch": 5.3841059602649, "grad_norm": 0.8668144498166878, "learning_rate": 1.2662239494654858e-05, "loss": 0.2578, "step": 8130 }, { "epoch": 5.38476821192053, "grad_norm": 0.5885514608931014, "learning_rate": 1.2635412210647938e-05, "loss": 0.1689, "step": 8131 }, { "epoch": 5.385430463576159, "grad_norm": 0.9297808001075322, "learning_rate": 1.2608612126404244e-05, "loss": 0.3105, "step": 8132 }, { "epoch": 5.386092715231788, "grad_norm": 0.7160444303368904, "learning_rate": 1.2581839247230496e-05, "loss": 0.2393, "step": 8133 }, { "epoch": 5.386754966887417, "grad_norm": 0.77735727113849, "learning_rate": 1.2555093578428022e-05, "loss": 0.2451, "step": 8134 }, { "epoch": 5.387417218543046, "grad_norm": 0.7941862836734495, "learning_rate": 1.2528375125292783e-05, "loss": 0.2695, "step": 8135 }, { "epoch": 5.388079470198676, "grad_norm": 0.9282686334098263, "learning_rate": 1.2501683893115317e-05, "loss": 0.3184, "step": 8136 }, { "epoch": 5.388741721854305, "grad_norm": 0.7680855118564253, "learning_rate": 1.2475019887180793e-05, "loss": 0.2412, "step": 8137 }, { "epoch": 5.389403973509934, "grad_norm": 0.7090166065141378, "learning_rate": 1.244838311276904e-05, "loss": 0.2148, "step": 8138 }, { "epoch": 5.390066225165563, "grad_norm": 0.7411006042308812, "learning_rate": 1.2421773575154404e-05, "loss": 0.249, "step": 8139 }, { "epoch": 5.390728476821192, "grad_norm": 0.705245131197477, "learning_rate": 1.2395191279605854e-05, "loss": 0.2314, "step": 8140 }, { "epoch": 5.391390728476821, "grad_norm": 0.8648090928323934, "learning_rate": 1.2368636231387013e-05, "loss": 0.2891, "step": 8141 }, { "epoch": 5.39205298013245, "grad_norm": 0.9133575883761696, "learning_rate": 1.2342108435756127e-05, "loss": 0.3066, "step": 8142 }, { "epoch": 5.392715231788079, "grad_norm": 0.8290083753182598, "learning_rate": 1.231560789796594e-05, "loss": 0.293, "step": 8143 }, { "epoch": 5.393377483443708, "grad_norm": 0.7248293484836095, "learning_rate": 1.2289134623263907e-05, "loss": 0.249, "step": 8144 }, { "epoch": 5.394039735099338, "grad_norm": 0.7134182883441372, "learning_rate": 1.226268861689202e-05, "loss": 0.2188, "step": 8145 }, { "epoch": 5.394701986754967, "grad_norm": 0.7024244159127842, "learning_rate": 1.223626988408687e-05, "loss": 0.2236, "step": 8146 }, { "epoch": 5.395364238410596, "grad_norm": 0.7117994170427495, "learning_rate": 1.2209878430079723e-05, "loss": 0.2266, "step": 8147 }, { "epoch": 5.396026490066225, "grad_norm": 0.9054738899100201, "learning_rate": 1.2183514260096367e-05, "loss": 0.2891, "step": 8148 }, { "epoch": 5.396688741721855, "grad_norm": 0.7010688716628896, "learning_rate": 1.2157177379357141e-05, "loss": 0.2217, "step": 8149 }, { "epoch": 5.397350993377484, "grad_norm": 0.752388869843788, "learning_rate": 1.213086779307716e-05, "loss": 0.2773, "step": 8150 }, { "epoch": 5.398013245033113, "grad_norm": 0.8115645998945585, "learning_rate": 1.2104585506465987e-05, "loss": 0.2734, "step": 8151 }, { "epoch": 5.398675496688742, "grad_norm": 0.802017631925854, "learning_rate": 1.2078330524727774e-05, "loss": 0.3105, "step": 8152 }, { "epoch": 5.3993377483443705, "grad_norm": 0.8211867609746558, "learning_rate": 1.2052102853061318e-05, "loss": 0.2695, "step": 8153 }, { "epoch": 5.4, "grad_norm": 0.7795316870922536, "learning_rate": 1.202590249666006e-05, "loss": 0.2793, "step": 8154 }, { "epoch": 5.400662251655629, "grad_norm": 0.7850957469530484, "learning_rate": 1.1999729460711905e-05, "loss": 0.2695, "step": 8155 }, { "epoch": 5.401324503311258, "grad_norm": 0.8452925272713329, "learning_rate": 1.1973583750399468e-05, "loss": 0.3125, "step": 8156 }, { "epoch": 5.401986754966887, "grad_norm": 0.7576852496345284, "learning_rate": 1.1947465370899828e-05, "loss": 0.2578, "step": 8157 }, { "epoch": 5.402649006622516, "grad_norm": 0.7589360996698207, "learning_rate": 1.1921374327384781e-05, "loss": 0.25, "step": 8158 }, { "epoch": 5.403311258278146, "grad_norm": 0.7372719760609229, "learning_rate": 1.189531062502066e-05, "loss": 0.2197, "step": 8159 }, { "epoch": 5.403973509933775, "grad_norm": 0.7028243003358164, "learning_rate": 1.1869274268968354e-05, "loss": 0.2256, "step": 8160 }, { "epoch": 5.404635761589404, "grad_norm": 0.7227228402664468, "learning_rate": 1.1843265264383323e-05, "loss": 0.2031, "step": 8161 }, { "epoch": 5.405298013245033, "grad_norm": 0.7494359889569733, "learning_rate": 1.1817283616415696e-05, "loss": 0.2637, "step": 8162 }, { "epoch": 5.405960264900663, "grad_norm": 0.6085543777252429, "learning_rate": 1.1791329330210142e-05, "loss": 0.1943, "step": 8163 }, { "epoch": 5.406622516556292, "grad_norm": 0.7947274207666705, "learning_rate": 1.1765402410905882e-05, "loss": 0.2383, "step": 8164 }, { "epoch": 5.4072847682119205, "grad_norm": 0.7348735191153554, "learning_rate": 1.1739502863636762e-05, "loss": 0.2246, "step": 8165 }, { "epoch": 5.407947019867549, "grad_norm": 0.7447796731198978, "learning_rate": 1.1713630693531163e-05, "loss": 0.2578, "step": 8166 }, { "epoch": 5.408609271523179, "grad_norm": 0.6633076058683766, "learning_rate": 1.1687785905712082e-05, "loss": 0.2021, "step": 8167 }, { "epoch": 5.409271523178808, "grad_norm": 0.7755868988314782, "learning_rate": 1.1661968505297115e-05, "loss": 0.2334, "step": 8168 }, { "epoch": 5.409933774834437, "grad_norm": 0.7979223686511705, "learning_rate": 1.1636178497398386e-05, "loss": 0.2539, "step": 8169 }, { "epoch": 5.410596026490066, "grad_norm": 0.6340012719774182, "learning_rate": 1.1610415887122543e-05, "loss": 0.1914, "step": 8170 }, { "epoch": 5.411258278145695, "grad_norm": 0.7629744640947688, "learning_rate": 1.1584680679571006e-05, "loss": 0.2354, "step": 8171 }, { "epoch": 5.411920529801325, "grad_norm": 0.6710205472309697, "learning_rate": 1.155897287983955e-05, "loss": 0.207, "step": 8172 }, { "epoch": 5.412582781456954, "grad_norm": 0.6640371554223963, "learning_rate": 1.1533292493018637e-05, "loss": 0.2051, "step": 8173 }, { "epoch": 5.413245033112583, "grad_norm": 0.7383189576535907, "learning_rate": 1.1507639524193263e-05, "loss": 0.2422, "step": 8174 }, { "epoch": 5.413907284768212, "grad_norm": 0.7961543215425335, "learning_rate": 1.1482013978443067e-05, "loss": 0.2832, "step": 8175 }, { "epoch": 5.414569536423841, "grad_norm": 0.822293768376884, "learning_rate": 1.1456415860842122e-05, "loss": 0.2734, "step": 8176 }, { "epoch": 5.4152317880794705, "grad_norm": 0.8304980819465055, "learning_rate": 1.1430845176459208e-05, "loss": 0.2471, "step": 8177 }, { "epoch": 5.415894039735099, "grad_norm": 0.6869328437647758, "learning_rate": 1.1405301930357574e-05, "loss": 0.1953, "step": 8178 }, { "epoch": 5.416556291390728, "grad_norm": 0.8053336974629604, "learning_rate": 1.1379786127595125e-05, "loss": 0.2812, "step": 8179 }, { "epoch": 5.417218543046357, "grad_norm": 0.7645932450540927, "learning_rate": 1.1354297773224218e-05, "loss": 0.249, "step": 8180 }, { "epoch": 5.417880794701987, "grad_norm": 0.789999939534861, "learning_rate": 1.1328836872291903e-05, "loss": 0.2559, "step": 8181 }, { "epoch": 5.418543046357616, "grad_norm": 0.780082624630594, "learning_rate": 1.1303403429839675e-05, "loss": 0.2539, "step": 8182 }, { "epoch": 5.419205298013245, "grad_norm": 0.7400716013208438, "learning_rate": 1.1277997450903676e-05, "loss": 0.2236, "step": 8183 }, { "epoch": 5.419867549668874, "grad_norm": 0.8470551630512101, "learning_rate": 1.1252618940514596e-05, "loss": 0.3008, "step": 8184 }, { "epoch": 5.420529801324503, "grad_norm": 0.8115084483960443, "learning_rate": 1.122726790369765e-05, "loss": 0.25, "step": 8185 }, { "epoch": 5.421192052980133, "grad_norm": 0.8175666513983704, "learning_rate": 1.1201944345472608e-05, "loss": 0.2852, "step": 8186 }, { "epoch": 5.421854304635762, "grad_norm": 0.7363014576660762, "learning_rate": 1.1176648270853873e-05, "loss": 0.2314, "step": 8187 }, { "epoch": 5.422516556291391, "grad_norm": 0.6597580004001037, "learning_rate": 1.1151379684850353e-05, "loss": 0.2393, "step": 8188 }, { "epoch": 5.42317880794702, "grad_norm": 0.6665124518440448, "learning_rate": 1.1126138592465483e-05, "loss": 0.2236, "step": 8189 }, { "epoch": 5.423841059602649, "grad_norm": 0.7395893595479944, "learning_rate": 1.1100924998697296e-05, "loss": 0.2148, "step": 8190 }, { "epoch": 5.424503311258278, "grad_norm": 0.8222924520485109, "learning_rate": 1.1075738908538417e-05, "loss": 0.291, "step": 8191 }, { "epoch": 5.425165562913907, "grad_norm": 0.8495959298384754, "learning_rate": 1.1050580326975921e-05, "loss": 0.2373, "step": 8192 }, { "epoch": 5.425827814569536, "grad_norm": 0.8384429757339605, "learning_rate": 1.1025449258991559e-05, "loss": 0.2578, "step": 8193 }, { "epoch": 5.426490066225165, "grad_norm": 0.7562773819474549, "learning_rate": 1.1000345709561497e-05, "loss": 0.2031, "step": 8194 }, { "epoch": 5.427152317880795, "grad_norm": 0.6754742488972901, "learning_rate": 1.0975269683656578e-05, "loss": 0.2061, "step": 8195 }, { "epoch": 5.427814569536424, "grad_norm": 0.7575133685163968, "learning_rate": 1.0950221186242147e-05, "loss": 0.2354, "step": 8196 }, { "epoch": 5.428476821192053, "grad_norm": 0.7135852000437264, "learning_rate": 1.092520022227807e-05, "loss": 0.2305, "step": 8197 }, { "epoch": 5.429139072847682, "grad_norm": 0.7543585159134615, "learning_rate": 1.0900206796718764e-05, "loss": 0.2324, "step": 8198 }, { "epoch": 5.429801324503312, "grad_norm": 0.8129481972683335, "learning_rate": 1.087524091451324e-05, "loss": 0.25, "step": 8199 }, { "epoch": 5.430463576158941, "grad_norm": 0.8077685259747593, "learning_rate": 1.0850302580605058e-05, "loss": 0.2295, "step": 8200 }, { "epoch": 5.4311258278145695, "grad_norm": 0.7457859115731689, "learning_rate": 1.0825391799932237e-05, "loss": 0.2539, "step": 8201 }, { "epoch": 5.4317880794701985, "grad_norm": 0.7500849211638266, "learning_rate": 1.0800508577427447e-05, "loss": 0.25, "step": 8202 }, { "epoch": 5.432450331125827, "grad_norm": 0.6969300558331686, "learning_rate": 1.0775652918017796e-05, "loss": 0.208, "step": 8203 }, { "epoch": 5.433112582781457, "grad_norm": 0.7941004332415325, "learning_rate": 1.0750824826625031e-05, "loss": 0.2695, "step": 8204 }, { "epoch": 5.433774834437086, "grad_norm": 0.7423193525808635, "learning_rate": 1.072602430816542e-05, "loss": 0.2412, "step": 8205 }, { "epoch": 5.434437086092715, "grad_norm": 0.7283960495069702, "learning_rate": 1.0701251367549718e-05, "loss": 0.2461, "step": 8206 }, { "epoch": 5.435099337748344, "grad_norm": 0.8535558556368544, "learning_rate": 1.06765060096832e-05, "loss": 0.2734, "step": 8207 }, { "epoch": 5.435761589403974, "grad_norm": 0.7525491434304608, "learning_rate": 1.065178823946583e-05, "loss": 0.2422, "step": 8208 }, { "epoch": 5.436423841059603, "grad_norm": 0.7630745567166811, "learning_rate": 1.0627098061791978e-05, "loss": 0.2559, "step": 8209 }, { "epoch": 5.437086092715232, "grad_norm": 0.8634963924755517, "learning_rate": 1.060243548155053e-05, "loss": 0.2773, "step": 8210 }, { "epoch": 5.437748344370861, "grad_norm": 0.7700201159252429, "learning_rate": 1.0577800503625e-05, "loss": 0.2695, "step": 8211 }, { "epoch": 5.43841059602649, "grad_norm": 0.782024996803962, "learning_rate": 1.0553193132893417e-05, "loss": 0.2695, "step": 8212 }, { "epoch": 5.4390728476821195, "grad_norm": 0.837352402992951, "learning_rate": 1.0528613374228284e-05, "loss": 0.2871, "step": 8213 }, { "epoch": 5.4397350993377485, "grad_norm": 0.8052086983769423, "learning_rate": 1.0504061232496709e-05, "loss": 0.2832, "step": 8214 }, { "epoch": 5.440397350993377, "grad_norm": 0.910522628102496, "learning_rate": 1.047953671256025e-05, "loss": 0.3027, "step": 8215 }, { "epoch": 5.441059602649006, "grad_norm": 0.6726072249008708, "learning_rate": 1.0455039819275074e-05, "loss": 0.2197, "step": 8216 }, { "epoch": 5.441721854304636, "grad_norm": 0.6823027367476258, "learning_rate": 1.0430570557491869e-05, "loss": 0.249, "step": 8217 }, { "epoch": 5.442384105960265, "grad_norm": 0.7412709353727522, "learning_rate": 1.0406128932055807e-05, "loss": 0.2246, "step": 8218 }, { "epoch": 5.443046357615894, "grad_norm": 0.7663270401544181, "learning_rate": 1.0381714947806586e-05, "loss": 0.2539, "step": 8219 }, { "epoch": 5.443708609271523, "grad_norm": 0.8462294760811713, "learning_rate": 1.0357328609578468e-05, "loss": 0.2812, "step": 8220 }, { "epoch": 5.444370860927152, "grad_norm": 0.7292670055148073, "learning_rate": 1.0332969922200263e-05, "loss": 0.2227, "step": 8221 }, { "epoch": 5.445033112582782, "grad_norm": 0.8815330380510904, "learning_rate": 1.0308638890495225e-05, "loss": 0.3145, "step": 8222 }, { "epoch": 5.445695364238411, "grad_norm": 0.8436743622481963, "learning_rate": 1.0284335519281217e-05, "loss": 0.2637, "step": 8223 }, { "epoch": 5.44635761589404, "grad_norm": 0.6737165412135997, "learning_rate": 1.0260059813370541e-05, "loss": 0.2344, "step": 8224 }, { "epoch": 5.447019867549669, "grad_norm": 0.7995762922951847, "learning_rate": 1.0235811777570086e-05, "loss": 0.2754, "step": 8225 }, { "epoch": 5.447682119205298, "grad_norm": 0.8482090119796771, "learning_rate": 1.021159141668126e-05, "loss": 0.3027, "step": 8226 }, { "epoch": 5.448344370860927, "grad_norm": 0.7992928107368851, "learning_rate": 1.0187398735499941e-05, "loss": 0.2637, "step": 8227 }, { "epoch": 5.449006622516556, "grad_norm": 0.8022607478377197, "learning_rate": 1.0163233738816568e-05, "loss": 0.2832, "step": 8228 }, { "epoch": 5.449668874172185, "grad_norm": 0.7048177652434733, "learning_rate": 1.0139096431416126e-05, "loss": 0.2207, "step": 8229 }, { "epoch": 5.450331125827814, "grad_norm": 0.7424175924202886, "learning_rate": 1.011498681807803e-05, "loss": 0.2256, "step": 8230 }, { "epoch": 5.450993377483444, "grad_norm": 0.8678018402148715, "learning_rate": 1.0090904903576258e-05, "loss": 0.2715, "step": 8231 }, { "epoch": 5.451655629139073, "grad_norm": 0.705198379446596, "learning_rate": 1.0066850692679312e-05, "loss": 0.2295, "step": 8232 }, { "epoch": 5.452317880794702, "grad_norm": 0.724380239751693, "learning_rate": 1.0042824190150234e-05, "loss": 0.249, "step": 8233 }, { "epoch": 5.452980132450331, "grad_norm": 0.76509104105022, "learning_rate": 1.0018825400746517e-05, "loss": 0.2393, "step": 8234 }, { "epoch": 5.45364238410596, "grad_norm": 0.6892756107253529, "learning_rate": 9.99485432922021e-06, "loss": 0.2334, "step": 8235 }, { "epoch": 5.45430463576159, "grad_norm": 0.8040562507015419, "learning_rate": 9.97091098031783e-06, "loss": 0.249, "step": 8236 }, { "epoch": 5.454966887417219, "grad_norm": 0.7575597080649262, "learning_rate": 9.946995358780474e-06, "loss": 0.2314, "step": 8237 }, { "epoch": 5.4556291390728475, "grad_norm": 0.718341691814952, "learning_rate": 9.923107469343683e-06, "loss": 0.249, "step": 8238 }, { "epoch": 5.4562913907284765, "grad_norm": 0.6796712610276961, "learning_rate": 9.899247316737557e-06, "loss": 0.2197, "step": 8239 }, { "epoch": 5.456953642384106, "grad_norm": 0.8364442650831285, "learning_rate": 9.875414905686652e-06, "loss": 0.2969, "step": 8240 }, { "epoch": 5.457615894039735, "grad_norm": 0.6750954248636425, "learning_rate": 9.851610240910058e-06, "loss": 0.207, "step": 8241 }, { "epoch": 5.458278145695364, "grad_norm": 0.6982925630966095, "learning_rate": 9.82783332712142e-06, "loss": 0.2324, "step": 8242 }, { "epoch": 5.458940397350993, "grad_norm": 0.7567891967997111, "learning_rate": 9.804084169028792e-06, "loss": 0.2617, "step": 8243 }, { "epoch": 5.459602649006623, "grad_norm": 0.9282080768713186, "learning_rate": 9.78036277133476e-06, "loss": 0.3555, "step": 8244 }, { "epoch": 5.460264900662252, "grad_norm": 0.698834967713629, "learning_rate": 9.7566691387365e-06, "loss": 0.2148, "step": 8245 }, { "epoch": 5.460927152317881, "grad_norm": 0.7101774292543698, "learning_rate": 9.733003275925594e-06, "loss": 0.2275, "step": 8246 }, { "epoch": 5.46158940397351, "grad_norm": 0.7805355081769234, "learning_rate": 9.709365187588124e-06, "loss": 0.252, "step": 8247 }, { "epoch": 5.462251655629139, "grad_norm": 0.8274963832791492, "learning_rate": 9.685754878404717e-06, "loss": 0.2832, "step": 8248 }, { "epoch": 5.4629139072847686, "grad_norm": 0.764600385943334, "learning_rate": 9.66217235305053e-06, "loss": 0.2676, "step": 8249 }, { "epoch": 5.4635761589403975, "grad_norm": 0.7636334168787373, "learning_rate": 9.63861761619511e-06, "loss": 0.2236, "step": 8250 }, { "epoch": 5.464238410596026, "grad_norm": 0.9644969940582598, "learning_rate": 9.615090672502613e-06, "loss": 0.3262, "step": 8251 }, { "epoch": 5.464900662251655, "grad_norm": 0.6995941733866757, "learning_rate": 9.591591526631608e-06, "loss": 0.2236, "step": 8252 }, { "epoch": 5.465562913907284, "grad_norm": 0.7917754617393823, "learning_rate": 9.568120183235211e-06, "loss": 0.3047, "step": 8253 }, { "epoch": 5.466225165562914, "grad_norm": 0.6729855738696939, "learning_rate": 9.544676646961036e-06, "loss": 0.208, "step": 8254 }, { "epoch": 5.466887417218543, "grad_norm": 0.7366912523378247, "learning_rate": 9.521260922451152e-06, "loss": 0.2197, "step": 8255 }, { "epoch": 5.467549668874172, "grad_norm": 0.7243111905019703, "learning_rate": 9.497873014342116e-06, "loss": 0.21, "step": 8256 }, { "epoch": 5.468211920529801, "grad_norm": 0.7604047882262971, "learning_rate": 9.474512927265026e-06, "loss": 0.2754, "step": 8257 }, { "epoch": 5.468874172185431, "grad_norm": 0.7472783503399074, "learning_rate": 9.451180665845482e-06, "loss": 0.252, "step": 8258 }, { "epoch": 5.46953642384106, "grad_norm": 0.6986079764339138, "learning_rate": 9.42787623470347e-06, "loss": 0.2295, "step": 8259 }, { "epoch": 5.470198675496689, "grad_norm": 0.7558842667525061, "learning_rate": 9.404599638453597e-06, "loss": 0.2305, "step": 8260 }, { "epoch": 5.470860927152318, "grad_norm": 0.8514022457802254, "learning_rate": 9.381350881704863e-06, "loss": 0.3281, "step": 8261 }, { "epoch": 5.4715231788079475, "grad_norm": 0.8199458953577055, "learning_rate": 9.358129969060768e-06, "loss": 0.2559, "step": 8262 }, { "epoch": 5.472185430463576, "grad_norm": 0.7727945978518875, "learning_rate": 9.334936905119383e-06, "loss": 0.248, "step": 8263 }, { "epoch": 5.472847682119205, "grad_norm": 0.6898482664036545, "learning_rate": 9.311771694473169e-06, "loss": 0.2285, "step": 8264 }, { "epoch": 5.473509933774834, "grad_norm": 0.7728471369233042, "learning_rate": 9.28863434170904e-06, "loss": 0.2324, "step": 8265 }, { "epoch": 5.474172185430463, "grad_norm": 0.6800552107777947, "learning_rate": 9.265524851408562e-06, "loss": 0.2236, "step": 8266 }, { "epoch": 5.474834437086093, "grad_norm": 0.6416980743440165, "learning_rate": 9.242443228147627e-06, "loss": 0.1855, "step": 8267 }, { "epoch": 5.475496688741722, "grad_norm": 0.7395458056646712, "learning_rate": 9.219389476496663e-06, "loss": 0.2656, "step": 8268 }, { "epoch": 5.476158940397351, "grad_norm": 0.7252764856930516, "learning_rate": 9.196363601020567e-06, "loss": 0.2354, "step": 8269 }, { "epoch": 5.47682119205298, "grad_norm": 0.7981120496073488, "learning_rate": 9.17336560627876e-06, "loss": 0.2275, "step": 8270 }, { "epoch": 5.477483443708609, "grad_norm": 0.6849213180202682, "learning_rate": 9.150395496825064e-06, "loss": 0.2178, "step": 8271 }, { "epoch": 5.478145695364239, "grad_norm": 0.7742025613850498, "learning_rate": 9.127453277207875e-06, "loss": 0.25, "step": 8272 }, { "epoch": 5.478807947019868, "grad_norm": 0.8014910574622097, "learning_rate": 9.104538951969958e-06, "loss": 0.2852, "step": 8273 }, { "epoch": 5.479470198675497, "grad_norm": 0.8294948101863496, "learning_rate": 9.081652525648652e-06, "loss": 0.2832, "step": 8274 }, { "epoch": 5.4801324503311255, "grad_norm": 0.8209309893961193, "learning_rate": 9.058794002775733e-06, "loss": 0.3027, "step": 8275 }, { "epoch": 5.480794701986755, "grad_norm": 1.1535190935514115, "learning_rate": 9.035963387877442e-06, "loss": 0.4082, "step": 8276 }, { "epoch": 5.481456953642384, "grad_norm": 0.7032787783811938, "learning_rate": 9.013160685474485e-06, "loss": 0.2285, "step": 8277 }, { "epoch": 5.482119205298013, "grad_norm": 0.7839174397945688, "learning_rate": 8.990385900082064e-06, "loss": 0.2598, "step": 8278 }, { "epoch": 5.482781456953642, "grad_norm": 0.706368724884191, "learning_rate": 8.96763903620989e-06, "loss": 0.2363, "step": 8279 }, { "epoch": 5.483443708609272, "grad_norm": 0.7127085294222835, "learning_rate": 8.944920098362041e-06, "loss": 0.2354, "step": 8280 }, { "epoch": 5.484105960264901, "grad_norm": 0.6614521180400365, "learning_rate": 8.92222909103717e-06, "loss": 0.2129, "step": 8281 }, { "epoch": 5.48476821192053, "grad_norm": 0.781839744938468, "learning_rate": 8.89956601872835e-06, "loss": 0.2891, "step": 8282 }, { "epoch": 5.485430463576159, "grad_norm": 0.7650277271382648, "learning_rate": 8.876930885923106e-06, "loss": 0.2422, "step": 8283 }, { "epoch": 5.486092715231788, "grad_norm": 0.759510725817291, "learning_rate": 8.854323697103488e-06, "loss": 0.2451, "step": 8284 }, { "epoch": 5.486754966887418, "grad_norm": 0.8120853996743709, "learning_rate": 8.831744456745948e-06, "loss": 0.2539, "step": 8285 }, { "epoch": 5.4874172185430465, "grad_norm": 0.8713494720344912, "learning_rate": 8.809193169321427e-06, "loss": 0.2578, "step": 8286 }, { "epoch": 5.4880794701986755, "grad_norm": 0.7227020010770573, "learning_rate": 8.786669839295402e-06, "loss": 0.2178, "step": 8287 }, { "epoch": 5.488741721854304, "grad_norm": 0.7542463220051343, "learning_rate": 8.764174471127688e-06, "loss": 0.2334, "step": 8288 }, { "epoch": 5.489403973509933, "grad_norm": 0.7999487423582928, "learning_rate": 8.74170706927264e-06, "loss": 0.2354, "step": 8289 }, { "epoch": 5.490066225165563, "grad_norm": 0.7383005138056044, "learning_rate": 8.71926763817905e-06, "loss": 0.2402, "step": 8290 }, { "epoch": 5.490728476821192, "grad_norm": 0.7455341099804799, "learning_rate": 8.69685618229023e-06, "loss": 0.2559, "step": 8291 }, { "epoch": 5.491390728476821, "grad_norm": 0.7420239301405779, "learning_rate": 8.674472706043878e-06, "loss": 0.2178, "step": 8292 }, { "epoch": 5.49205298013245, "grad_norm": 0.7148609906004662, "learning_rate": 8.652117213872151e-06, "loss": 0.2109, "step": 8293 }, { "epoch": 5.49271523178808, "grad_norm": 0.7844358463326505, "learning_rate": 8.629789710201707e-06, "loss": 0.249, "step": 8294 }, { "epoch": 5.493377483443709, "grad_norm": 0.6962563409073436, "learning_rate": 8.607490199453677e-06, "loss": 0.2461, "step": 8295 }, { "epoch": 5.494039735099338, "grad_norm": 0.806374167167584, "learning_rate": 8.585218686043593e-06, "loss": 0.2637, "step": 8296 }, { "epoch": 5.494701986754967, "grad_norm": 0.749256973432726, "learning_rate": 8.56297517438148e-06, "loss": 0.2617, "step": 8297 }, { "epoch": 5.495364238410596, "grad_norm": 0.7876949175835746, "learning_rate": 8.540759668871799e-06, "loss": 0.252, "step": 8298 }, { "epoch": 5.4960264900662255, "grad_norm": 0.7087124789169329, "learning_rate": 8.518572173913479e-06, "loss": 0.207, "step": 8299 }, { "epoch": 5.496688741721854, "grad_norm": 0.7974959260157949, "learning_rate": 8.496412693899907e-06, "loss": 0.2949, "step": 8300 }, { "epoch": 5.497350993377483, "grad_norm": 0.6467321618308874, "learning_rate": 8.474281233218922e-06, "loss": 0.1748, "step": 8301 }, { "epoch": 5.498013245033112, "grad_norm": 0.7707040606716175, "learning_rate": 8.452177796252752e-06, "loss": 0.2715, "step": 8302 }, { "epoch": 5.498675496688742, "grad_norm": 0.7919071808346391, "learning_rate": 8.430102387378195e-06, "loss": 0.2812, "step": 8303 }, { "epoch": 5.499337748344371, "grad_norm": 0.8710522382639634, "learning_rate": 8.408055010966424e-06, "loss": 0.3008, "step": 8304 }, { "epoch": 5.5, "grad_norm": 0.7726653714638038, "learning_rate": 8.386035671383045e-06, "loss": 0.2207, "step": 8305 }, { "epoch": 5.500662251655629, "grad_norm": 0.8368796364761532, "learning_rate": 8.364044372988155e-06, "loss": 0.2471, "step": 8306 }, { "epoch": 5.501324503311258, "grad_norm": 0.6616917380141908, "learning_rate": 8.342081120136284e-06, "loss": 0.1865, "step": 8307 }, { "epoch": 5.501986754966888, "grad_norm": 0.7452066452415348, "learning_rate": 8.320145917176408e-06, "loss": 0.2266, "step": 8308 }, { "epoch": 5.502649006622517, "grad_norm": 0.7257236661562294, "learning_rate": 8.298238768451948e-06, "loss": 0.2441, "step": 8309 }, { "epoch": 5.503311258278146, "grad_norm": 0.8034540985697016, "learning_rate": 8.276359678300754e-06, "loss": 0.2559, "step": 8310 }, { "epoch": 5.5039735099337745, "grad_norm": 0.6775550839967807, "learning_rate": 8.254508651055158e-06, "loss": 0.2129, "step": 8311 }, { "epoch": 5.5046357615894035, "grad_norm": 0.825451786072652, "learning_rate": 8.232685691041936e-06, "loss": 0.2793, "step": 8312 }, { "epoch": 5.505298013245033, "grad_norm": 0.7215712011001146, "learning_rate": 8.210890802582243e-06, "loss": 0.2354, "step": 8313 }, { "epoch": 5.505960264900662, "grad_norm": 0.7269564301420719, "learning_rate": 8.189123989991714e-06, "loss": 0.2227, "step": 8314 }, { "epoch": 5.506622516556291, "grad_norm": 0.7623588933700701, "learning_rate": 8.167385257580434e-06, "loss": 0.2422, "step": 8315 }, { "epoch": 5.50728476821192, "grad_norm": 0.8612237448598782, "learning_rate": 8.145674609652975e-06, "loss": 0.2773, "step": 8316 }, { "epoch": 5.50794701986755, "grad_norm": 0.8531345885322131, "learning_rate": 8.123992050508216e-06, "loss": 0.3262, "step": 8317 }, { "epoch": 5.508609271523179, "grad_norm": 0.6212988854206485, "learning_rate": 8.102337584439606e-06, "loss": 0.1855, "step": 8318 }, { "epoch": 5.509271523178808, "grad_norm": 0.7727103218675102, "learning_rate": 8.08071121573493e-06, "loss": 0.249, "step": 8319 }, { "epoch": 5.509933774834437, "grad_norm": 0.7302727176373264, "learning_rate": 8.059112948676482e-06, "loss": 0.2598, "step": 8320 }, { "epoch": 5.510596026490067, "grad_norm": 0.8979670971514583, "learning_rate": 8.037542787541008e-06, "loss": 0.3535, "step": 8321 }, { "epoch": 5.511258278145696, "grad_norm": 0.7491866413513846, "learning_rate": 8.016000736599588e-06, "loss": 0.2246, "step": 8322 }, { "epoch": 5.5119205298013245, "grad_norm": 0.8398765613626522, "learning_rate": 7.994486800117794e-06, "loss": 0.2988, "step": 8323 }, { "epoch": 5.5125827814569535, "grad_norm": 0.853552413315536, "learning_rate": 7.973000982355688e-06, "loss": 0.3047, "step": 8324 }, { "epoch": 5.513245033112582, "grad_norm": 0.7051913954437126, "learning_rate": 7.95154328756768e-06, "loss": 0.2227, "step": 8325 }, { "epoch": 5.513907284768212, "grad_norm": 0.6407479685054761, "learning_rate": 7.930113720002606e-06, "loss": 0.2021, "step": 8326 }, { "epoch": 5.514569536423841, "grad_norm": 0.773330342520804, "learning_rate": 7.90871228390379e-06, "loss": 0.252, "step": 8327 }, { "epoch": 5.51523178807947, "grad_norm": 0.8133827483817562, "learning_rate": 7.88733898350899e-06, "loss": 0.3086, "step": 8328 }, { "epoch": 5.515894039735099, "grad_norm": 0.8515957072000154, "learning_rate": 7.86599382305032e-06, "loss": 0.2832, "step": 8329 }, { "epoch": 5.516556291390728, "grad_norm": 0.8330364555146641, "learning_rate": 7.844676806754419e-06, "loss": 0.3125, "step": 8330 }, { "epoch": 5.517218543046358, "grad_norm": 0.8028675040663321, "learning_rate": 7.823387938842223e-06, "loss": 0.2715, "step": 8331 }, { "epoch": 5.517880794701987, "grad_norm": 0.6961876344455217, "learning_rate": 7.802127223529232e-06, "loss": 0.2041, "step": 8332 }, { "epoch": 5.518543046357616, "grad_norm": 0.7365859303159131, "learning_rate": 7.780894665025305e-06, "loss": 0.2168, "step": 8333 }, { "epoch": 5.519205298013245, "grad_norm": 0.7241149163549856, "learning_rate": 7.759690267534735e-06, "loss": 0.2373, "step": 8334 }, { "epoch": 5.5198675496688745, "grad_norm": 0.7949174588562877, "learning_rate": 7.738514035256177e-06, "loss": 0.2373, "step": 8335 }, { "epoch": 5.520529801324503, "grad_norm": 0.7526697155621349, "learning_rate": 7.71736597238286e-06, "loss": 0.249, "step": 8336 }, { "epoch": 5.521192052980132, "grad_norm": 0.648639742627535, "learning_rate": 7.696246083102303e-06, "loss": 0.2002, "step": 8337 }, { "epoch": 5.521854304635761, "grad_norm": 0.8102700019392715, "learning_rate": 7.675154371596442e-06, "loss": 0.2598, "step": 8338 }, { "epoch": 5.522516556291391, "grad_norm": 0.6620507102770588, "learning_rate": 7.654090842041737e-06, "loss": 0.1963, "step": 8339 }, { "epoch": 5.52317880794702, "grad_norm": 0.7161798959685081, "learning_rate": 7.633055498609004e-06, "loss": 0.2158, "step": 8340 }, { "epoch": 5.523841059602649, "grad_norm": 0.8038250774594444, "learning_rate": 7.612048345463439e-06, "loss": 0.2715, "step": 8341 }, { "epoch": 5.524503311258278, "grad_norm": 0.869101589851218, "learning_rate": 7.591069386764737e-06, "loss": 0.2734, "step": 8342 }, { "epoch": 5.525165562913907, "grad_norm": 0.6908105298538103, "learning_rate": 7.570118626666955e-06, "loss": 0.2158, "step": 8343 }, { "epoch": 5.525827814569537, "grad_norm": 0.8165604776930756, "learning_rate": 7.549196069318592e-06, "loss": 0.2734, "step": 8344 }, { "epoch": 5.526490066225166, "grad_norm": 0.7421269720817115, "learning_rate": 7.528301718862567e-06, "loss": 0.252, "step": 8345 }, { "epoch": 5.527152317880795, "grad_norm": 0.8052111551035224, "learning_rate": 7.507435579436189e-06, "loss": 0.252, "step": 8346 }, { "epoch": 5.527814569536424, "grad_norm": 0.6982969874923907, "learning_rate": 7.486597655171167e-06, "loss": 0.2275, "step": 8347 }, { "epoch": 5.5284768211920525, "grad_norm": 0.797358972529865, "learning_rate": 7.465787950193669e-06, "loss": 0.25, "step": 8348 }, { "epoch": 5.529139072847682, "grad_norm": 0.8572471956592532, "learning_rate": 7.445006468624282e-06, "loss": 0.3047, "step": 8349 }, { "epoch": 5.529801324503311, "grad_norm": 0.7365344931306134, "learning_rate": 7.424253214577963e-06, "loss": 0.2471, "step": 8350 }, { "epoch": 5.53046357615894, "grad_norm": 0.7437903744234947, "learning_rate": 7.40352819216406e-06, "loss": 0.2383, "step": 8351 }, { "epoch": 5.531125827814569, "grad_norm": 0.7695939616483454, "learning_rate": 7.382831405486389e-06, "loss": 0.2715, "step": 8352 }, { "epoch": 5.531788079470199, "grad_norm": 0.7950244074580723, "learning_rate": 7.362162858643156e-06, "loss": 0.2656, "step": 8353 }, { "epoch": 5.532450331125828, "grad_norm": 0.6624313644233896, "learning_rate": 7.34152255572697e-06, "loss": 0.1934, "step": 8354 }, { "epoch": 5.533112582781457, "grad_norm": 0.780890017609156, "learning_rate": 7.320910500824861e-06, "loss": 0.2305, "step": 8355 }, { "epoch": 5.533774834437086, "grad_norm": 0.6447238574903741, "learning_rate": 7.3003266980182e-06, "loss": 0.1992, "step": 8356 }, { "epoch": 5.534437086092716, "grad_norm": 0.814351305047172, "learning_rate": 7.2797711513828574e-06, "loss": 0.2402, "step": 8357 }, { "epoch": 5.535099337748345, "grad_norm": 0.7968039095791204, "learning_rate": 7.2592438649890945e-06, "loss": 0.2754, "step": 8358 }, { "epoch": 5.535761589403974, "grad_norm": 0.9427827728803346, "learning_rate": 7.238744842901478e-06, "loss": 0.2617, "step": 8359 }, { "epoch": 5.5364238410596025, "grad_norm": 0.787436588649743, "learning_rate": 7.218274089179094e-06, "loss": 0.2617, "step": 8360 }, { "epoch": 5.5370860927152314, "grad_norm": 0.8051572475828139, "learning_rate": 7.197831607875398e-06, "loss": 0.248, "step": 8361 }, { "epoch": 5.537748344370861, "grad_norm": 0.7091971882525859, "learning_rate": 7.177417403038221e-06, "loss": 0.2207, "step": 8362 }, { "epoch": 5.53841059602649, "grad_norm": 0.6777760101457878, "learning_rate": 7.157031478709779e-06, "loss": 0.2207, "step": 8363 }, { "epoch": 5.539072847682119, "grad_norm": 0.6780213178088493, "learning_rate": 7.13667383892676e-06, "loss": 0.2031, "step": 8364 }, { "epoch": 5.539735099337748, "grad_norm": 0.8566820225082039, "learning_rate": 7.116344487720205e-06, "loss": 0.3086, "step": 8365 }, { "epoch": 5.540397350993377, "grad_norm": 0.7164370790299112, "learning_rate": 7.096043429115528e-06, "loss": 0.2148, "step": 8366 }, { "epoch": 5.541059602649007, "grad_norm": 0.7029028451645084, "learning_rate": 7.075770667132613e-06, "loss": 0.2217, "step": 8367 }, { "epoch": 5.541721854304636, "grad_norm": 0.719559595144086, "learning_rate": 7.05552620578565e-06, "loss": 0.2285, "step": 8368 }, { "epoch": 5.542384105960265, "grad_norm": 0.6602198377859929, "learning_rate": 7.035310049083315e-06, "loss": 0.2188, "step": 8369 }, { "epoch": 5.543046357615894, "grad_norm": 0.748166217083447, "learning_rate": 7.0151222010286555e-06, "loss": 0.2734, "step": 8370 }, { "epoch": 5.5437086092715235, "grad_norm": 0.6793001286682699, "learning_rate": 6.994962665619059e-06, "loss": 0.2129, "step": 8371 }, { "epoch": 5.5443708609271525, "grad_norm": 0.7337748312306621, "learning_rate": 6.9748314468463295e-06, "loss": 0.2354, "step": 8372 }, { "epoch": 5.545033112582781, "grad_norm": 0.8885860920839272, "learning_rate": 6.954728548696698e-06, "loss": 0.3262, "step": 8373 }, { "epoch": 5.54569536423841, "grad_norm": 0.7650343173612224, "learning_rate": 6.934653975150811e-06, "loss": 0.25, "step": 8374 }, { "epoch": 5.54635761589404, "grad_norm": 0.7312496296099791, "learning_rate": 6.9146077301835916e-06, "loss": 0.2451, "step": 8375 }, { "epoch": 5.547019867549669, "grad_norm": 0.7947847552281614, "learning_rate": 6.894589817764478e-06, "loss": 0.2451, "step": 8376 }, { "epoch": 5.547682119205298, "grad_norm": 0.7900757998657023, "learning_rate": 6.874600241857237e-06, "loss": 0.252, "step": 8377 }, { "epoch": 5.548344370860927, "grad_norm": 0.684882118600078, "learning_rate": 6.8546390064200155e-06, "loss": 0.2256, "step": 8378 }, { "epoch": 5.549006622516556, "grad_norm": 0.8190141657866639, "learning_rate": 6.834706115405386e-06, "loss": 0.2754, "step": 8379 }, { "epoch": 5.549668874172186, "grad_norm": 0.7459821985111394, "learning_rate": 6.814801572760275e-06, "loss": 0.2314, "step": 8380 }, { "epoch": 5.550331125827815, "grad_norm": 0.789873779128214, "learning_rate": 6.794925382426014e-06, "loss": 0.2285, "step": 8381 }, { "epoch": 5.550993377483444, "grad_norm": 0.8127378337742824, "learning_rate": 6.775077548338353e-06, "loss": 0.25, "step": 8382 }, { "epoch": 5.551655629139073, "grad_norm": 0.6492678219688907, "learning_rate": 6.755258074427333e-06, "loss": 0.1943, "step": 8383 }, { "epoch": 5.552317880794702, "grad_norm": 0.7293835109097302, "learning_rate": 6.735466964617464e-06, "loss": 0.2285, "step": 8384 }, { "epoch": 5.552980132450331, "grad_norm": 0.7363112026655881, "learning_rate": 6.715704222827595e-06, "loss": 0.2539, "step": 8385 }, { "epoch": 5.55364238410596, "grad_norm": 0.8300986547597354, "learning_rate": 6.695969852971012e-06, "loss": 0.252, "step": 8386 }, { "epoch": 5.554304635761589, "grad_norm": 0.829444671322198, "learning_rate": 6.676263858955305e-06, "loss": 0.2432, "step": 8387 }, { "epoch": 5.554966887417218, "grad_norm": 0.778051796223688, "learning_rate": 6.656586244682538e-06, "loss": 0.2598, "step": 8388 }, { "epoch": 5.555629139072848, "grad_norm": 0.7601144399086871, "learning_rate": 6.636937014049043e-06, "loss": 0.2539, "step": 8389 }, { "epoch": 5.556291390728477, "grad_norm": 0.806625963453341, "learning_rate": 6.617316170945624e-06, "loss": 0.2832, "step": 8390 }, { "epoch": 5.556953642384106, "grad_norm": 0.8344953961753315, "learning_rate": 6.597723719257459e-06, "loss": 0.2969, "step": 8391 }, { "epoch": 5.557615894039735, "grad_norm": 0.7445752009619905, "learning_rate": 6.578159662864041e-06, "loss": 0.252, "step": 8392 }, { "epoch": 5.558278145695365, "grad_norm": 0.6890255025252918, "learning_rate": 6.5586240056392565e-06, "loss": 0.2256, "step": 8393 }, { "epoch": 5.558940397350994, "grad_norm": 0.8308510965175706, "learning_rate": 6.539116751451462e-06, "loss": 0.2676, "step": 8394 }, { "epoch": 5.559602649006623, "grad_norm": 0.8203633569264653, "learning_rate": 6.519637904163266e-06, "loss": 0.2676, "step": 8395 }, { "epoch": 5.5602649006622515, "grad_norm": 0.8050032967942341, "learning_rate": 6.5001874676317015e-06, "loss": 0.252, "step": 8396 }, { "epoch": 5.5609271523178805, "grad_norm": 0.7448345502268655, "learning_rate": 6.480765445708186e-06, "loss": 0.2334, "step": 8397 }, { "epoch": 5.56158940397351, "grad_norm": 0.7576869613315572, "learning_rate": 6.461371842238528e-06, "loss": 0.2832, "step": 8398 }, { "epoch": 5.562251655629139, "grad_norm": 0.8997034901971405, "learning_rate": 6.442006661062837e-06, "loss": 0.2852, "step": 8399 }, { "epoch": 5.562913907284768, "grad_norm": 0.7859713816157559, "learning_rate": 6.422669906015698e-06, "loss": 0.252, "step": 8400 }, { "epoch": 5.563576158940397, "grad_norm": 0.7912667782919214, "learning_rate": 6.403361580925947e-06, "loss": 0.2578, "step": 8401 }, { "epoch": 5.564238410596026, "grad_norm": 0.6832060697119714, "learning_rate": 6.384081689616894e-06, "loss": 0.2217, "step": 8402 }, { "epoch": 5.564900662251656, "grad_norm": 0.6991137397027128, "learning_rate": 6.364830235906182e-06, "loss": 0.2412, "step": 8403 }, { "epoch": 5.565562913907285, "grad_norm": 0.761468316267935, "learning_rate": 6.345607223605814e-06, "loss": 0.2305, "step": 8404 }, { "epoch": 5.566225165562914, "grad_norm": 0.6916545574548287, "learning_rate": 6.326412656522128e-06, "loss": 0.2158, "step": 8405 }, { "epoch": 5.566887417218543, "grad_norm": 0.74579428230351, "learning_rate": 6.3072465384559015e-06, "loss": 0.252, "step": 8406 }, { "epoch": 5.567549668874172, "grad_norm": 0.7823247587881895, "learning_rate": 6.28810887320228e-06, "loss": 0.2695, "step": 8407 }, { "epoch": 5.5682119205298015, "grad_norm": 0.7198752351127756, "learning_rate": 6.268999664550683e-06, "loss": 0.21, "step": 8408 }, { "epoch": 5.5688741721854305, "grad_norm": 0.7547697319321347, "learning_rate": 6.249918916284969e-06, "loss": 0.2305, "step": 8409 }, { "epoch": 5.569536423841059, "grad_norm": 0.71998029381535, "learning_rate": 6.230866632183345e-06, "loss": 0.2275, "step": 8410 }, { "epoch": 5.570198675496689, "grad_norm": 0.633533752953935, "learning_rate": 6.211842816018414e-06, "loss": 0.1982, "step": 8411 }, { "epoch": 5.570860927152318, "grad_norm": 0.7704261637844638, "learning_rate": 6.192847471557045e-06, "loss": 0.2344, "step": 8412 }, { "epoch": 5.571523178807947, "grad_norm": 0.7463619405142556, "learning_rate": 6.1738806025606105e-06, "loss": 0.2559, "step": 8413 }, { "epoch": 5.572185430463576, "grad_norm": 0.8024549232017715, "learning_rate": 6.154942212784708e-06, "loss": 0.2314, "step": 8414 }, { "epoch": 5.572847682119205, "grad_norm": 0.6514501930914766, "learning_rate": 6.136032305979372e-06, "loss": 0.2236, "step": 8415 }, { "epoch": 5.573509933774835, "grad_norm": 0.8206122133660599, "learning_rate": 6.117150885889005e-06, "loss": 0.2393, "step": 8416 }, { "epoch": 5.574172185430464, "grad_norm": 0.6286922233077189, "learning_rate": 6.098297956252302e-06, "loss": 0.1943, "step": 8417 }, { "epoch": 5.574834437086093, "grad_norm": 0.8595431519278207, "learning_rate": 6.079473520802391e-06, "loss": 0.3008, "step": 8418 }, { "epoch": 5.575496688741722, "grad_norm": 0.7776277998643802, "learning_rate": 6.060677583266726e-06, "loss": 0.25, "step": 8419 }, { "epoch": 5.576158940397351, "grad_norm": 0.7919388692558922, "learning_rate": 6.04191014736713e-06, "loss": 0.2432, "step": 8420 }, { "epoch": 5.57682119205298, "grad_norm": 0.6910393287704936, "learning_rate": 6.023171216819711e-06, "loss": 0.2354, "step": 8421 }, { "epoch": 5.577483443708609, "grad_norm": 0.7811169638029272, "learning_rate": 6.004460795335036e-06, "loss": 0.2383, "step": 8422 }, { "epoch": 5.578145695364238, "grad_norm": 0.7506580018110335, "learning_rate": 5.985778886618009e-06, "loss": 0.2148, "step": 8423 }, { "epoch": 5.578807947019867, "grad_norm": 0.7897003599137093, "learning_rate": 5.967125494367802e-06, "loss": 0.25, "step": 8424 }, { "epoch": 5.579470198675496, "grad_norm": 0.7291403893480034, "learning_rate": 5.948500622278046e-06, "loss": 0.2188, "step": 8425 }, { "epoch": 5.580132450331126, "grad_norm": 0.8168084017981146, "learning_rate": 5.929904274036657e-06, "loss": 0.2852, "step": 8426 }, { "epoch": 5.580794701986755, "grad_norm": 0.6928197176456917, "learning_rate": 5.911336453325921e-06, "loss": 0.2119, "step": 8427 }, { "epoch": 5.581456953642384, "grad_norm": 0.7117914697721293, "learning_rate": 5.892797163822516e-06, "loss": 0.2422, "step": 8428 }, { "epoch": 5.582119205298013, "grad_norm": 0.7752433603498773, "learning_rate": 5.874286409197404e-06, "loss": 0.2275, "step": 8429 }, { "epoch": 5.582781456953643, "grad_norm": 0.8223441062466899, "learning_rate": 5.855804193115904e-06, "loss": 0.2832, "step": 8430 }, { "epoch": 5.583443708609272, "grad_norm": 0.6980149777548678, "learning_rate": 5.8373505192377545e-06, "loss": 0.2354, "step": 8431 }, { "epoch": 5.584105960264901, "grad_norm": 0.7548319089763472, "learning_rate": 5.818925391216983e-06, "loss": 0.2539, "step": 8432 }, { "epoch": 5.5847682119205295, "grad_norm": 0.7703726783696607, "learning_rate": 5.800528812701938e-06, "loss": 0.2373, "step": 8433 }, { "epoch": 5.585430463576159, "grad_norm": 0.7254880123857514, "learning_rate": 5.782160787335388e-06, "loss": 0.2188, "step": 8434 }, { "epoch": 5.586092715231788, "grad_norm": 0.7705445269541704, "learning_rate": 5.7638213187544245e-06, "loss": 0.2539, "step": 8435 }, { "epoch": 5.586754966887417, "grad_norm": 0.5955668623920266, "learning_rate": 5.745510410590426e-06, "loss": 0.1748, "step": 8436 }, { "epoch": 5.587417218543046, "grad_norm": 0.7736918690466908, "learning_rate": 5.727228066469225e-06, "loss": 0.2559, "step": 8437 }, { "epoch": 5.588079470198675, "grad_norm": 0.7678887157863181, "learning_rate": 5.708974290010859e-06, "loss": 0.2754, "step": 8438 }, { "epoch": 5.588741721854305, "grad_norm": 0.6595611925196745, "learning_rate": 5.6907490848298544e-06, "loss": 0.1924, "step": 8439 }, { "epoch": 5.589403973509934, "grad_norm": 0.7199444960157584, "learning_rate": 5.6725524545349735e-06, "loss": 0.2061, "step": 8440 }, { "epoch": 5.590066225165563, "grad_norm": 0.8106593922681078, "learning_rate": 5.654384402729384e-06, "loss": 0.2969, "step": 8441 }, { "epoch": 5.590728476821192, "grad_norm": 0.8554274419406235, "learning_rate": 5.636244933010525e-06, "loss": 0.2949, "step": 8442 }, { "epoch": 5.591390728476821, "grad_norm": 0.7750320447759975, "learning_rate": 5.618134048970258e-06, "loss": 0.2451, "step": 8443 }, { "epoch": 5.592052980132451, "grad_norm": 0.7579028643633943, "learning_rate": 5.600051754194729e-06, "loss": 0.2383, "step": 8444 }, { "epoch": 5.5927152317880795, "grad_norm": 0.7843461395641105, "learning_rate": 5.581998052264441e-06, "loss": 0.2451, "step": 8445 }, { "epoch": 5.593377483443708, "grad_norm": 0.6375469834454234, "learning_rate": 5.563972946754253e-06, "loss": 0.1963, "step": 8446 }, { "epoch": 5.594039735099337, "grad_norm": 0.8267859142640496, "learning_rate": 5.545976441233324e-06, "loss": 0.248, "step": 8447 }, { "epoch": 5.594701986754967, "grad_norm": 0.742734296315138, "learning_rate": 5.528008539265155e-06, "loss": 0.2295, "step": 8448 }, { "epoch": 5.595364238410596, "grad_norm": 0.744074110937815, "learning_rate": 5.5100692444076336e-06, "loss": 0.2373, "step": 8449 }, { "epoch": 5.596026490066225, "grad_norm": 0.8012950600698244, "learning_rate": 5.492158560212917e-06, "loss": 0.2432, "step": 8450 }, { "epoch": 5.596688741721854, "grad_norm": 0.7367631048301474, "learning_rate": 5.474276490227502e-06, "loss": 0.2432, "step": 8451 }, { "epoch": 5.597350993377484, "grad_norm": 0.7721930189762625, "learning_rate": 5.456423037992308e-06, "loss": 0.2441, "step": 8452 }, { "epoch": 5.598013245033113, "grad_norm": 0.7238026352825647, "learning_rate": 5.438598207042505e-06, "loss": 0.2334, "step": 8453 }, { "epoch": 5.598675496688742, "grad_norm": 0.7310900653353348, "learning_rate": 5.420802000907571e-06, "loss": 0.2129, "step": 8454 }, { "epoch": 5.599337748344371, "grad_norm": 0.675129106333715, "learning_rate": 5.4030344231113855e-06, "loss": 0.1836, "step": 8455 }, { "epoch": 5.6, "grad_norm": 0.725645843175105, "learning_rate": 5.385295477172152e-06, "loss": 0.25, "step": 8456 }, { "epoch": 5.6006622516556295, "grad_norm": 0.7635799158133673, "learning_rate": 5.367585166602361e-06, "loss": 0.2598, "step": 8457 }, { "epoch": 5.601324503311258, "grad_norm": 0.8167069447447451, "learning_rate": 5.349903494908875e-06, "loss": 0.2773, "step": 8458 }, { "epoch": 5.601986754966887, "grad_norm": 0.7453381338680775, "learning_rate": 5.332250465592825e-06, "loss": 0.2285, "step": 8459 }, { "epoch": 5.602649006622516, "grad_norm": 0.7679852408327323, "learning_rate": 5.314626082149764e-06, "loss": 0.2695, "step": 8460 }, { "epoch": 5.603311258278145, "grad_norm": 0.7437396224587862, "learning_rate": 5.297030348069503e-06, "loss": 0.2363, "step": 8461 }, { "epoch": 5.603973509933775, "grad_norm": 0.7561477499614427, "learning_rate": 5.27946326683622e-06, "loss": 0.2363, "step": 8462 }, { "epoch": 5.604635761589404, "grad_norm": 0.8040407439067624, "learning_rate": 5.261924841928333e-06, "loss": 0.2451, "step": 8463 }, { "epoch": 5.605298013245033, "grad_norm": 0.8287852295913165, "learning_rate": 5.244415076818714e-06, "loss": 0.2539, "step": 8464 }, { "epoch": 5.605960264900662, "grad_norm": 0.7714540217243087, "learning_rate": 5.22693397497449e-06, "loss": 0.2354, "step": 8465 }, { "epoch": 5.606622516556292, "grad_norm": 0.6396068428681412, "learning_rate": 5.2094815398571075e-06, "loss": 0.1924, "step": 8466 }, { "epoch": 5.607284768211921, "grad_norm": 0.7822072028577506, "learning_rate": 5.192057774922337e-06, "loss": 0.2637, "step": 8467 }, { "epoch": 5.60794701986755, "grad_norm": 0.9030796734585833, "learning_rate": 5.174662683620301e-06, "loss": 0.2871, "step": 8468 }, { "epoch": 5.608609271523179, "grad_norm": 0.7698454819279575, "learning_rate": 5.157296269395428e-06, "loss": 0.2539, "step": 8469 }, { "epoch": 5.609271523178808, "grad_norm": 0.7003535530530464, "learning_rate": 5.139958535686451e-06, "loss": 0.2217, "step": 8470 }, { "epoch": 5.609933774834437, "grad_norm": 0.7217079424270505, "learning_rate": 5.122649485926439e-06, "loss": 0.209, "step": 8471 }, { "epoch": 5.610596026490066, "grad_norm": 0.8664892909776489, "learning_rate": 5.105369123542819e-06, "loss": 0.3105, "step": 8472 }, { "epoch": 5.611258278145695, "grad_norm": 0.7708588323659494, "learning_rate": 5.088117451957252e-06, "loss": 0.2539, "step": 8473 }, { "epoch": 5.611920529801324, "grad_norm": 0.7651796170686557, "learning_rate": 5.070894474585807e-06, "loss": 0.252, "step": 8474 }, { "epoch": 5.612582781456954, "grad_norm": 0.8030373171086901, "learning_rate": 5.053700194838806e-06, "loss": 0.2451, "step": 8475 }, { "epoch": 5.613245033112583, "grad_norm": 0.7502180844733405, "learning_rate": 5.036534616120924e-06, "loss": 0.2236, "step": 8476 }, { "epoch": 5.613907284768212, "grad_norm": 0.8619030455109354, "learning_rate": 5.019397741831143e-06, "loss": 0.3184, "step": 8477 }, { "epoch": 5.614569536423841, "grad_norm": 0.7205090823633746, "learning_rate": 5.002289575362767e-06, "loss": 0.25, "step": 8478 }, { "epoch": 5.61523178807947, "grad_norm": 0.8249374689913206, "learning_rate": 4.985210120103383e-06, "loss": 0.2852, "step": 8479 }, { "epoch": 5.6158940397351, "grad_norm": 0.7382911410621639, "learning_rate": 4.968159379434938e-06, "loss": 0.2363, "step": 8480 }, { "epoch": 5.6165562913907285, "grad_norm": 0.6641485301840342, "learning_rate": 4.951137356733681e-06, "loss": 0.2275, "step": 8481 }, { "epoch": 5.6172185430463575, "grad_norm": 0.724756162719566, "learning_rate": 4.9341440553701315e-06, "loss": 0.248, "step": 8482 }, { "epoch": 5.617880794701986, "grad_norm": 0.7266456200452215, "learning_rate": 4.917179478709215e-06, "loss": 0.2598, "step": 8483 }, { "epoch": 5.618543046357616, "grad_norm": 1.0474044201359225, "learning_rate": 4.900243630110062e-06, "loss": 0.3965, "step": 8484 }, { "epoch": 5.619205298013245, "grad_norm": 0.8426537751290345, "learning_rate": 4.883336512926189e-06, "loss": 0.3203, "step": 8485 }, { "epoch": 5.619867549668874, "grad_norm": 0.785220401367622, "learning_rate": 4.866458130505418e-06, "loss": 0.2715, "step": 8486 }, { "epoch": 5.620529801324503, "grad_norm": 0.8425988365176172, "learning_rate": 4.849608486189827e-06, "loss": 0.2754, "step": 8487 }, { "epoch": 5.621192052980133, "grad_norm": 0.7369887467602914, "learning_rate": 4.832787583315828e-06, "loss": 0.2246, "step": 8488 }, { "epoch": 5.621854304635762, "grad_norm": 0.7361218707803845, "learning_rate": 4.815995425214192e-06, "loss": 0.2217, "step": 8489 }, { "epoch": 5.622516556291391, "grad_norm": 0.7985520742746308, "learning_rate": 4.799232015209959e-06, "loss": 0.2578, "step": 8490 }, { "epoch": 5.62317880794702, "grad_norm": 0.6454044058097419, "learning_rate": 4.782497356622422e-06, "loss": 0.1904, "step": 8491 }, { "epoch": 5.623841059602649, "grad_norm": 0.854726650031557, "learning_rate": 4.7657914527652994e-06, "loss": 0.2949, "step": 8492 }, { "epoch": 5.6245033112582785, "grad_norm": 0.8426656528155526, "learning_rate": 4.749114306946528e-06, "loss": 0.2969, "step": 8493 }, { "epoch": 5.6251655629139075, "grad_norm": 0.7167154625595982, "learning_rate": 4.732465922468348e-06, "loss": 0.2441, "step": 8494 }, { "epoch": 5.625827814569536, "grad_norm": 0.8681019410389452, "learning_rate": 4.715846302627374e-06, "loss": 0.3027, "step": 8495 }, { "epoch": 5.626490066225165, "grad_norm": 0.6992059233735132, "learning_rate": 4.699255450714456e-06, "loss": 0.2178, "step": 8496 }, { "epoch": 5.627152317880794, "grad_norm": 0.6661296447732384, "learning_rate": 4.6826933700147676e-06, "loss": 0.2031, "step": 8497 }, { "epoch": 5.627814569536424, "grad_norm": 0.7338539459661653, "learning_rate": 4.666160063807817e-06, "loss": 0.2539, "step": 8498 }, { "epoch": 5.628476821192053, "grad_norm": 0.6622866212414646, "learning_rate": 4.649655535367369e-06, "loss": 0.2178, "step": 8499 }, { "epoch": 5.629139072847682, "grad_norm": 0.7854816111874673, "learning_rate": 4.633179787961494e-06, "loss": 0.2754, "step": 8500 }, { "epoch": 5.629801324503311, "grad_norm": 0.6961236917235227, "learning_rate": 4.616732824852598e-06, "loss": 0.2041, "step": 8501 }, { "epoch": 5.630463576158941, "grad_norm": 0.7444002390163725, "learning_rate": 4.600314649297393e-06, "loss": 0.2412, "step": 8502 }, { "epoch": 5.63112582781457, "grad_norm": 0.7075861272820734, "learning_rate": 4.583925264546828e-06, "loss": 0.2285, "step": 8503 }, { "epoch": 5.631788079470199, "grad_norm": 0.7448360725956312, "learning_rate": 4.567564673846208e-06, "loss": 0.2393, "step": 8504 }, { "epoch": 5.632450331125828, "grad_norm": 0.774574677500248, "learning_rate": 4.551232880435091e-06, "loss": 0.2314, "step": 8505 }, { "epoch": 5.633112582781457, "grad_norm": 0.6721241266165442, "learning_rate": 4.534929887547389e-06, "loss": 0.1992, "step": 8506 }, { "epoch": 5.633774834437086, "grad_norm": 0.6955758003291067, "learning_rate": 4.518655698411272e-06, "loss": 0.2236, "step": 8507 }, { "epoch": 5.634437086092715, "grad_norm": 0.7259023429007827, "learning_rate": 4.502410316249211e-06, "loss": 0.2236, "step": 8508 }, { "epoch": 5.635099337748344, "grad_norm": 0.6501745976506197, "learning_rate": 4.486193744277982e-06, "loss": 0.2197, "step": 8509 }, { "epoch": 5.635761589403973, "grad_norm": 0.650200124014585, "learning_rate": 4.470005985708669e-06, "loss": 0.1992, "step": 8510 }, { "epoch": 5.636423841059603, "grad_norm": 0.6653012577069665, "learning_rate": 4.453847043746622e-06, "loss": 0.2236, "step": 8511 }, { "epoch": 5.637086092715232, "grad_norm": 0.6974592809883344, "learning_rate": 4.437716921591483e-06, "loss": 0.2178, "step": 8512 }, { "epoch": 5.637748344370861, "grad_norm": 0.8825311385695657, "learning_rate": 4.421615622437197e-06, "loss": 0.2969, "step": 8513 }, { "epoch": 5.63841059602649, "grad_norm": 0.7311989974103863, "learning_rate": 4.40554314947203e-06, "loss": 0.2305, "step": 8514 }, { "epoch": 5.639072847682119, "grad_norm": 0.7106977707789908, "learning_rate": 4.389499505878502e-06, "loss": 0.2207, "step": 8515 }, { "epoch": 5.639735099337749, "grad_norm": 0.8321591582209643, "learning_rate": 4.373484694833457e-06, "loss": 0.3047, "step": 8516 }, { "epoch": 5.640397350993378, "grad_norm": 0.8722504798142211, "learning_rate": 4.357498719507974e-06, "loss": 0.2832, "step": 8517 }, { "epoch": 5.6410596026490065, "grad_norm": 0.8112295318165914, "learning_rate": 4.341541583067487e-06, "loss": 0.2812, "step": 8518 }, { "epoch": 5.6417218543046355, "grad_norm": 0.8015420691657599, "learning_rate": 4.3256132886717035e-06, "loss": 0.252, "step": 8519 }, { "epoch": 5.642384105960264, "grad_norm": 0.8264326679469394, "learning_rate": 4.309713839474582e-06, "loss": 0.25, "step": 8520 }, { "epoch": 5.643046357615894, "grad_norm": 0.7123104355496577, "learning_rate": 4.293843238624406e-06, "loss": 0.2402, "step": 8521 }, { "epoch": 5.643708609271523, "grad_norm": 0.6518236349832517, "learning_rate": 4.27800148926371e-06, "loss": 0.2061, "step": 8522 }, { "epoch": 5.644370860927152, "grad_norm": 0.6630715092768414, "learning_rate": 4.262188594529404e-06, "loss": 0.1934, "step": 8523 }, { "epoch": 5.645033112582782, "grad_norm": 0.7916764999954217, "learning_rate": 4.24640455755258e-06, "loss": 0.2578, "step": 8524 }, { "epoch": 5.645695364238411, "grad_norm": 0.8036782363264839, "learning_rate": 4.230649381458623e-06, "loss": 0.2734, "step": 8525 }, { "epoch": 5.64635761589404, "grad_norm": 0.7253599321378659, "learning_rate": 4.214923069367321e-06, "loss": 0.1982, "step": 8526 }, { "epoch": 5.647019867549669, "grad_norm": 0.7133757881977699, "learning_rate": 4.199225624392633e-06, "loss": 0.2148, "step": 8527 }, { "epoch": 5.647682119205298, "grad_norm": 0.7704658825528609, "learning_rate": 4.183557049642788e-06, "loss": 0.2539, "step": 8528 }, { "epoch": 5.6483443708609276, "grad_norm": 0.7623762474808625, "learning_rate": 4.16791734822039e-06, "loss": 0.249, "step": 8529 }, { "epoch": 5.6490066225165565, "grad_norm": 0.7185483067537491, "learning_rate": 4.152306523222276e-06, "loss": 0.209, "step": 8530 }, { "epoch": 5.649668874172185, "grad_norm": 0.6744706952726712, "learning_rate": 4.136724577739542e-06, "loss": 0.2188, "step": 8531 }, { "epoch": 5.650331125827814, "grad_norm": 0.8612375882948342, "learning_rate": 4.1211715148576355e-06, "loss": 0.2949, "step": 8532 }, { "epoch": 5.650993377483443, "grad_norm": 0.7524673215856749, "learning_rate": 4.105647337656192e-06, "loss": 0.2344, "step": 8533 }, { "epoch": 5.651655629139073, "grad_norm": 0.8624627487494877, "learning_rate": 4.0901520492092055e-06, "loss": 0.2793, "step": 8534 }, { "epoch": 5.652317880794702, "grad_norm": 0.7331302662725838, "learning_rate": 4.074685652584919e-06, "loss": 0.2402, "step": 8535 }, { "epoch": 5.652980132450331, "grad_norm": 0.7830063282058954, "learning_rate": 4.059248150845867e-06, "loss": 0.2734, "step": 8536 }, { "epoch": 5.65364238410596, "grad_norm": 0.6801219166697693, "learning_rate": 4.043839547048805e-06, "loss": 0.2051, "step": 8537 }, { "epoch": 5.654304635761589, "grad_norm": 0.675234335847937, "learning_rate": 4.028459844244858e-06, "loss": 0.2207, "step": 8538 }, { "epoch": 5.654966887417219, "grad_norm": 0.6881030706520367, "learning_rate": 4.01310904547939e-06, "loss": 0.2129, "step": 8539 }, { "epoch": 5.655629139072848, "grad_norm": 0.7552724632652142, "learning_rate": 3.997787153791987e-06, "loss": 0.2373, "step": 8540 }, { "epoch": 5.656291390728477, "grad_norm": 0.7139538729361956, "learning_rate": 3.982494172216621e-06, "loss": 0.2178, "step": 8541 }, { "epoch": 5.656953642384106, "grad_norm": 0.7041626426663392, "learning_rate": 3.967230103781421e-06, "loss": 0.1963, "step": 8542 }, { "epoch": 5.657615894039735, "grad_norm": 0.7219860831987656, "learning_rate": 3.951994951508869e-06, "loss": 0.2129, "step": 8543 }, { "epoch": 5.658278145695364, "grad_norm": 0.8409502608875353, "learning_rate": 3.936788718415718e-06, "loss": 0.2773, "step": 8544 }, { "epoch": 5.658940397350993, "grad_norm": 0.8220322646406694, "learning_rate": 3.921611407512959e-06, "loss": 0.2676, "step": 8545 }, { "epoch": 5.659602649006622, "grad_norm": 0.7175441090841517, "learning_rate": 3.906463021805839e-06, "loss": 0.2129, "step": 8546 }, { "epoch": 5.660264900662252, "grad_norm": 0.6993501135172824, "learning_rate": 3.891343564293992e-06, "loss": 0.2119, "step": 8547 }, { "epoch": 5.660927152317881, "grad_norm": 0.7328783589762904, "learning_rate": 3.876253037971206e-06, "loss": 0.2246, "step": 8548 }, { "epoch": 5.66158940397351, "grad_norm": 0.7656772656708439, "learning_rate": 3.86119144582554e-06, "loss": 0.2363, "step": 8549 }, { "epoch": 5.662251655629139, "grad_norm": 0.9025197899645491, "learning_rate": 3.84615879083941e-06, "loss": 0.3223, "step": 8550 }, { "epoch": 5.662913907284768, "grad_norm": 0.853666487162792, "learning_rate": 3.831155075989434e-06, "loss": 0.2617, "step": 8551 }, { "epoch": 5.663576158940398, "grad_norm": 0.6642960617220305, "learning_rate": 3.8161803042465185e-06, "loss": 0.2217, "step": 8552 }, { "epoch": 5.664238410596027, "grad_norm": 0.6970475076943218, "learning_rate": 3.8012344785758596e-06, "loss": 0.2002, "step": 8553 }, { "epoch": 5.664900662251656, "grad_norm": 0.7323323643977634, "learning_rate": 3.786317601936889e-06, "loss": 0.2305, "step": 8554 }, { "epoch": 5.6655629139072845, "grad_norm": 0.9448954263774241, "learning_rate": 3.771429677283294e-06, "loss": 0.3125, "step": 8555 }, { "epoch": 5.6662251655629134, "grad_norm": 0.6869703212333618, "learning_rate": 3.7565707075631167e-06, "loss": 0.2119, "step": 8556 }, { "epoch": 5.666887417218543, "grad_norm": 0.7468329210827919, "learning_rate": 3.7417406957185537e-06, "loss": 0.2275, "step": 8557 }, { "epoch": 5.667549668874172, "grad_norm": 0.7178786799515048, "learning_rate": 3.7269396446861057e-06, "loss": 0.1992, "step": 8558 }, { "epoch": 5.668211920529801, "grad_norm": 0.7268704655523025, "learning_rate": 3.712167557396595e-06, "loss": 0.2383, "step": 8559 }, { "epoch": 5.66887417218543, "grad_norm": 0.6662134359771252, "learning_rate": 3.697424436775032e-06, "loss": 0.2197, "step": 8560 }, { "epoch": 5.66953642384106, "grad_norm": 0.7576907119054107, "learning_rate": 3.6827102857407475e-06, "loss": 0.2461, "step": 8561 }, { "epoch": 5.670198675496689, "grad_norm": 0.75228012118287, "learning_rate": 3.6680251072072787e-06, "loss": 0.2334, "step": 8562 }, { "epoch": 5.670860927152318, "grad_norm": 0.833643875365092, "learning_rate": 3.6533689040824988e-06, "loss": 0.2617, "step": 8563 }, { "epoch": 5.671523178807947, "grad_norm": 0.7704374423066492, "learning_rate": 3.6387416792684697e-06, "loss": 0.2471, "step": 8564 }, { "epoch": 5.672185430463577, "grad_norm": 0.8086400627370286, "learning_rate": 3.624143435661592e-06, "loss": 0.2432, "step": 8565 }, { "epoch": 5.6728476821192055, "grad_norm": 0.648369241314559, "learning_rate": 3.6095741761524355e-06, "loss": 0.2119, "step": 8566 }, { "epoch": 5.6735099337748345, "grad_norm": 0.7035289876089487, "learning_rate": 3.59503390362591e-06, "loss": 0.2266, "step": 8567 }, { "epoch": 5.674172185430463, "grad_norm": 0.7910925795454296, "learning_rate": 3.580522620961146e-06, "loss": 0.2246, "step": 8568 }, { "epoch": 5.674834437086092, "grad_norm": 0.6874882487466664, "learning_rate": 3.566040331031561e-06, "loss": 0.1992, "step": 8569 }, { "epoch": 5.675496688741722, "grad_norm": 0.7049272783022219, "learning_rate": 3.551587036704795e-06, "loss": 0.2236, "step": 8570 }, { "epoch": 5.676158940397351, "grad_norm": 0.6846851855216668, "learning_rate": 3.537162740842758e-06, "loss": 0.2207, "step": 8571 }, { "epoch": 5.67682119205298, "grad_norm": 0.6900816179124258, "learning_rate": 3.5227674463016664e-06, "loss": 0.2031, "step": 8572 }, { "epoch": 5.677483443708609, "grad_norm": 0.6858039868723763, "learning_rate": 3.508401155931906e-06, "loss": 0.209, "step": 8573 }, { "epoch": 5.678145695364238, "grad_norm": 0.808526551510992, "learning_rate": 3.494063872578201e-06, "loss": 0.248, "step": 8574 }, { "epoch": 5.678807947019868, "grad_norm": 0.7473907055470922, "learning_rate": 3.4797555990794645e-06, "loss": 0.2295, "step": 8575 }, { "epoch": 5.679470198675497, "grad_norm": 0.7926581910556464, "learning_rate": 3.46547633826893e-06, "loss": 0.2793, "step": 8576 }, { "epoch": 5.680132450331126, "grad_norm": 0.7996822227126037, "learning_rate": 3.451226092974052e-06, "loss": 0.2812, "step": 8577 }, { "epoch": 5.680794701986755, "grad_norm": 0.6676539453929504, "learning_rate": 3.4370048660165404e-06, "loss": 0.2168, "step": 8578 }, { "epoch": 5.6814569536423845, "grad_norm": 0.7683045936438035, "learning_rate": 3.4228126602123264e-06, "loss": 0.252, "step": 8579 }, { "epoch": 5.682119205298013, "grad_norm": 0.7442792110756081, "learning_rate": 3.4086494783716776e-06, "loss": 0.2402, "step": 8580 }, { "epoch": 5.682781456953642, "grad_norm": 0.8850624310786916, "learning_rate": 3.394515323299052e-06, "loss": 0.2988, "step": 8581 }, { "epoch": 5.683443708609271, "grad_norm": 0.8546360882550629, "learning_rate": 3.38041019779316e-06, "loss": 0.2891, "step": 8582 }, { "epoch": 5.684105960264901, "grad_norm": 0.7725394639362063, "learning_rate": 3.366334104646984e-06, "loss": 0.2256, "step": 8583 }, { "epoch": 5.68476821192053, "grad_norm": 0.799132548546653, "learning_rate": 3.3522870466477616e-06, "loss": 0.2598, "step": 8584 }, { "epoch": 5.685430463576159, "grad_norm": 0.643589630239686, "learning_rate": 3.3382690265769675e-06, "loss": 0.1836, "step": 8585 }, { "epoch": 5.686092715231788, "grad_norm": 0.7947782396089556, "learning_rate": 3.3242800472103316e-06, "loss": 0.2441, "step": 8586 }, { "epoch": 5.686754966887417, "grad_norm": 0.8028651906906636, "learning_rate": 3.3103201113178047e-06, "loss": 0.2598, "step": 8587 }, { "epoch": 5.687417218543047, "grad_norm": 0.7922659986714103, "learning_rate": 3.2963892216636754e-06, "loss": 0.2441, "step": 8588 }, { "epoch": 5.688079470198676, "grad_norm": 0.8025376847540597, "learning_rate": 3.2824873810063546e-06, "loss": 0.2363, "step": 8589 }, { "epoch": 5.688741721854305, "grad_norm": 0.6783142847557733, "learning_rate": 3.2686145920986064e-06, "loss": 0.2227, "step": 8590 }, { "epoch": 5.6894039735099335, "grad_norm": 0.7800951401510476, "learning_rate": 3.254770857687383e-06, "loss": 0.2715, "step": 8591 }, { "epoch": 5.6900662251655625, "grad_norm": 0.7376079901427984, "learning_rate": 3.240956180513893e-06, "loss": 0.2393, "step": 8592 }, { "epoch": 5.690728476821192, "grad_norm": 0.7675613267619977, "learning_rate": 3.227170563313647e-06, "loss": 0.2471, "step": 8593 }, { "epoch": 5.691390728476821, "grad_norm": 0.6444713595340121, "learning_rate": 3.2134140088163284e-06, "loss": 0.2021, "step": 8594 }, { "epoch": 5.69205298013245, "grad_norm": 0.7421009342083975, "learning_rate": 3.1996865197458586e-06, "loss": 0.2422, "step": 8595 }, { "epoch": 5.692715231788079, "grad_norm": 0.7727205023404466, "learning_rate": 3.18598809882048e-06, "loss": 0.2344, "step": 8596 }, { "epoch": 5.693377483443709, "grad_norm": 0.8790695565464431, "learning_rate": 3.172318748752639e-06, "loss": 0.291, "step": 8597 }, { "epoch": 5.694039735099338, "grad_norm": 0.7217626471124348, "learning_rate": 3.1586784722489876e-06, "loss": 0.2441, "step": 8598 }, { "epoch": 5.694701986754967, "grad_norm": 0.7007544142337919, "learning_rate": 3.1450672720104986e-06, "loss": 0.1797, "step": 8599 }, { "epoch": 5.695364238410596, "grad_norm": 0.8626877288751443, "learning_rate": 3.1314851507323156e-06, "loss": 0.2617, "step": 8600 }, { "epoch": 5.696026490066226, "grad_norm": 0.7587759161443056, "learning_rate": 3.117932111103871e-06, "loss": 0.208, "step": 8601 }, { "epoch": 5.696688741721855, "grad_norm": 0.7655784543850515, "learning_rate": 3.1044081558088185e-06, "loss": 0.2324, "step": 8602 }, { "epoch": 5.6973509933774835, "grad_norm": 0.7106988096305012, "learning_rate": 3.0909132875250486e-06, "loss": 0.209, "step": 8603 }, { "epoch": 5.6980132450331125, "grad_norm": 0.6833138393880326, "learning_rate": 3.0774475089247075e-06, "loss": 0.2422, "step": 8604 }, { "epoch": 5.698675496688741, "grad_norm": 0.7430705986006035, "learning_rate": 3.064010822674179e-06, "loss": 0.2324, "step": 8605 }, { "epoch": 5.699337748344371, "grad_norm": 0.8602248738876542, "learning_rate": 3.050603231434068e-06, "loss": 0.2754, "step": 8606 }, { "epoch": 5.7, "grad_norm": 0.7380158473752596, "learning_rate": 3.037224737859234e-06, "loss": 0.2314, "step": 8607 }, { "epoch": 5.700662251655629, "grad_norm": 0.6632044511143497, "learning_rate": 3.023875344598775e-06, "loss": 0.2188, "step": 8608 }, { "epoch": 5.701324503311258, "grad_norm": 0.6206776248097631, "learning_rate": 3.0105550542960264e-06, "loss": 0.1953, "step": 8609 }, { "epoch": 5.701986754966887, "grad_norm": 0.7805705457006857, "learning_rate": 2.9972638695885454e-06, "loss": 0.2451, "step": 8610 }, { "epoch": 5.702649006622517, "grad_norm": 0.7637203865877168, "learning_rate": 2.984001793108176e-06, "loss": 0.2676, "step": 8611 }, { "epoch": 5.703311258278146, "grad_norm": 0.8546547458816739, "learning_rate": 2.9707688274809183e-06, "loss": 0.291, "step": 8612 }, { "epoch": 5.703973509933775, "grad_norm": 0.7315628751428759, "learning_rate": 2.9575649753270593e-06, "loss": 0.2324, "step": 8613 }, { "epoch": 5.704635761589404, "grad_norm": 0.8296784239040956, "learning_rate": 2.9443902392611407e-06, "loss": 0.2773, "step": 8614 }, { "epoch": 5.7052980132450335, "grad_norm": 0.8341940809898911, "learning_rate": 2.9312446218918926e-06, "loss": 0.2637, "step": 8615 }, { "epoch": 5.705960264900662, "grad_norm": 0.6827283667465832, "learning_rate": 2.9181281258222665e-06, "loss": 0.2158, "step": 8616 }, { "epoch": 5.706622516556291, "grad_norm": 0.7238618702923731, "learning_rate": 2.9050407536495346e-06, "loss": 0.2412, "step": 8617 }, { "epoch": 5.70728476821192, "grad_norm": 1.0761811260950445, "learning_rate": 2.891982507965124e-06, "loss": 0.3789, "step": 8618 }, { "epoch": 5.70794701986755, "grad_norm": 0.7594725598999803, "learning_rate": 2.878953391354699e-06, "loss": 0.2559, "step": 8619 }, { "epoch": 5.708609271523179, "grad_norm": 0.740918104653288, "learning_rate": 2.865953406398197e-06, "loss": 0.2832, "step": 8620 }, { "epoch": 5.709271523178808, "grad_norm": 0.8158432659720243, "learning_rate": 2.8529825556697748e-06, "loss": 0.2715, "step": 8621 }, { "epoch": 5.709933774834437, "grad_norm": 0.7074515762835106, "learning_rate": 2.8400408417377786e-06, "loss": 0.2461, "step": 8622 }, { "epoch": 5.710596026490066, "grad_norm": 0.7652949200933105, "learning_rate": 2.827128267164841e-06, "loss": 0.2715, "step": 8623 }, { "epoch": 5.711258278145696, "grad_norm": 0.7474207508758464, "learning_rate": 2.8142448345077673e-06, "loss": 0.2734, "step": 8624 }, { "epoch": 5.711920529801325, "grad_norm": 0.7188052119138011, "learning_rate": 2.80139054631765e-06, "loss": 0.2109, "step": 8625 }, { "epoch": 5.712582781456954, "grad_norm": 0.7147471376432075, "learning_rate": 2.7885654051397867e-06, "loss": 0.2197, "step": 8626 }, { "epoch": 5.713245033112583, "grad_norm": 0.818025315003347, "learning_rate": 2.775769413513712e-06, "loss": 0.2793, "step": 8627 }, { "epoch": 5.7139072847682115, "grad_norm": 0.7737542097602708, "learning_rate": 2.7630025739731498e-06, "loss": 0.2432, "step": 8628 }, { "epoch": 5.714569536423841, "grad_norm": 0.7151855972234946, "learning_rate": 2.7502648890460775e-06, "loss": 0.2188, "step": 8629 }, { "epoch": 5.71523178807947, "grad_norm": 0.6979875731903878, "learning_rate": 2.7375563612547446e-06, "loss": 0.2188, "step": 8630 }, { "epoch": 5.715894039735099, "grad_norm": 0.7892823754864899, "learning_rate": 2.7248769931155546e-06, "loss": 0.2578, "step": 8631 }, { "epoch": 5.716556291390728, "grad_norm": 0.7057219418411872, "learning_rate": 2.7122267871391823e-06, "loss": 0.2139, "step": 8632 }, { "epoch": 5.717218543046357, "grad_norm": 0.723895318119924, "learning_rate": 2.6996057458304908e-06, "loss": 0.2148, "step": 8633 }, { "epoch": 5.717880794701987, "grad_norm": 0.7368163756512313, "learning_rate": 2.6870138716886313e-06, "loss": 0.2041, "step": 8634 }, { "epoch": 5.718543046357616, "grad_norm": 0.7286834079608066, "learning_rate": 2.674451167206876e-06, "loss": 0.2246, "step": 8635 }, { "epoch": 5.719205298013245, "grad_norm": 0.816493453494043, "learning_rate": 2.661917634872851e-06, "loss": 0.293, "step": 8636 }, { "epoch": 5.719867549668874, "grad_norm": 1.0768925360474695, "learning_rate": 2.6494132771682887e-06, "loss": 0.4219, "step": 8637 }, { "epoch": 5.720529801324504, "grad_norm": 0.7362226662124302, "learning_rate": 2.6369380965692246e-06, "loss": 0.2412, "step": 8638 }, { "epoch": 5.721192052980133, "grad_norm": 0.6634310156524303, "learning_rate": 2.6244920955458994e-06, "loss": 0.2021, "step": 8639 }, { "epoch": 5.7218543046357615, "grad_norm": 0.7388836698531333, "learning_rate": 2.612075276562725e-06, "loss": 0.2344, "step": 8640 }, { "epoch": 5.72251655629139, "grad_norm": 0.7647830281345201, "learning_rate": 2.5996876420783854e-06, "loss": 0.249, "step": 8641 }, { "epoch": 5.72317880794702, "grad_norm": 0.742924859890923, "learning_rate": 2.587329194545801e-06, "loss": 0.2598, "step": 8642 }, { "epoch": 5.723841059602649, "grad_norm": 0.871409100066138, "learning_rate": 2.5749999364120643e-06, "loss": 0.2969, "step": 8643 }, { "epoch": 5.724503311258278, "grad_norm": 0.7323304545548751, "learning_rate": 2.5626998701185065e-06, "loss": 0.2539, "step": 8644 }, { "epoch": 5.725165562913907, "grad_norm": 0.7208838531302878, "learning_rate": 2.550428998100679e-06, "loss": 0.2412, "step": 8645 }, { "epoch": 5.725827814569536, "grad_norm": 0.6668759913903309, "learning_rate": 2.5381873227883874e-06, "loss": 0.1934, "step": 8646 }, { "epoch": 5.726490066225166, "grad_norm": 0.5976642479280211, "learning_rate": 2.5259748466055774e-06, "loss": 0.1846, "step": 8647 }, { "epoch": 5.727152317880795, "grad_norm": 0.810121359011573, "learning_rate": 2.513791571970497e-06, "loss": 0.2471, "step": 8648 }, { "epoch": 5.727814569536424, "grad_norm": 0.7210981807843834, "learning_rate": 2.5016375012955504e-06, "loss": 0.1885, "step": 8649 }, { "epoch": 5.728476821192053, "grad_norm": 0.7833789979075667, "learning_rate": 2.4895126369873953e-06, "loss": 0.2383, "step": 8650 }, { "epoch": 5.729139072847682, "grad_norm": 0.8151711431201897, "learning_rate": 2.477416981446895e-06, "loss": 0.2656, "step": 8651 }, { "epoch": 5.7298013245033115, "grad_norm": 0.8924368760128949, "learning_rate": 2.465350537069133e-06, "loss": 0.332, "step": 8652 }, { "epoch": 5.73046357615894, "grad_norm": 0.693292124364326, "learning_rate": 2.4533133062433653e-06, "loss": 0.2227, "step": 8653 }, { "epoch": 5.731125827814569, "grad_norm": 0.8739236686153801, "learning_rate": 2.441305291353152e-06, "loss": 0.3008, "step": 8654 }, { "epoch": 5.731788079470198, "grad_norm": 0.935125555202239, "learning_rate": 2.4293264947761904e-06, "loss": 0.3418, "step": 8655 }, { "epoch": 5.732450331125828, "grad_norm": 0.7218634894708744, "learning_rate": 2.417376918884417e-06, "loss": 0.2246, "step": 8656 }, { "epoch": 5.733112582781457, "grad_norm": 0.8350880091907508, "learning_rate": 2.405456566044006e-06, "loss": 0.2832, "step": 8657 }, { "epoch": 5.733774834437086, "grad_norm": 0.7428693814787666, "learning_rate": 2.3935654386153026e-06, "loss": 0.2305, "step": 8658 }, { "epoch": 5.734437086092715, "grad_norm": 0.8429084936738148, "learning_rate": 2.381703538952906e-06, "loss": 0.2773, "step": 8659 }, { "epoch": 5.735099337748345, "grad_norm": 0.7807614098197941, "learning_rate": 2.369870869405588e-06, "loss": 0.2539, "step": 8660 }, { "epoch": 5.735761589403974, "grad_norm": 0.8037381157085843, "learning_rate": 2.3580674323163574e-06, "loss": 0.2617, "step": 8661 }, { "epoch": 5.736423841059603, "grad_norm": 0.6683369236054126, "learning_rate": 2.3462932300224448e-06, "loss": 0.2012, "step": 8662 }, { "epoch": 5.737086092715232, "grad_norm": 0.7468594377186247, "learning_rate": 2.3345482648552682e-06, "loss": 0.2471, "step": 8663 }, { "epoch": 5.737748344370861, "grad_norm": 0.7753412220266124, "learning_rate": 2.3228325391404677e-06, "loss": 0.2393, "step": 8664 }, { "epoch": 5.73841059602649, "grad_norm": 0.7195722412527152, "learning_rate": 2.3111460551978877e-06, "loss": 0.2061, "step": 8665 }, { "epoch": 5.739072847682119, "grad_norm": 0.7373996863934508, "learning_rate": 2.2994888153415605e-06, "loss": 0.2363, "step": 8666 }, { "epoch": 5.739735099337748, "grad_norm": 0.7050511936168654, "learning_rate": 2.2878608218798057e-06, "loss": 0.209, "step": 8667 }, { "epoch": 5.740397350993377, "grad_norm": 0.7172396657055471, "learning_rate": 2.2762620771150653e-06, "loss": 0.2246, "step": 8668 }, { "epoch": 5.741059602649006, "grad_norm": 0.6966412225135395, "learning_rate": 2.2646925833440355e-06, "loss": 0.2236, "step": 8669 }, { "epoch": 5.741721854304636, "grad_norm": 0.6390564833159935, "learning_rate": 2.2531523428576005e-06, "loss": 0.1895, "step": 8670 }, { "epoch": 5.742384105960265, "grad_norm": 0.7010901656587819, "learning_rate": 2.241641357940849e-06, "loss": 0.2266, "step": 8671 }, { "epoch": 5.743046357615894, "grad_norm": 0.8219829680540488, "learning_rate": 2.230159630873124e-06, "loss": 0.2559, "step": 8672 }, { "epoch": 5.743708609271523, "grad_norm": 0.6978902351292718, "learning_rate": 2.218707163927924e-06, "loss": 0.2168, "step": 8673 }, { "epoch": 5.744370860927153, "grad_norm": 0.7383504338197789, "learning_rate": 2.207283959372935e-06, "loss": 0.2188, "step": 8674 }, { "epoch": 5.745033112582782, "grad_norm": 0.7485799926914288, "learning_rate": 2.195890019470131e-06, "loss": 0.2354, "step": 8675 }, { "epoch": 5.7456953642384105, "grad_norm": 0.7455558497780159, "learning_rate": 2.184525346475624e-06, "loss": 0.2207, "step": 8676 }, { "epoch": 5.7463576158940395, "grad_norm": 0.6702953788231936, "learning_rate": 2.1731899426397305e-06, "loss": 0.2139, "step": 8677 }, { "epoch": 5.747019867549669, "grad_norm": 0.7851674112158219, "learning_rate": 2.1618838102070046e-06, "loss": 0.2637, "step": 8678 }, { "epoch": 5.747682119205298, "grad_norm": 0.720888201542392, "learning_rate": 2.150606951416223e-06, "loss": 0.2402, "step": 8679 }, { "epoch": 5.748344370860927, "grad_norm": 0.7567741128796722, "learning_rate": 2.139359368500282e-06, "loss": 0.2617, "step": 8680 }, { "epoch": 5.749006622516556, "grad_norm": 0.7512194420382396, "learning_rate": 2.1281410636863504e-06, "loss": 0.2363, "step": 8681 }, { "epoch": 5.749668874172185, "grad_norm": 0.7641420717826817, "learning_rate": 2.116952039195785e-06, "loss": 0.249, "step": 8682 }, { "epoch": 5.750331125827815, "grad_norm": 0.747889420272799, "learning_rate": 2.1057922972441464e-06, "loss": 0.2354, "step": 8683 }, { "epoch": 5.750993377483444, "grad_norm": 0.726693111050286, "learning_rate": 2.094661840041184e-06, "loss": 0.2168, "step": 8684 }, { "epoch": 5.751655629139073, "grad_norm": 0.7429999503227016, "learning_rate": 2.083560669790868e-06, "loss": 0.2363, "step": 8685 }, { "epoch": 5.752317880794702, "grad_norm": 0.6694714308508266, "learning_rate": 2.0724887886913234e-06, "loss": 0.1963, "step": 8686 }, { "epoch": 5.752980132450331, "grad_norm": 0.7474054099951204, "learning_rate": 2.0614461989349295e-06, "loss": 0.2637, "step": 8687 }, { "epoch": 5.7536423841059605, "grad_norm": 0.7489807726623684, "learning_rate": 2.050432902708271e-06, "loss": 0.2695, "step": 8688 }, { "epoch": 5.7543046357615895, "grad_norm": 0.8461572490782385, "learning_rate": 2.0394489021920703e-06, "loss": 0.2852, "step": 8689 }, { "epoch": 5.754966887417218, "grad_norm": 0.8823206448204663, "learning_rate": 2.0284941995613037e-06, "loss": 0.3242, "step": 8690 }, { "epoch": 5.755629139072847, "grad_norm": 0.6187271290370921, "learning_rate": 2.0175687969851195e-06, "loss": 0.1895, "step": 8691 }, { "epoch": 5.756291390728477, "grad_norm": 0.663407771281543, "learning_rate": 2.0066726966268876e-06, "loss": 0.2109, "step": 8692 }, { "epoch": 5.756953642384106, "grad_norm": 0.7294767472237663, "learning_rate": 1.995805900644132e-06, "loss": 0.2402, "step": 8693 }, { "epoch": 5.757615894039735, "grad_norm": 0.8270008901078862, "learning_rate": 1.9849684111886477e-06, "loss": 0.2852, "step": 8694 }, { "epoch": 5.758278145695364, "grad_norm": 0.7418373565958783, "learning_rate": 1.9741602304063353e-06, "loss": 0.2559, "step": 8695 }, { "epoch": 5.758940397350994, "grad_norm": 0.6910156237449533, "learning_rate": 1.9633813604373492e-06, "loss": 0.2217, "step": 8696 }, { "epoch": 5.759602649006623, "grad_norm": 0.6094836185422668, "learning_rate": 1.952631803416066e-06, "loss": 0.1875, "step": 8697 }, { "epoch": 5.760264900662252, "grad_norm": 0.714375829297667, "learning_rate": 1.941911561470966e-06, "loss": 0.2197, "step": 8698 }, { "epoch": 5.760927152317881, "grad_norm": 0.7281800091808914, "learning_rate": 1.9312206367248172e-06, "loss": 0.2041, "step": 8699 }, { "epoch": 5.76158940397351, "grad_norm": 0.7824293676245234, "learning_rate": 1.920559031294544e-06, "loss": 0.2373, "step": 8700 }, { "epoch": 5.762251655629139, "grad_norm": 0.724803558744917, "learning_rate": 1.909926747291257e-06, "loss": 0.2363, "step": 8701 }, { "epoch": 5.762913907284768, "grad_norm": 0.7660724613158997, "learning_rate": 1.899323786820256e-06, "loss": 0.2012, "step": 8702 }, { "epoch": 5.763576158940397, "grad_norm": 0.6899600180029024, "learning_rate": 1.8887501519810778e-06, "loss": 0.1846, "step": 8703 }, { "epoch": 5.764238410596026, "grad_norm": 0.7528873231734479, "learning_rate": 1.8782058448673976e-06, "loss": 0.2539, "step": 8704 }, { "epoch": 5.764900662251655, "grad_norm": 0.7978965454652358, "learning_rate": 1.8676908675671287e-06, "loss": 0.2471, "step": 8705 }, { "epoch": 5.765562913907285, "grad_norm": 0.7742595522469865, "learning_rate": 1.8572052221623558e-06, "loss": 0.2422, "step": 8706 }, { "epoch": 5.766225165562914, "grad_norm": 0.7254702090895969, "learning_rate": 1.8467489107293509e-06, "loss": 0.209, "step": 8707 }, { "epoch": 5.766887417218543, "grad_norm": 0.6898917284902956, "learning_rate": 1.8363219353385584e-06, "loss": 0.2188, "step": 8708 }, { "epoch": 5.767549668874172, "grad_norm": 0.7212818688251796, "learning_rate": 1.825924298054693e-06, "loss": 0.2109, "step": 8709 }, { "epoch": 5.768211920529802, "grad_norm": 0.6858390986814731, "learning_rate": 1.8155560009365744e-06, "loss": 0.1914, "step": 8710 }, { "epoch": 5.768874172185431, "grad_norm": 0.841095462692886, "learning_rate": 1.8052170460372272e-06, "loss": 0.291, "step": 8711 }, { "epoch": 5.76953642384106, "grad_norm": 0.7233751242706888, "learning_rate": 1.794907435403914e-06, "loss": 0.2305, "step": 8712 }, { "epoch": 5.7701986754966885, "grad_norm": 0.6990412551066661, "learning_rate": 1.7846271710780514e-06, "loss": 0.1953, "step": 8713 }, { "epoch": 5.770860927152318, "grad_norm": 0.7677927325779764, "learning_rate": 1.774376255095228e-06, "loss": 0.2402, "step": 8714 }, { "epoch": 5.771523178807947, "grad_norm": 0.7767490531768444, "learning_rate": 1.7641546894852699e-06, "loss": 0.2617, "step": 8715 }, { "epoch": 5.772185430463576, "grad_norm": 0.6391866385137092, "learning_rate": 1.7539624762721581e-06, "loss": 0.2031, "step": 8716 }, { "epoch": 5.772847682119205, "grad_norm": 0.8168142306369967, "learning_rate": 1.7437996174740453e-06, "loss": 0.2656, "step": 8717 }, { "epoch": 5.773509933774834, "grad_norm": 0.7582981367273539, "learning_rate": 1.7336661151033215e-06, "loss": 0.2734, "step": 8718 }, { "epoch": 5.774172185430464, "grad_norm": 0.7932610039544626, "learning_rate": 1.723561971166515e-06, "loss": 0.2295, "step": 8719 }, { "epoch": 5.774834437086093, "grad_norm": 0.670023723496463, "learning_rate": 1.7134871876643586e-06, "loss": 0.2158, "step": 8720 }, { "epoch": 5.775496688741722, "grad_norm": 0.7403726199307219, "learning_rate": 1.7034417665918232e-06, "loss": 0.249, "step": 8721 }, { "epoch": 5.776158940397351, "grad_norm": 0.6895892629441779, "learning_rate": 1.6934257099379678e-06, "loss": 0.2188, "step": 8722 }, { "epoch": 5.77682119205298, "grad_norm": 0.8286805103921656, "learning_rate": 1.683439019686089e-06, "loss": 0.2715, "step": 8723 }, { "epoch": 5.77748344370861, "grad_norm": 0.8142738717599949, "learning_rate": 1.673481697813689e-06, "loss": 0.2891, "step": 8724 }, { "epoch": 5.7781456953642385, "grad_norm": 0.8240483476053179, "learning_rate": 1.663553746292423e-06, "loss": 0.2598, "step": 8725 }, { "epoch": 5.778807947019867, "grad_norm": 0.8894195355813784, "learning_rate": 1.653655167088136e-06, "loss": 0.3359, "step": 8726 }, { "epoch": 5.779470198675496, "grad_norm": 0.6747133327483652, "learning_rate": 1.6437859621608595e-06, "loss": 0.1924, "step": 8727 }, { "epoch": 5.780132450331126, "grad_norm": 0.6608283043488363, "learning_rate": 1.633946133464814e-06, "loss": 0.208, "step": 8728 }, { "epoch": 5.780794701986755, "grad_norm": 0.6906683344541861, "learning_rate": 1.624135682948391e-06, "loss": 0.2266, "step": 8729 }, { "epoch": 5.781456953642384, "grad_norm": 0.8170174013216691, "learning_rate": 1.6143546125541695e-06, "loss": 0.252, "step": 8730 }, { "epoch": 5.782119205298013, "grad_norm": 0.6776945246319762, "learning_rate": 1.604602924218934e-06, "loss": 0.1982, "step": 8731 }, { "epoch": 5.782781456953643, "grad_norm": 0.8505380538283959, "learning_rate": 1.5948806198736063e-06, "loss": 0.2617, "step": 8732 }, { "epoch": 5.783443708609272, "grad_norm": 0.6556298803771343, "learning_rate": 1.58518770144333e-06, "loss": 0.1963, "step": 8733 }, { "epoch": 5.784105960264901, "grad_norm": 0.7842342421880394, "learning_rate": 1.5755241708474198e-06, "loss": 0.25, "step": 8734 }, { "epoch": 5.78476821192053, "grad_norm": 0.6754407746985512, "learning_rate": 1.5658900299993448e-06, "loss": 0.21, "step": 8735 }, { "epoch": 5.785430463576159, "grad_norm": 0.6841954183643724, "learning_rate": 1.5562852808067794e-06, "loss": 0.2061, "step": 8736 }, { "epoch": 5.7860927152317885, "grad_norm": 0.7519840086843343, "learning_rate": 1.5467099251715852e-06, "loss": 0.2441, "step": 8737 }, { "epoch": 5.786754966887417, "grad_norm": 0.7071820034837071, "learning_rate": 1.537163964989796e-06, "loss": 0.2178, "step": 8738 }, { "epoch": 5.787417218543046, "grad_norm": 0.7670814629446293, "learning_rate": 1.5276474021515994e-06, "loss": 0.2383, "step": 8739 }, { "epoch": 5.788079470198675, "grad_norm": 0.7492701692483842, "learning_rate": 1.518160238541405e-06, "loss": 0.2334, "step": 8740 }, { "epoch": 5.788741721854304, "grad_norm": 0.7412888329440318, "learning_rate": 1.5087024760377598e-06, "loss": 0.2559, "step": 8741 }, { "epoch": 5.789403973509934, "grad_norm": 0.8636727376525105, "learning_rate": 1.4992741165134493e-06, "loss": 0.2656, "step": 8742 }, { "epoch": 5.790066225165563, "grad_norm": 0.8070364214414796, "learning_rate": 1.4898751618353466e-06, "loss": 0.2617, "step": 8743 }, { "epoch": 5.790728476821192, "grad_norm": 0.7898507532078801, "learning_rate": 1.4805056138645799e-06, "loss": 0.2637, "step": 8744 }, { "epoch": 5.791390728476821, "grad_norm": 0.6598634522465825, "learning_rate": 1.471165474456415e-06, "loss": 0.209, "step": 8745 }, { "epoch": 5.79205298013245, "grad_norm": 0.9350772134505817, "learning_rate": 1.4618547454603224e-06, "loss": 0.3281, "step": 8746 }, { "epoch": 5.79271523178808, "grad_norm": 0.8733134161967404, "learning_rate": 1.4525734287199276e-06, "loss": 0.3145, "step": 8747 }, { "epoch": 5.793377483443709, "grad_norm": 0.7272498549436502, "learning_rate": 1.443321526073027e-06, "loss": 0.207, "step": 8748 }, { "epoch": 5.794039735099338, "grad_norm": 0.7884254176266394, "learning_rate": 1.4340990393516216e-06, "loss": 0.2715, "step": 8749 }, { "epoch": 5.7947019867549665, "grad_norm": 0.7788952943897341, "learning_rate": 1.424905970381851e-06, "loss": 0.2412, "step": 8750 }, { "epoch": 5.795364238410596, "grad_norm": 0.7886484334670395, "learning_rate": 1.415742320984059e-06, "loss": 0.2598, "step": 8751 }, { "epoch": 5.796026490066225, "grad_norm": 0.910694243625502, "learning_rate": 1.4066080929727275e-06, "loss": 0.3398, "step": 8752 }, { "epoch": 5.796688741721854, "grad_norm": 0.6926864395615128, "learning_rate": 1.397503288156593e-06, "loss": 0.2285, "step": 8753 }, { "epoch": 5.797350993377483, "grad_norm": 0.5956281691630017, "learning_rate": 1.388427908338463e-06, "loss": 0.1602, "step": 8754 }, { "epoch": 5.798013245033113, "grad_norm": 0.7584226254609122, "learning_rate": 1.3793819553154007e-06, "loss": 0.2197, "step": 8755 }, { "epoch": 5.798675496688742, "grad_norm": 0.6907027126969195, "learning_rate": 1.3703654308785729e-06, "loss": 0.2031, "step": 8756 }, { "epoch": 5.799337748344371, "grad_norm": 0.7272030927217936, "learning_rate": 1.361378336813368e-06, "loss": 0.2451, "step": 8757 }, { "epoch": 5.8, "grad_norm": 0.8679548329548463, "learning_rate": 1.3524206748993626e-06, "loss": 0.3047, "step": 8758 }, { "epoch": 5.800662251655629, "grad_norm": 0.8335746842737348, "learning_rate": 1.3434924469102382e-06, "loss": 0.2656, "step": 8759 }, { "epoch": 5.801324503311259, "grad_norm": 0.9611762232387562, "learning_rate": 1.3345936546138968e-06, "loss": 0.332, "step": 8760 }, { "epoch": 5.8019867549668875, "grad_norm": 0.7425841831413459, "learning_rate": 1.3257242997724128e-06, "loss": 0.2217, "step": 8761 }, { "epoch": 5.8026490066225165, "grad_norm": 0.716892351546579, "learning_rate": 1.3168843841420141e-06, "loss": 0.2246, "step": 8762 }, { "epoch": 5.803311258278145, "grad_norm": 0.8112737208608828, "learning_rate": 1.3080739094730841e-06, "loss": 0.2578, "step": 8763 }, { "epoch": 5.803973509933774, "grad_norm": 0.7355726184556421, "learning_rate": 1.299292877510244e-06, "loss": 0.2539, "step": 8764 }, { "epoch": 5.804635761589404, "grad_norm": 0.8615908112274114, "learning_rate": 1.2905412899921864e-06, "loss": 0.2891, "step": 8765 }, { "epoch": 5.805298013245033, "grad_norm": 0.7252477252003586, "learning_rate": 1.2818191486518748e-06, "loss": 0.2168, "step": 8766 }, { "epoch": 5.805960264900662, "grad_norm": 0.7061209259066067, "learning_rate": 1.273126455216361e-06, "loss": 0.2246, "step": 8767 }, { "epoch": 5.806622516556291, "grad_norm": 0.8034232511276235, "learning_rate": 1.2644632114069186e-06, "loss": 0.2539, "step": 8768 }, { "epoch": 5.807284768211921, "grad_norm": 0.7840057654053251, "learning_rate": 1.2558294189389251e-06, "loss": 0.2324, "step": 8769 }, { "epoch": 5.80794701986755, "grad_norm": 0.7159949058668483, "learning_rate": 1.2472250795220295e-06, "loss": 0.2168, "step": 8770 }, { "epoch": 5.808609271523179, "grad_norm": 0.6974423523440565, "learning_rate": 1.238650194859969e-06, "loss": 0.2139, "step": 8771 }, { "epoch": 5.809271523178808, "grad_norm": 0.7255283196960042, "learning_rate": 1.2301047666506359e-06, "loss": 0.2266, "step": 8772 }, { "epoch": 5.8099337748344375, "grad_norm": 0.6284756009842309, "learning_rate": 1.2215887965861592e-06, "loss": 0.207, "step": 8773 }, { "epoch": 5.8105960264900665, "grad_norm": 0.7950900473568635, "learning_rate": 1.2131022863528073e-06, "loss": 0.2168, "step": 8774 }, { "epoch": 5.811258278145695, "grad_norm": 0.7916523460458754, "learning_rate": 1.204645237630969e-06, "loss": 0.2793, "step": 8775 }, { "epoch": 5.811920529801324, "grad_norm": 0.7729154266479662, "learning_rate": 1.1962176520952715e-06, "loss": 0.2559, "step": 8776 }, { "epoch": 5.812582781456953, "grad_norm": 0.7693537816635067, "learning_rate": 1.1878195314144468e-06, "loss": 0.2227, "step": 8777 }, { "epoch": 5.813245033112583, "grad_norm": 0.7075189187198534, "learning_rate": 1.1794508772514478e-06, "loss": 0.2393, "step": 8778 }, { "epoch": 5.813907284768212, "grad_norm": 0.7660800578841785, "learning_rate": 1.1711116912633322e-06, "loss": 0.2266, "step": 8779 }, { "epoch": 5.814569536423841, "grad_norm": 0.7162288573551453, "learning_rate": 1.1628019751013794e-06, "loss": 0.2285, "step": 8780 }, { "epoch": 5.81523178807947, "grad_norm": 0.7226595902499757, "learning_rate": 1.1545217304109899e-06, "loss": 0.2139, "step": 8781 }, { "epoch": 5.815894039735099, "grad_norm": 0.7520538963404279, "learning_rate": 1.146270958831752e-06, "loss": 0.2676, "step": 8782 }, { "epoch": 5.816556291390729, "grad_norm": 0.7222313559097812, "learning_rate": 1.1380496619974255e-06, "loss": 0.2256, "step": 8783 }, { "epoch": 5.817218543046358, "grad_norm": 0.6732014415654022, "learning_rate": 1.1298578415359083e-06, "loss": 0.1943, "step": 8784 }, { "epoch": 5.817880794701987, "grad_norm": 0.6377840055003942, "learning_rate": 1.121695499069286e-06, "loss": 0.1826, "step": 8785 }, { "epoch": 5.8185430463576155, "grad_norm": 0.7124829007955021, "learning_rate": 1.1135626362137829e-06, "loss": 0.2246, "step": 8786 }, { "epoch": 5.819205298013245, "grad_norm": 0.7000372832124223, "learning_rate": 1.1054592545797935e-06, "loss": 0.2227, "step": 8787 }, { "epoch": 5.819867549668874, "grad_norm": 0.8226500063042513, "learning_rate": 1.0973853557719015e-06, "loss": 0.2734, "step": 8788 }, { "epoch": 5.820529801324503, "grad_norm": 0.7252901621277582, "learning_rate": 1.0893409413888276e-06, "loss": 0.2041, "step": 8789 }, { "epoch": 5.821192052980132, "grad_norm": 0.8632500813753755, "learning_rate": 1.081326013023448e-06, "loss": 0.3242, "step": 8790 }, { "epoch": 5.821854304635762, "grad_norm": 0.6814361391193241, "learning_rate": 1.0733405722628097e-06, "loss": 0.2119, "step": 8791 }, { "epoch": 5.822516556291391, "grad_norm": 0.7222564501665105, "learning_rate": 1.0653846206881479e-06, "loss": 0.2295, "step": 8792 }, { "epoch": 5.82317880794702, "grad_norm": 0.6882162258878533, "learning_rate": 1.0574581598747855e-06, "loss": 0.2236, "step": 8793 }, { "epoch": 5.823841059602649, "grad_norm": 0.7172868666355022, "learning_rate": 1.049561191392284e-06, "loss": 0.2236, "step": 8794 }, { "epoch": 5.824503311258278, "grad_norm": 1.0630013153167195, "learning_rate": 1.0416937168043427e-06, "loss": 0.3848, "step": 8795 }, { "epoch": 5.825165562913908, "grad_norm": 1.008966116816096, "learning_rate": 1.0338557376687817e-06, "loss": 0.3457, "step": 8796 }, { "epoch": 5.825827814569537, "grad_norm": 0.7668784775182778, "learning_rate": 1.0260472555376264e-06, "loss": 0.249, "step": 8797 }, { "epoch": 5.8264900662251655, "grad_norm": 0.7244742605881664, "learning_rate": 1.0182682719570401e-06, "loss": 0.2432, "step": 8798 }, { "epoch": 5.8271523178807945, "grad_norm": 0.7196204347604843, "learning_rate": 1.010518788467357e-06, "loss": 0.2148, "step": 8799 }, { "epoch": 5.827814569536423, "grad_norm": 0.7634842518411153, "learning_rate": 1.0027988066030835e-06, "loss": 0.2148, "step": 8800 }, { "epoch": 5.828476821192053, "grad_norm": 0.7880795574879496, "learning_rate": 9.951083278928295e-07, "loss": 0.2314, "step": 8801 }, { "epoch": 5.829139072847682, "grad_norm": 0.769842612920095, "learning_rate": 9.874473538594108e-07, "loss": 0.2373, "step": 8802 }, { "epoch": 5.829801324503311, "grad_norm": 0.7312587351830787, "learning_rate": 9.798158860197803e-07, "loss": 0.2139, "step": 8803 }, { "epoch": 5.83046357615894, "grad_norm": 0.7369781543893006, "learning_rate": 9.722139258850791e-07, "loss": 0.2236, "step": 8804 }, { "epoch": 5.83112582781457, "grad_norm": 0.9024122470073395, "learning_rate": 9.646414749605702e-07, "loss": 0.3145, "step": 8805 }, { "epoch": 5.831788079470199, "grad_norm": 0.6839335656006554, "learning_rate": 9.570985347456705e-07, "loss": 0.1914, "step": 8806 }, { "epoch": 5.832450331125828, "grad_norm": 0.776911919304328, "learning_rate": 9.495851067339854e-07, "loss": 0.2559, "step": 8807 }, { "epoch": 5.833112582781457, "grad_norm": 0.6811499303306172, "learning_rate": 9.421011924132748e-07, "loss": 0.2129, "step": 8808 }, { "epoch": 5.8337748344370866, "grad_norm": 0.755387615201497, "learning_rate": 9.346467932654034e-07, "loss": 0.248, "step": 8809 }, { "epoch": 5.8344370860927155, "grad_norm": 0.6900663772662814, "learning_rate": 9.272219107664403e-07, "loss": 0.2109, "step": 8810 }, { "epoch": 5.835099337748344, "grad_norm": 0.8268035056234583, "learning_rate": 9.198265463866095e-07, "loss": 0.3301, "step": 8811 }, { "epoch": 5.835761589403973, "grad_norm": 0.6679224855409073, "learning_rate": 9.124607015902729e-07, "loss": 0.1973, "step": 8812 }, { "epoch": 5.836423841059602, "grad_norm": 0.6858376461150592, "learning_rate": 9.051243778359641e-07, "loss": 0.208, "step": 8813 }, { "epoch": 5.837086092715232, "grad_norm": 0.8138605455107571, "learning_rate": 8.978175765763207e-07, "loss": 0.3008, "step": 8814 }, { "epoch": 5.837748344370861, "grad_norm": 0.6213164073960943, "learning_rate": 8.905402992582023e-07, "loss": 0.1768, "step": 8815 }, { "epoch": 5.83841059602649, "grad_norm": 0.8104656426558374, "learning_rate": 8.83292547322606e-07, "loss": 0.2676, "step": 8816 }, { "epoch": 5.839072847682119, "grad_norm": 0.7674065282150813, "learning_rate": 8.76074322204634e-07, "loss": 0.2471, "step": 8817 }, { "epoch": 5.839735099337748, "grad_norm": 0.7722854098254966, "learning_rate": 8.688856253336096e-07, "loss": 0.2422, "step": 8818 }, { "epoch": 5.840397350993378, "grad_norm": 0.6787048824580929, "learning_rate": 8.617264581329441e-07, "loss": 0.1865, "step": 8819 }, { "epoch": 5.841059602649007, "grad_norm": 0.6700804943048952, "learning_rate": 8.54596822020287e-07, "loss": 0.1904, "step": 8820 }, { "epoch": 5.841721854304636, "grad_norm": 0.7815751969666703, "learning_rate": 8.474967184073255e-07, "loss": 0.2559, "step": 8821 }, { "epoch": 5.842384105960265, "grad_norm": 0.6604765104902409, "learning_rate": 8.404261487000019e-07, "loss": 0.21, "step": 8822 }, { "epoch": 5.843046357615894, "grad_norm": 0.7198683729863349, "learning_rate": 8.333851142983627e-07, "loss": 0.2207, "step": 8823 }, { "epoch": 5.843708609271523, "grad_norm": 0.6732531887742457, "learning_rate": 8.263736165966095e-07, "loss": 0.1904, "step": 8824 }, { "epoch": 5.844370860927152, "grad_norm": 0.7149857968683632, "learning_rate": 8.193916569830983e-07, "loss": 0.2168, "step": 8825 }, { "epoch": 5.845033112582781, "grad_norm": 0.825933570723644, "learning_rate": 8.124392368403399e-07, "loss": 0.2471, "step": 8826 }, { "epoch": 5.845695364238411, "grad_norm": 0.683456241245201, "learning_rate": 8.05516357544983e-07, "loss": 0.1973, "step": 8827 }, { "epoch": 5.84635761589404, "grad_norm": 0.7540353810219281, "learning_rate": 7.986230204678645e-07, "loss": 0.2617, "step": 8828 }, { "epoch": 5.847019867549669, "grad_norm": 0.6227638896625928, "learning_rate": 7.917592269739093e-07, "loss": 0.2021, "step": 8829 }, { "epoch": 5.847682119205298, "grad_norm": 0.7833528210932177, "learning_rate": 7.849249784222466e-07, "loss": 0.2578, "step": 8830 }, { "epoch": 5.848344370860927, "grad_norm": 0.7793830097070877, "learning_rate": 7.781202761661442e-07, "loss": 0.2324, "step": 8831 }, { "epoch": 5.849006622516557, "grad_norm": 0.6483548444734437, "learning_rate": 7.713451215529909e-07, "loss": 0.1924, "step": 8832 }, { "epoch": 5.849668874172186, "grad_norm": 0.7505857592029226, "learning_rate": 7.645995159243467e-07, "loss": 0.2598, "step": 8833 }, { "epoch": 5.850331125827815, "grad_norm": 0.8172479951880373, "learning_rate": 7.578834606159434e-07, "loss": 0.2441, "step": 8834 }, { "epoch": 5.8509933774834435, "grad_norm": 0.7942814547343147, "learning_rate": 7.511969569576171e-07, "loss": 0.2246, "step": 8835 }, { "epoch": 5.8516556291390724, "grad_norm": 0.8057364412369703, "learning_rate": 7.445400062733586e-07, "loss": 0.2676, "step": 8836 }, { "epoch": 5.852317880794702, "grad_norm": 0.817122951823774, "learning_rate": 7.379126098813637e-07, "loss": 0.2734, "step": 8837 }, { "epoch": 5.852980132450331, "grad_norm": 0.7220753659998398, "learning_rate": 7.313147690938826e-07, "loss": 0.209, "step": 8838 }, { "epoch": 5.85364238410596, "grad_norm": 0.8462978987368671, "learning_rate": 7.247464852174034e-07, "loss": 0.2852, "step": 8839 }, { "epoch": 5.854304635761589, "grad_norm": 0.7226459728010415, "learning_rate": 7.18207759552486e-07, "loss": 0.2354, "step": 8840 }, { "epoch": 5.854966887417218, "grad_norm": 0.785150262551872, "learning_rate": 7.116985933939112e-07, "loss": 0.2168, "step": 8841 }, { "epoch": 5.855629139072848, "grad_norm": 0.7103113242134287, "learning_rate": 7.052189880305314e-07, "loss": 0.2275, "step": 8842 }, { "epoch": 5.856291390728477, "grad_norm": 0.7323601748136837, "learning_rate": 6.987689447454203e-07, "loss": 0.2236, "step": 8843 }, { "epoch": 5.856953642384106, "grad_norm": 0.7462629165651844, "learning_rate": 6.92348464815723e-07, "loss": 0.2324, "step": 8844 }, { "epoch": 5.857615894039736, "grad_norm": 0.760776808726537, "learning_rate": 6.859575495128055e-07, "loss": 0.2236, "step": 8845 }, { "epoch": 5.8582781456953645, "grad_norm": 0.6777784020406845, "learning_rate": 6.79596200102106e-07, "loss": 0.2256, "step": 8846 }, { "epoch": 5.8589403973509935, "grad_norm": 0.8607366020066697, "learning_rate": 6.732644178432667e-07, "loss": 0.3125, "step": 8847 }, { "epoch": 5.859602649006622, "grad_norm": 0.936704976906442, "learning_rate": 6.669622039900513e-07, "loss": 0.3418, "step": 8848 }, { "epoch": 5.860264900662251, "grad_norm": 0.8323622978783115, "learning_rate": 6.606895597903617e-07, "loss": 0.2637, "step": 8849 }, { "epoch": 5.860927152317881, "grad_norm": 0.7143824425782659, "learning_rate": 6.544464864862708e-07, "loss": 0.2383, "step": 8850 }, { "epoch": 5.86158940397351, "grad_norm": 0.7392291070381241, "learning_rate": 6.482329853139401e-07, "loss": 0.25, "step": 8851 }, { "epoch": 5.862251655629139, "grad_norm": 0.7314031024492902, "learning_rate": 6.420490575037684e-07, "loss": 0.2598, "step": 8852 }, { "epoch": 5.862913907284768, "grad_norm": 0.6806140369204269, "learning_rate": 6.358947042802098e-07, "loss": 0.21, "step": 8853 }, { "epoch": 5.863576158940397, "grad_norm": 0.7971253164028728, "learning_rate": 6.297699268618894e-07, "loss": 0.2354, "step": 8854 }, { "epoch": 5.864238410596027, "grad_norm": 0.8815921132321878, "learning_rate": 6.236747264616038e-07, "loss": 0.2988, "step": 8855 }, { "epoch": 5.864900662251656, "grad_norm": 0.8093700739453823, "learning_rate": 6.176091042862708e-07, "loss": 0.2734, "step": 8856 }, { "epoch": 5.865562913907285, "grad_norm": 0.7230415535205806, "learning_rate": 6.115730615369463e-07, "loss": 0.2354, "step": 8857 }, { "epoch": 5.866225165562914, "grad_norm": 0.7250978244691269, "learning_rate": 6.05566599408841e-07, "loss": 0.2275, "step": 8858 }, { "epoch": 5.866887417218543, "grad_norm": 0.7538741973102229, "learning_rate": 5.995897190913035e-07, "loss": 0.2197, "step": 8859 }, { "epoch": 5.867549668874172, "grad_norm": 0.9003633042542556, "learning_rate": 5.936424217678038e-07, "loss": 0.3105, "step": 8860 }, { "epoch": 5.868211920529801, "grad_norm": 0.6372154700034829, "learning_rate": 5.877247086159998e-07, "loss": 0.1826, "step": 8861 }, { "epoch": 5.86887417218543, "grad_norm": 0.8012555531283756, "learning_rate": 5.818365808076708e-07, "loss": 0.2656, "step": 8862 }, { "epoch": 5.869536423841059, "grad_norm": 0.6777449116273798, "learning_rate": 5.759780395087176e-07, "loss": 0.2041, "step": 8863 }, { "epoch": 5.870198675496689, "grad_norm": 0.6604036290285409, "learning_rate": 5.701490858791958e-07, "loss": 0.2041, "step": 8864 }, { "epoch": 5.870860927152318, "grad_norm": 0.8786069880195906, "learning_rate": 5.64349721073315e-07, "loss": 0.3086, "step": 8865 }, { "epoch": 5.871523178807947, "grad_norm": 0.8724540945043789, "learning_rate": 5.585799462394236e-07, "loss": 0.2852, "step": 8866 }, { "epoch": 5.872185430463576, "grad_norm": 0.700663342969884, "learning_rate": 5.528397625199743e-07, "loss": 0.2236, "step": 8867 }, { "epoch": 5.872847682119206, "grad_norm": 0.754637812746562, "learning_rate": 5.471291710516246e-07, "loss": 0.2393, "step": 8868 }, { "epoch": 5.873509933774835, "grad_norm": 0.7473045072514733, "learning_rate": 5.414481729651199e-07, "loss": 0.2432, "step": 8869 }, { "epoch": 5.874172185430464, "grad_norm": 0.7307426481796577, "learning_rate": 5.357967693853438e-07, "loss": 0.2441, "step": 8870 }, { "epoch": 5.8748344370860925, "grad_norm": 0.7437882297845524, "learning_rate": 5.301749614313844e-07, "loss": 0.2461, "step": 8871 }, { "epoch": 5.8754966887417215, "grad_norm": 0.8303225964519886, "learning_rate": 5.245827502163847e-07, "loss": 0.2695, "step": 8872 }, { "epoch": 5.876158940397351, "grad_norm": 0.7629807179782687, "learning_rate": 5.190201368476754e-07, "loss": 0.2314, "step": 8873 }, { "epoch": 5.87682119205298, "grad_norm": 0.8345768921431717, "learning_rate": 5.134871224267256e-07, "loss": 0.2539, "step": 8874 }, { "epoch": 5.877483443708609, "grad_norm": 0.7180517480575306, "learning_rate": 5.079837080491422e-07, "loss": 0.2363, "step": 8875 }, { "epoch": 5.878145695364238, "grad_norm": 0.7623139790550106, "learning_rate": 5.025098948046369e-07, "loss": 0.2197, "step": 8876 }, { "epoch": 5.878807947019867, "grad_norm": 0.71260140977013, "learning_rate": 4.970656837771258e-07, "loss": 0.252, "step": 8877 }, { "epoch": 5.879470198675497, "grad_norm": 0.7070401338359867, "learning_rate": 4.916510760445969e-07, "loss": 0.2256, "step": 8878 }, { "epoch": 5.880132450331126, "grad_norm": 0.6636400467097158, "learning_rate": 4.862660726792089e-07, "loss": 0.1797, "step": 8879 }, { "epoch": 5.880794701986755, "grad_norm": 0.6409366375442862, "learning_rate": 4.809106747472591e-07, "loss": 0.1865, "step": 8880 }, { "epoch": 5.881456953642384, "grad_norm": 0.7512881296308841, "learning_rate": 4.7558488330916576e-07, "loss": 0.2344, "step": 8881 }, { "epoch": 5.882119205298014, "grad_norm": 0.6202887503837247, "learning_rate": 4.7028869941951875e-07, "loss": 0.1846, "step": 8882 }, { "epoch": 5.8827814569536425, "grad_norm": 0.9623708625567875, "learning_rate": 4.650221241270125e-07, "loss": 0.2793, "step": 8883 }, { "epoch": 5.8834437086092715, "grad_norm": 0.7449521470704898, "learning_rate": 4.5978515847449603e-07, "loss": 0.2412, "step": 8884 }, { "epoch": 5.8841059602649, "grad_norm": 0.8056556332346643, "learning_rate": 4.545778034989234e-07, "loss": 0.2656, "step": 8885 }, { "epoch": 5.88476821192053, "grad_norm": 0.8237688634946688, "learning_rate": 4.494000602314363e-07, "loss": 0.2676, "step": 8886 }, { "epoch": 5.885430463576159, "grad_norm": 0.7460964113136672, "learning_rate": 4.442519296972813e-07, "loss": 0.2344, "step": 8887 }, { "epoch": 5.886092715231788, "grad_norm": 0.7121795779345035, "learning_rate": 4.39133412915843e-07, "loss": 0.1982, "step": 8888 }, { "epoch": 5.886754966887417, "grad_norm": 0.8654211500698257, "learning_rate": 4.340445109006441e-07, "loss": 0.2715, "step": 8889 }, { "epoch": 5.887417218543046, "grad_norm": 0.6716284569433371, "learning_rate": 4.28985224659345e-07, "loss": 0.2227, "step": 8890 }, { "epoch": 5.888079470198676, "grad_norm": 0.759695587640271, "learning_rate": 4.2395555519376124e-07, "loss": 0.2617, "step": 8891 }, { "epoch": 5.888741721854305, "grad_norm": 0.8834560624636014, "learning_rate": 4.18955503499796e-07, "loss": 0.2734, "step": 8892 }, { "epoch": 5.889403973509934, "grad_norm": 0.6672665112428086, "learning_rate": 4.139850705675407e-07, "loss": 0.2061, "step": 8893 }, { "epoch": 5.890066225165563, "grad_norm": 0.8136199319104247, "learning_rate": 4.090442573811914e-07, "loss": 0.25, "step": 8894 }, { "epoch": 5.890728476821192, "grad_norm": 0.775986462916124, "learning_rate": 4.0413306491908216e-07, "loss": 0.2754, "step": 8895 }, { "epoch": 5.891390728476821, "grad_norm": 0.7325669466224816, "learning_rate": 3.992514941536851e-07, "loss": 0.2314, "step": 8896 }, { "epoch": 5.89205298013245, "grad_norm": 0.739937539137077, "learning_rate": 3.943995460516103e-07, "loss": 0.252, "step": 8897 }, { "epoch": 5.892715231788079, "grad_norm": 0.7104803824206763, "learning_rate": 3.895772215735893e-07, "loss": 0.2061, "step": 8898 }, { "epoch": 5.893377483443708, "grad_norm": 0.6503873762066041, "learning_rate": 3.847845216745249e-07, "loss": 0.1953, "step": 8899 }, { "epoch": 5.894039735099338, "grad_norm": 0.8189973613440557, "learning_rate": 3.8002144730339137e-07, "loss": 0.2988, "step": 8900 }, { "epoch": 5.894701986754967, "grad_norm": 0.6713833839486647, "learning_rate": 3.7528799940335086e-07, "loss": 0.2227, "step": 8901 }, { "epoch": 5.895364238410596, "grad_norm": 0.8079638182630176, "learning_rate": 3.7058417891167034e-07, "loss": 0.2754, "step": 8902 }, { "epoch": 5.896026490066225, "grad_norm": 0.7691070948660405, "learning_rate": 3.659099867597881e-07, "loss": 0.2471, "step": 8903 }, { "epoch": 5.896688741721855, "grad_norm": 0.7692274268185151, "learning_rate": 3.6126542387321375e-07, "loss": 0.2363, "step": 8904 }, { "epoch": 5.897350993377484, "grad_norm": 0.830780542318256, "learning_rate": 3.56650491171645e-07, "loss": 0.2559, "step": 8905 }, { "epoch": 5.898013245033113, "grad_norm": 0.6730085175320445, "learning_rate": 3.5206518956886756e-07, "loss": 0.2109, "step": 8906 }, { "epoch": 5.898675496688742, "grad_norm": 0.6704224132716022, "learning_rate": 3.4750951997285505e-07, "loss": 0.2061, "step": 8907 }, { "epoch": 5.8993377483443705, "grad_norm": 0.7898065347201618, "learning_rate": 3.4298348328566927e-07, "loss": 0.2314, "step": 8908 }, { "epoch": 5.9, "grad_norm": 0.7944145808036633, "learning_rate": 3.3848708040351004e-07, "loss": 0.2715, "step": 8909 }, { "epoch": 5.900662251655629, "grad_norm": 0.7903490770275601, "learning_rate": 3.340203122167151e-07, "loss": 0.2451, "step": 8910 }, { "epoch": 5.901324503311258, "grad_norm": 0.8199336167792031, "learning_rate": 3.2958317960977696e-07, "loss": 0.2314, "step": 8911 }, { "epoch": 5.901986754966887, "grad_norm": 0.7106047653223646, "learning_rate": 3.251756834612762e-07, "loss": 0.2139, "step": 8912 }, { "epoch": 5.902649006622516, "grad_norm": 0.7526616204225441, "learning_rate": 3.2079782464396464e-07, "loss": 0.25, "step": 8913 }, { "epoch": 5.903311258278146, "grad_norm": 0.7944706242158172, "learning_rate": 3.164496040246822e-07, "loss": 0.25, "step": 8914 }, { "epoch": 5.903973509933775, "grad_norm": 0.6792056366835123, "learning_rate": 3.121310224644569e-07, "loss": 0.207, "step": 8915 }, { "epoch": 5.904635761589404, "grad_norm": 0.7420015346272435, "learning_rate": 3.0784208081840457e-07, "loss": 0.2314, "step": 8916 }, { "epoch": 5.905298013245033, "grad_norm": 0.711953305718888, "learning_rate": 3.035827799357793e-07, "loss": 0.2217, "step": 8917 }, { "epoch": 5.905960264900663, "grad_norm": 0.7561913498532168, "learning_rate": 2.9935312065998973e-07, "loss": 0.2412, "step": 8918 }, { "epoch": 5.906622516556292, "grad_norm": 0.7552253886960993, "learning_rate": 2.951531038285326e-07, "loss": 0.252, "step": 8919 }, { "epoch": 5.9072847682119205, "grad_norm": 0.7655349594693397, "learning_rate": 2.9098273027307604e-07, "loss": 0.2676, "step": 8920 }, { "epoch": 5.907947019867549, "grad_norm": 0.6124692977778528, "learning_rate": 2.868420008193928e-07, "loss": 0.2051, "step": 8921 }, { "epoch": 5.908609271523179, "grad_norm": 0.7442683914941131, "learning_rate": 2.8273091628739365e-07, "loss": 0.2266, "step": 8922 }, { "epoch": 5.909271523178808, "grad_norm": 0.7271860891384281, "learning_rate": 2.786494774911274e-07, "loss": 0.2236, "step": 8923 }, { "epoch": 5.909933774834437, "grad_norm": 0.7444541760562383, "learning_rate": 2.7459768523878103e-07, "loss": 0.2383, "step": 8924 }, { "epoch": 5.910596026490066, "grad_norm": 0.6595465670496837, "learning_rate": 2.7057554033261263e-07, "loss": 0.207, "step": 8925 }, { "epoch": 5.911258278145695, "grad_norm": 0.7503823419992977, "learning_rate": 2.6658304356910187e-07, "loss": 0.252, "step": 8926 }, { "epoch": 5.911920529801325, "grad_norm": 0.6541646241929427, "learning_rate": 2.626201957387664e-07, "loss": 0.2061, "step": 8927 }, { "epoch": 5.912582781456954, "grad_norm": 0.7714786689901325, "learning_rate": 2.586869976263284e-07, "loss": 0.2676, "step": 8928 }, { "epoch": 5.913245033112583, "grad_norm": 0.7076523115736347, "learning_rate": 2.547834500105983e-07, "loss": 0.1885, "step": 8929 }, { "epoch": 5.913907284768212, "grad_norm": 0.7177180222041608, "learning_rate": 2.509095536645078e-07, "loss": 0.2012, "step": 8930 }, { "epoch": 5.914569536423841, "grad_norm": 0.6845776312434835, "learning_rate": 2.4706530935514337e-07, "loss": 0.2148, "step": 8931 }, { "epoch": 5.9152317880794705, "grad_norm": 0.8254567230512837, "learning_rate": 2.4325071784371265e-07, "loss": 0.2715, "step": 8932 }, { "epoch": 5.915894039735099, "grad_norm": 0.8788908583005908, "learning_rate": 2.3946577988554484e-07, "loss": 0.332, "step": 8933 }, { "epoch": 5.916556291390728, "grad_norm": 0.6701938657144477, "learning_rate": 2.3571049623009042e-07, "loss": 0.1973, "step": 8934 }, { "epoch": 5.917218543046357, "grad_norm": 0.7826204246774577, "learning_rate": 2.3198486762095458e-07, "loss": 0.2539, "step": 8935 }, { "epoch": 5.917880794701987, "grad_norm": 0.7442361396024337, "learning_rate": 2.2828889479586387e-07, "loss": 0.2246, "step": 8936 }, { "epoch": 5.918543046357616, "grad_norm": 0.7795026848096929, "learning_rate": 2.2462257848663289e-07, "loss": 0.2354, "step": 8937 }, { "epoch": 5.919205298013245, "grad_norm": 0.7936331109401208, "learning_rate": 2.2098591941926425e-07, "loss": 0.2637, "step": 8938 }, { "epoch": 5.919867549668874, "grad_norm": 0.8852704433846098, "learning_rate": 2.1737891831383192e-07, "loss": 0.3066, "step": 8939 }, { "epoch": 5.920529801324504, "grad_norm": 0.7566920775364205, "learning_rate": 2.138015758845646e-07, "loss": 0.2334, "step": 8940 }, { "epoch": 5.921192052980133, "grad_norm": 0.6295109339026174, "learning_rate": 2.102538928398456e-07, "loss": 0.1885, "step": 8941 }, { "epoch": 5.921854304635762, "grad_norm": 0.6435344531999437, "learning_rate": 2.0673586988212976e-07, "loss": 0.1865, "step": 8942 }, { "epoch": 5.922516556291391, "grad_norm": 0.6851958843778964, "learning_rate": 2.032475077080431e-07, "loss": 0.2285, "step": 8943 }, { "epoch": 5.92317880794702, "grad_norm": 0.7597811284828032, "learning_rate": 1.9978880700831647e-07, "loss": 0.25, "step": 8944 }, { "epoch": 5.923841059602649, "grad_norm": 0.6853322428700527, "learning_rate": 1.96359768467802e-07, "loss": 0.2295, "step": 8945 }, { "epoch": 5.924503311258278, "grad_norm": 0.7301361681712475, "learning_rate": 1.9296039276550658e-07, "loss": 0.2441, "step": 8946 }, { "epoch": 5.925165562913907, "grad_norm": 0.8623177801798151, "learning_rate": 1.8959068057452508e-07, "loss": 0.291, "step": 8947 }, { "epoch": 5.925827814569536, "grad_norm": 0.7786931342149757, "learning_rate": 1.8625063256210714e-07, "loss": 0.2383, "step": 8948 }, { "epoch": 5.926490066225165, "grad_norm": 0.7546414152781539, "learning_rate": 1.829402493896237e-07, "loss": 0.2441, "step": 8949 }, { "epoch": 5.927152317880795, "grad_norm": 0.7945899176684996, "learning_rate": 1.7965953171256708e-07, "loss": 0.2832, "step": 8950 }, { "epoch": 5.927814569536424, "grad_norm": 0.7405557193678926, "learning_rate": 1.7640848018056765e-07, "loss": 0.2275, "step": 8951 }, { "epoch": 5.928476821192053, "grad_norm": 0.8294018284339708, "learning_rate": 1.7318709543734376e-07, "loss": 0.3008, "step": 8952 }, { "epoch": 5.929139072847682, "grad_norm": 0.8166003948271976, "learning_rate": 1.6999537812080188e-07, "loss": 0.2578, "step": 8953 }, { "epoch": 5.929801324503311, "grad_norm": 0.7577792984753371, "learning_rate": 1.6683332886291979e-07, "loss": 0.248, "step": 8954 }, { "epoch": 5.930463576158941, "grad_norm": 0.721794264715447, "learning_rate": 1.6370094828979664e-07, "loss": 0.2334, "step": 8955 }, { "epoch": 5.9311258278145695, "grad_norm": 0.6716264902714889, "learning_rate": 1.605982370217196e-07, "loss": 0.2295, "step": 8956 }, { "epoch": 5.9317880794701985, "grad_norm": 0.7211981896507037, "learning_rate": 1.5752519567304721e-07, "loss": 0.2051, "step": 8957 }, { "epoch": 5.932450331125828, "grad_norm": 0.7167560264605896, "learning_rate": 1.5448182485225945e-07, "loss": 0.2285, "step": 8958 }, { "epoch": 5.933112582781457, "grad_norm": 0.8019413025919309, "learning_rate": 1.5146812516200757e-07, "loss": 0.2393, "step": 8959 }, { "epoch": 5.933774834437086, "grad_norm": 0.7087708373301929, "learning_rate": 1.484840971990142e-07, "loss": 0.2295, "step": 8960 }, { "epoch": 5.934437086092715, "grad_norm": 0.644850188859022, "learning_rate": 1.4552974155417342e-07, "loss": 0.1963, "step": 8961 }, { "epoch": 5.935099337748344, "grad_norm": 0.8498228833874456, "learning_rate": 1.426050588124672e-07, "loss": 0.2832, "step": 8962 }, { "epoch": 5.935761589403974, "grad_norm": 0.8501592837948648, "learning_rate": 1.3971004955301568e-07, "loss": 0.2988, "step": 8963 }, { "epoch": 5.936423841059603, "grad_norm": 0.7226528478314316, "learning_rate": 1.368447143490603e-07, "loss": 0.2451, "step": 8964 }, { "epoch": 5.937086092715232, "grad_norm": 0.7848160160347092, "learning_rate": 1.3400905376798055e-07, "loss": 0.2754, "step": 8965 }, { "epoch": 5.937748344370861, "grad_norm": 0.6933778326261003, "learning_rate": 1.312030683712606e-07, "loss": 0.2061, "step": 8966 }, { "epoch": 5.93841059602649, "grad_norm": 0.7635571846794397, "learning_rate": 1.2842675871452267e-07, "loss": 0.2656, "step": 8967 }, { "epoch": 5.9390728476821195, "grad_norm": 0.6742102297033106, "learning_rate": 1.2568012534751038e-07, "loss": 0.21, "step": 8968 }, { "epoch": 5.9397350993377485, "grad_norm": 0.7167534303218163, "learning_rate": 1.229631688141053e-07, "loss": 0.2295, "step": 8969 }, { "epoch": 5.940397350993377, "grad_norm": 0.6898771344894374, "learning_rate": 1.2027588965226043e-07, "loss": 0.2266, "step": 8970 }, { "epoch": 5.941059602649006, "grad_norm": 0.8328464812824243, "learning_rate": 1.1761828839410014e-07, "loss": 0.252, "step": 8971 }, { "epoch": 5.941721854304635, "grad_norm": 0.8059568970953456, "learning_rate": 1.1499036556587015e-07, "loss": 0.2715, "step": 8972 }, { "epoch": 5.942384105960265, "grad_norm": 0.8818303296257425, "learning_rate": 1.1239212168792089e-07, "loss": 0.2695, "step": 8973 }, { "epoch": 5.943046357615894, "grad_norm": 0.724471518722728, "learning_rate": 1.0982355727474079e-07, "loss": 0.2314, "step": 8974 }, { "epoch": 5.943708609271523, "grad_norm": 0.7727995916615588, "learning_rate": 1.0728467283492304e-07, "loss": 0.2812, "step": 8975 }, { "epoch": 5.944370860927152, "grad_norm": 0.6947875783966496, "learning_rate": 1.0477546887121546e-07, "loss": 0.1982, "step": 8976 }, { "epoch": 5.945033112582782, "grad_norm": 0.8502345682682994, "learning_rate": 1.0229594588043732e-07, "loss": 0.3184, "step": 8977 }, { "epoch": 5.945695364238411, "grad_norm": 0.634425220976975, "learning_rate": 9.984610435359586e-08, "loss": 0.1934, "step": 8978 }, { "epoch": 5.94635761589404, "grad_norm": 1.151886437656613, "learning_rate": 9.742594477578636e-08, "loss": 0.4141, "step": 8979 }, { "epoch": 5.947019867549669, "grad_norm": 0.7204509985431728, "learning_rate": 9.503546762620884e-08, "loss": 0.2305, "step": 8980 }, { "epoch": 5.947682119205298, "grad_norm": 0.7320720243540163, "learning_rate": 9.267467337820134e-08, "loss": 0.2324, "step": 8981 }, { "epoch": 5.948344370860927, "grad_norm": 0.683142897070884, "learning_rate": 9.034356249925656e-08, "loss": 0.2002, "step": 8982 }, { "epoch": 5.949006622516556, "grad_norm": 0.7600911883561219, "learning_rate": 8.804213545093864e-08, "loss": 0.2246, "step": 8983 }, { "epoch": 5.949668874172185, "grad_norm": 0.7608084260086367, "learning_rate": 8.577039268894969e-08, "loss": 0.2422, "step": 8984 }, { "epoch": 5.950331125827814, "grad_norm": 0.8187098813903155, "learning_rate": 8.352833466314657e-08, "loss": 0.2637, "step": 8985 }, { "epoch": 5.950993377483444, "grad_norm": 0.7884743340572617, "learning_rate": 8.131596181747413e-08, "loss": 0.2598, "step": 8986 }, { "epoch": 5.951655629139073, "grad_norm": 0.7079191689803381, "learning_rate": 7.913327458999863e-08, "loss": 0.2305, "step": 8987 }, { "epoch": 5.952317880794702, "grad_norm": 0.7190076557478448, "learning_rate": 7.698027341292434e-08, "loss": 0.2373, "step": 8988 }, { "epoch": 5.952980132450331, "grad_norm": 0.661997952224156, "learning_rate": 7.485695871256026e-08, "loss": 0.2129, "step": 8989 }, { "epoch": 5.95364238410596, "grad_norm": 0.7830668521858762, "learning_rate": 7.276333090935338e-08, "loss": 0.249, "step": 8990 }, { "epoch": 5.95430463576159, "grad_norm": 0.8346824930967212, "learning_rate": 7.069939041787209e-08, "loss": 0.2334, "step": 8991 }, { "epoch": 5.954966887417219, "grad_norm": 0.7530102295444446, "learning_rate": 6.86651376467895e-08, "loss": 0.2217, "step": 8992 }, { "epoch": 5.9556291390728475, "grad_norm": 0.709389326019918, "learning_rate": 6.66605729989167e-08, "loss": 0.2275, "step": 8993 }, { "epoch": 5.9562913907284765, "grad_norm": 0.772154107130087, "learning_rate": 6.46856968711862e-08, "loss": 0.2266, "step": 8994 }, { "epoch": 5.956953642384106, "grad_norm": 0.7888967870102803, "learning_rate": 6.274050965463517e-08, "loss": 0.2773, "step": 8995 }, { "epoch": 5.957615894039735, "grad_norm": 0.6383178439909735, "learning_rate": 6.082501173443887e-08, "loss": 0.2012, "step": 8996 }, { "epoch": 5.958278145695364, "grad_norm": 0.8201207578033066, "learning_rate": 5.893920348987724e-08, "loss": 0.2578, "step": 8997 }, { "epoch": 5.958940397350993, "grad_norm": 0.6680759579425741, "learning_rate": 5.708308529438488e-08, "loss": 0.208, "step": 8998 }, { "epoch": 5.959602649006623, "grad_norm": 0.6923520621417799, "learning_rate": 5.525665751548447e-08, "loss": 0.2354, "step": 8999 }, { "epoch": 5.960264900662252, "grad_norm": 0.722502093629824, "learning_rate": 5.3459920514820065e-08, "loss": 0.2148, "step": 9000 }, { "epoch": 5.960927152317881, "grad_norm": 0.6002838986280469, "learning_rate": 5.169287464815708e-08, "loss": 0.1826, "step": 9001 }, { "epoch": 5.96158940397351, "grad_norm": 0.8085497432595289, "learning_rate": 4.995552026543226e-08, "loss": 0.2754, "step": 9002 }, { "epoch": 5.962251655629139, "grad_norm": 0.7009574085482572, "learning_rate": 4.8247857710620455e-08, "loss": 0.2148, "step": 9003 }, { "epoch": 5.9629139072847686, "grad_norm": 0.7387097878668679, "learning_rate": 4.656988732188449e-08, "loss": 0.2344, "step": 9004 }, { "epoch": 5.9635761589403975, "grad_norm": 0.6900575763483855, "learning_rate": 4.492160943145861e-08, "loss": 0.2051, "step": 9005 }, { "epoch": 5.964238410596026, "grad_norm": 0.7958175892699306, "learning_rate": 4.330302436573174e-08, "loss": 0.248, "step": 9006 }, { "epoch": 5.964900662251655, "grad_norm": 0.7228270617125553, "learning_rate": 4.171413244521416e-08, "loss": 0.2314, "step": 9007 }, { "epoch": 5.965562913907284, "grad_norm": 0.7947810738875899, "learning_rate": 4.01549339845042e-08, "loss": 0.2715, "step": 9008 }, { "epoch": 5.966225165562914, "grad_norm": 0.7546574346759855, "learning_rate": 3.86254292923549e-08, "loss": 0.2432, "step": 9009 }, { "epoch": 5.966887417218543, "grad_norm": 0.6110909130843617, "learning_rate": 3.7125618671624e-08, "loss": 0.1719, "step": 9010 }, { "epoch": 5.967549668874172, "grad_norm": 0.6475206357154525, "learning_rate": 3.5655502419290605e-08, "loss": 0.2109, "step": 9011 }, { "epoch": 5.968211920529801, "grad_norm": 0.7599164505975731, "learning_rate": 3.4215080826455186e-08, "loss": 0.2188, "step": 9012 }, { "epoch": 5.968874172185431, "grad_norm": 0.6196737687773664, "learning_rate": 3.280435417832294e-08, "loss": 0.1621, "step": 9013 }, { "epoch": 5.96953642384106, "grad_norm": 0.7950140765419488, "learning_rate": 3.142332275425374e-08, "loss": 0.2988, "step": 9014 }, { "epoch": 5.970198675496689, "grad_norm": 0.7900795565787098, "learning_rate": 3.007198682769552e-08, "loss": 0.252, "step": 9015 }, { "epoch": 5.970860927152318, "grad_norm": 0.6788574921060473, "learning_rate": 2.8750346666250868e-08, "loss": 0.2168, "step": 9016 }, { "epoch": 5.9715231788079475, "grad_norm": 0.8099272988068814, "learning_rate": 2.745840253157716e-08, "loss": 0.2754, "step": 9017 }, { "epoch": 5.972185430463576, "grad_norm": 0.7357799257356616, "learning_rate": 2.6196154679536395e-08, "loss": 0.2246, "step": 9018 }, { "epoch": 5.972847682119205, "grad_norm": 0.8013839698399275, "learning_rate": 2.4963603360045327e-08, "loss": 0.2656, "step": 9019 }, { "epoch": 5.973509933774834, "grad_norm": 0.6660548114662181, "learning_rate": 2.3760748817158726e-08, "loss": 0.2041, "step": 9020 }, { "epoch": 5.974172185430463, "grad_norm": 0.7349700601059688, "learning_rate": 2.2587591289086047e-08, "loss": 0.2637, "step": 9021 }, { "epoch": 5.974834437086093, "grad_norm": 0.667400940470393, "learning_rate": 2.1444131008091503e-08, "loss": 0.21, "step": 9022 }, { "epoch": 5.975496688741722, "grad_norm": 0.7559925456982503, "learning_rate": 2.0330368200610624e-08, "loss": 0.2324, "step": 9023 }, { "epoch": 5.976158940397351, "grad_norm": 0.7652894571373917, "learning_rate": 1.9246303087167016e-08, "loss": 0.2354, "step": 9024 }, { "epoch": 5.97682119205298, "grad_norm": 0.6945270360811201, "learning_rate": 1.8191935882438945e-08, "loss": 0.2217, "step": 9025 }, { "epoch": 5.977483443708609, "grad_norm": 0.6966980302305292, "learning_rate": 1.7167266795192757e-08, "loss": 0.2344, "step": 9026 }, { "epoch": 5.978145695364239, "grad_norm": 0.7081063474240175, "learning_rate": 1.6172296028332808e-08, "loss": 0.2295, "step": 9027 }, { "epoch": 5.978807947019868, "grad_norm": 0.8486262265249113, "learning_rate": 1.5207023778851522e-08, "loss": 0.2793, "step": 9028 }, { "epoch": 5.979470198675497, "grad_norm": 0.8455903720013623, "learning_rate": 1.4271450237895998e-08, "loss": 0.2852, "step": 9029 }, { "epoch": 5.9801324503311255, "grad_norm": 0.7789746189338085, "learning_rate": 1.3365575590734701e-08, "loss": 0.2656, "step": 9030 }, { "epoch": 5.980794701986755, "grad_norm": 0.6698492092125282, "learning_rate": 1.2489400016740814e-08, "loss": 0.1787, "step": 9031 }, { "epoch": 5.981456953642384, "grad_norm": 1.0380567101179006, "learning_rate": 1.1642923689375582e-08, "loss": 0.3359, "step": 9032 }, { "epoch": 5.982119205298013, "grad_norm": 0.6496764689730753, "learning_rate": 1.0826146776288237e-08, "loss": 0.2012, "step": 9033 }, { "epoch": 5.982781456953642, "grad_norm": 0.7098449100817591, "learning_rate": 1.0039069439182757e-08, "loss": 0.2383, "step": 9034 }, { "epoch": 5.983443708609272, "grad_norm": 0.8301135912109756, "learning_rate": 9.281691833917804e-09, "loss": 0.2637, "step": 9035 }, { "epoch": 5.984105960264901, "grad_norm": 0.8514348291367677, "learning_rate": 8.554014110473406e-09, "loss": 0.2617, "step": 9036 }, { "epoch": 5.98476821192053, "grad_norm": 0.7502464940641256, "learning_rate": 7.856036412917654e-09, "loss": 0.2236, "step": 9037 }, { "epoch": 5.985430463576159, "grad_norm": 0.8053979136224846, "learning_rate": 7.187758879489969e-09, "loss": 0.252, "step": 9038 }, { "epoch": 5.986092715231788, "grad_norm": 0.6914402351157083, "learning_rate": 6.5491816424845244e-09, "loss": 0.2217, "step": 9039 }, { "epoch": 5.986754966887418, "grad_norm": 0.7787907859863512, "learning_rate": 5.940304828366827e-09, "loss": 0.2695, "step": 9040 }, { "epoch": 5.9874172185430465, "grad_norm": 0.7673742816924171, "learning_rate": 5.361128557690442e-09, "loss": 0.2656, "step": 9041 }, { "epoch": 5.9880794701986755, "grad_norm": 0.7257990142145353, "learning_rate": 4.81165294514696e-09, "loss": 0.2461, "step": 9042 }, { "epoch": 5.988741721854304, "grad_norm": 0.6471936496950759, "learning_rate": 4.291878099516033e-09, "loss": 0.207, "step": 9043 }, { "epoch": 5.989403973509933, "grad_norm": 0.7031518017806361, "learning_rate": 3.8018041237652955e-09, "loss": 0.2246, "step": 9044 }, { "epoch": 5.990066225165563, "grad_norm": 0.7661618121761746, "learning_rate": 3.34143111488383e-09, "loss": 0.2461, "step": 9045 }, { "epoch": 5.990728476821192, "grad_norm": 0.7643603317054753, "learning_rate": 2.9107591640820106e-09, "loss": 0.2207, "step": 9046 }, { "epoch": 5.991390728476821, "grad_norm": 0.8316668984638713, "learning_rate": 2.5097883565916577e-09, "loss": 0.2871, "step": 9047 }, { "epoch": 5.99205298013245, "grad_norm": 0.6584138238965226, "learning_rate": 2.138518771832576e-09, "loss": 0.2139, "step": 9048 }, { "epoch": 5.99271523178808, "grad_norm": 0.9476265383849117, "learning_rate": 1.7969504833126314e-09, "loss": 0.3652, "step": 9049 }, { "epoch": 5.993377483443709, "grad_norm": 0.6243357553043218, "learning_rate": 1.4850835586610598e-09, "loss": 0.1904, "step": 9050 }, { "epoch": 5.994039735099338, "grad_norm": 0.7253004500732355, "learning_rate": 1.2029180596617727e-09, "loss": 0.2314, "step": 9051 }, { "epoch": 5.994701986754967, "grad_norm": 0.880250012640625, "learning_rate": 9.504540421534369e-10, "loss": 0.3047, "step": 9052 }, { "epoch": 5.9953642384105965, "grad_norm": 0.694671528865614, "learning_rate": 7.27691556129395e-10, "loss": 0.2344, "step": 9053 }, { "epoch": 5.9960264900662255, "grad_norm": 0.7138490846613068, "learning_rate": 5.346306457210125e-10, "loss": 0.207, "step": 9054 }, { "epoch": 5.996688741721854, "grad_norm": 0.7706952956004545, "learning_rate": 3.7127134914771706e-10, "loss": 0.2295, "step": 9055 }, { "epoch": 5.997350993377483, "grad_norm": 0.8234903775824498, "learning_rate": 2.376136987336519e-10, "loss": 0.2793, "step": 9056 }, { "epoch": 5.998013245033112, "grad_norm": 0.740101684295858, "learning_rate": 1.336577209742895e-10, "loss": 0.25, "step": 9057 }, { "epoch": 5.998675496688742, "grad_norm": 0.7083457684665877, "learning_rate": 5.940343645316481e-11, "loss": 0.2334, "step": 9058 }, { "epoch": 5.999337748344371, "grad_norm": 0.822543277060649, "learning_rate": 1.4850859841875063e-11, "loss": 0.2656, "step": 9059 }, { "epoch": 6.0, "grad_norm": 0.9273724863004569, "learning_rate": 0.0, "loss": 0.3105, "step": 9060 }, { "epoch": 6.0, "eval_loss": 2.969489574432373, "eval_runtime": 34.0302, "eval_samples_per_second": 9.932, "eval_steps_per_second": 9.932, "step": 9060 }, { "epoch": 6.0, "step": 9060, "total_flos": 14820153753600.0, "train_loss": 1.7485878647557946, "train_runtime": 43424.1913, "train_samples_per_second": 0.209, "train_steps_per_second": 0.209 } ], "logging_steps": 1.0, "max_steps": 9060, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 14820153753600.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }